Full Code of millanek/Dsuite for AI

master a547f99599d7 cached
27 files
451.9 KB
120.8k tokens
221 symbols
1 requests
Download .txt
Showing preview only (467K chars total). Download the full file or copy to clipboard to get everything.
Repository: millanek/Dsuite
Branch: master
Commit: a547f99599d7
Files: 27
Total size: 451.9 KB

Directory structure:
gitextract_vdae3bmp/

├── Build/
│   └── README.md
├── D.cpp
├── D.h
├── Dmin.cpp
├── Dmin.h
├── Dmin_combine.cpp
├── Dmin_combine.h
├── Dquartets.cpp
├── Dquartets.h
├── Dsuite.cpp
├── Dsuite_common.cpp
├── Dsuite_common.h
├── Dsuite_fBranch.cpp
├── Dsuite_fBranch.h
├── Dsuite_utils.cpp
├── Dsuite_utils.h
├── KolmogorovSmirnovDist.cpp
├── KolmogorovSmirnovDist.hpp
├── Makefile
├── README.md
├── gzstream.cpp
├── gzstream.h
├── kstest.cpp
├── kstest.h
└── utils/
    ├── DtriosParallel
    ├── dtools.py
    └── setup.py

================================================
FILE CONTENTS
================================================

================================================
FILE: Build/README.md
================================================
### The Build folder.
The compiled executable will be placed here.




================================================
FILE: D.cpp
================================================
//
//  D.cpp
//  Dsuite
//
//  Created by Milan Malinsky on 11/04/2019.
//

#include "D.h"
#include "Dsuite_common.h"
#include "kstest.h"
#include <algorithm>
#include <deque>
#include <limits>
#include <list>
#define SUBPROGRAM "Dinvestigate"

#define DEBUG 1
#define MIN_SETS 3

// Help text for the Dinvestigate subprogram; printed by parseAbbaBabaOptions() for -h/--help
// and whenever the command line cannot be parsed.
static const char *ABBA_USAGE_MESSAGE =
"Usage: " PROGRAM_BIN " " SUBPROGRAM " [OPTIONS] INPUT_FILE.vcf.gz SETS.txt test_trios.txt\n"
"Outputs D, f_d (Martin et al. 2014 MBE), f_dM (Malinsky et al., 2015), and d_f (Pfeifer & Kapan, 2019) in genomic windows\n"
"The SETS.txt file should have two columns: SAMPLE_ID    POPULATION_ID\n"
"The test_trios.txt should contain names of three populations for which the statistics will be calculated:\n"
"POP1   POP2    POP3\n"
"There can be multiple lines and then the program generates multiple output files, named like POP1_POP2_POP3_localFstats_SIZE_STEP.txt\n"
"\n"
"       -h, --help                              display this help and exit\n"
"       -w SIZE,STEP --window=SIZE,STEP         (required) D, f_D, f_dM, and d_f statistics for windows containing SIZE useable SNPs, moving by STEP (default: 50,25)\n"
"       -g, --use-genotype-probabilities        (optional) use probabilities (GP tag) or calculate them from likelihoods (GL or PL tags) using a Hardy-Weinberg prior\n"
"                                               the probabilities are used to estimate allele frequencies in each population/species\n"
"       -n, --run-name                          run-name will be included in the output file name\n"
"\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";


//enum { OPT_F_JK };

// Short-option string handed to getopt_long() in parseAbbaBabaOptions();
// a ':' after a letter marks an option that takes an argument.
static const char* shortopts = "hw:n:g";

//static const int JK_WINDOW = 5000;

// Long-option table handed to getopt_long() in parseAbbaBabaOptions();
// every entry maps a long name onto the matching short-option character (last field).
static const struct option longopts[] = {
    { "run-name",   required_argument, NULL, 'n' },
    { "window",   required_argument, NULL, 'w' },
    { "help",   no_argument, NULL, 'h' },
    { "use-genotype-probabilities", no_argument, NULL, 'g'},
    { NULL, 0, NULL, 0 }
};

// Settings of the Dinvestigate subprogram; filled in by parseAbbaBabaOptions()
// and read by doAbbaBaba().
namespace opt
{
    static string vcfFile;          // first positional argument: the VCF input
    static string setsFile;         // second positional argument: the SETS.txt file
    static string testTriosFile;    // third positional argument: the trios to test
    static string runName = "";     // -n; becomes part of every output file name
    static int minScLength = 0;     // not referenced anywhere in D.cpp
    static int windowSize = 50;     // -w SIZE: number of informative SNPs per window
    static int windowStep = 25;     // -w STEP: a window is reported every STEP informative SNPs
    static bool useGenotypeProbabilities = false; // -g; doAbbaBaba() switches it back off if a site lacks the required tags
    //int jkWindowSize = JK_WINDOW;
}


void doAbbaBaba() {
    string line; // for reading the input files
    
    std::istream* vcfFile = createReader(opt::vcfFile);
    std::ifstream* testTriosFile = new std::ifstream(opt::testTriosFile.c_str());
    if (!testTriosFile->good()) { std::cerr << "The file " << opt::testTriosFile << " could not be opened. Exiting..." << std::endl; exit(EXIT_FAILURE);}
    
    // Get the sample sets
    SetInformation setInfo(opt::setsFile, MIN_SETS, OutgroupRequired);
    
    // Get the test trios
    std::vector<std::ofstream*> outFiles;
    std::vector<std::ofstream*> outFilesGenes;
    std::vector<std::vector<string> > testTrios;
    while (getline(*testTriosFile,line)) {
        line.erase(std::remove(line.begin(), line.end(), '\r'), line.end()); // Deal with any left over \r from files prepared on Windows
        // std::cerr << line << std::endl;
        std::vector<string> threePops = split(line, '\t'); assert(threePops.size() == 3);
        for (int i = 0; i != threePops.size(); i++) { // Check that the test trios are in the sets file
            if (setInfo.popToIDsMap.count(threePops[i]) == 0) {
                std::cerr << threePops[i] << " is present in the " << opt::testTriosFile << " but missing from the " << opt::setsFile << std::endl;
            }
        }
        std::ofstream* outFile = new std::ofstream(threePops[0] + "_" + threePops[1] + "_" + threePops[2]+ "_localFstats_" + opt::runName + "_" + numToString(opt::windowSize) + "_" + numToString(opt::windowStep) + ".txt");
        *outFile << "chr\twindowStart\twindowEnd\tD\tf_d\tf_dM\td_f" << std::endl;
        outFiles.push_back(outFile);
        testTrios.push_back(threePops);
    }
    
    // Create objects to hold the results for each trio
    TestTrioInfo info(opt::windowSize); std::vector<TestTrioInfo> testTrioInfos(testTrios.size(), info);
    
    // Now go through the vcf and calculate D
    int totalVariantNumber = 0;
    int reportProgressEvery = 1000; string chr; string coord;

   // int lastPrint = 0; int lastWindowVariant = 0;
    std::vector<string> sampleNames; std::vector<std::string> fields;
    clock_t start = clock(); // clock_t startGettingCounts; clock_t startCalculation;
    //double durationGettingCounts; double durationCalculation;
    while (getline(*vcfFile, line)) {
        line.erase(std::remove(line.begin(), line.end(), '\r'), line.end()); // Deal with any left over \r from files prepared on Windows
        if (line[0] == '#' && line[1] == '#')
            continue;
        else if (line[0] == '#' && line[1] == 'C') {
            fields = split(line, '\t');
            std::vector<std::string> sampleNames(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());
            setInfo.linkSetsAndVCFpositions(sampleNames);
        } else {
            totalVariantNumber++;
            if (totalVariantNumber % reportProgressEvery == 0) reportProgessVCF(totalVariantNumber, start);
        
            fields = split(line, '\t'); chr = fields[0]; coord = fields[1];
            std::vector<std::string> genotypes(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());
            // Only consider biallelic SNPs
            string refAllele = fields[3]; string altAllele = fields[4];
            if (refAllele.length() > 1 || altAllele.length() > 1 || altAllele == "*") {
                refAllele.clear(); refAllele.shrink_to_fit(); altAllele.clear(); altAllele.shrink_to_fit();
                genotypes.clear(); genotypes.shrink_to_fit(); continue;
            }
            
            // startGettingCounts = clock();
            GeneralSetCounts* c = new GeneralSetCounts(setInfo.popToPosMap, (int)genotypes.size());
            try { c->getSetVariantCounts(genotypes, setInfo.posToPopMap); } catch (const std::out_of_range& oor) {
                std::cerr << "Problems getting splitCounts for " << chr << " " << coord << std::endl; }
            if (opt::useGenotypeProbabilities) {
                int likelihoodsOrProbabilitiesTagPosition = c->checkForGenotypeLikelihoodsOrProbabilities(fields);
                if (likelihoodsOrProbabilitiesTagPosition == LikelihoodsProbabilitiesAbsent) {
                    printMissingLikelihoodsWarning(fields[0], fields[1]);
                    opt::useGenotypeProbabilities = false;
                } else c->getAFsFromGenotypeLikelihoodsOrProbabilities(genotypes,setInfo.posToPopMap,likelihoodsOrProbabilitiesTagPosition);
            }
            genotypes.clear(); genotypes.shrink_to_fit();
            // durationGettingCounts = ( clock() - startGettingCounts ) / (double) CLOCKS_PER_SEC;
            
            // startCalculation = clock();
            double p_O; try { p_O = c->setDAFs.at("Outgroup"); } catch (const std::out_of_range& oor) {
                std::cerr << "Counts don't contain derived allele frequency for the Outgroup" << std::endl; }
            if (p_O == -1) { delete c; continue; } // We need to make sure that the outgroup is defined
            
            double p_S1; double p_S2; double p_S3; double ABBA; double BABA; double F_d_denom; double F_dM_denom;
            for (int i = 0; i != testTrios.size(); i++) {
                try {
                    if (!opt::useGenotypeProbabilities) p_S1 = c->setDAFs.at(testTrios[i][0]);
                    else p_S1 = c->setDAFsFromLikelihoods.at(testTrios[i][0]);
                } catch (const std::out_of_range& oor) {
                std::cerr << "Counts don't contain derived allele frequency for " << testTrios[i][0] << std::endl; }
                if (p_S1 == -1) continue;  // If any member of the trio has entirely missing data, just move on to the next trio
                try {
                    if (!opt::useGenotypeProbabilities) p_S2 = c->setDAFs.at(testTrios[i][1]);
                    else p_S2 = c->setDAFsFromLikelihoods.at(testTrios[i][1]);
                } catch (const std::out_of_range& oor) {
                    std::cerr << "Counts don't contain derived allele frequency for " << testTrios[i][1] << std::endl; }
                if (p_S2 == -1) continue;
                try {
                    if (!opt::useGenotypeProbabilities) p_S3 = c->setDAFs.at(testTrios[i][2]);
                    else p_S3 = c->setDAFsFromLikelihoods.at(testTrios[i][2]);
                } catch (const std::out_of_range& oor) {
                    std::cerr << "Counts don't contain derived allele frequency for " << testTrios[i][2] << std::endl; }
                if (p_S3 == -1) continue;
                //if (p_S3 == 0) continue; // XXAA pattern is not informative
                if (p_S1 == 0 && p_S2 == 0 && p_S3 == 0) continue; // Checking if the SNP is variable in the trio
                if (p_S1 == 1 && p_S2 == 1 && p_S3 == 1) continue; // Checking if the SNP is variable in the trio
                //if (p_S1 == 1 && p_S2 == 1) continue; // BBAA pattern is not informative
                //if (p_S1 == 0 && p_S2 == 0) continue; // AABA pattern is not informative
                
                
                ABBA = ((1-p_S1)*p_S2*p_S3*(1-p_O)); testTrioInfos[i].ABBAtotal += ABBA;
                if(ABBA > 0.5) {
                    testTrioInfos[i].ABBAsitePositionsPerChomosome[chr].push_back(atoi(coord.c_str()));
                }
                BABA = (p_S1*(1-p_S2)*p_S3*(1-p_O)); testTrioInfos[i].BABAtotal += BABA;
                if(BABA > 0.5) {
                    testTrioInfos[i].BABAsitePositionsPerChomosome[chr].push_back(atoi(coord.c_str()));
                }
                
                if (p_S2 > p_S3) {
                    F_d_denom = ((1-p_S1)*p_S2*p_S2*(1-p_O)) - (p_S1*(1-p_S2)*p_S2*(1-p_O));
                } else {
                    F_d_denom = ((1-p_S1)*p_S3*p_S3*(1-p_O)) - (p_S1*(1-p_S3)*p_S3*(1-p_O));
                } testTrioInfos[i].F_d_denom += F_d_denom; testTrioInfos[i].interimF_d_denom += F_d_denom;
                
                if (p_S1 <= p_S2) {
                    if (p_S2 > p_S3) {
                        F_dM_denom = ((1-p_S1)*p_S2*p_S2*(1-p_O)) - (p_S1*(1-p_S2)*p_S2*(1-p_O));
                    } else {
                        F_dM_denom = ((1-p_S1)*p_S3*p_S3*(1-p_O)) - (p_S1*(1-p_S3)*p_S3*(1-p_O));
                    }
                } else {
                    if (p_S1 > p_S3) {
                        F_dM_denom = -(((1-p_S1)*p_S2*p_S1*(1-p_O)) - (p_S1*(1-p_S2)*p_S1)*(1-p_O));
                    } else {
                        F_dM_denom = -(((1-p_S3)*p_S2*p_S3*(1-p_O)) - (p_S3*(1-p_S2)*p_S3)*(1-p_O));
                    }
                } testTrioInfos[i].F_dM_denom += F_dM_denom; testTrioInfos[i].interimF_dM_denom += F_dM_denom;
                
                
                // d_f
                double d13 = p_S1 + p_S3 - (2*p_S1*p_S3); double d23 = p_S2 + p_S3 - (2*p_S2*p_S3);
                double dfNum = p_S2 * d13 - p_S1 * d23;
                double dfDenom = p_S2 * d13 + p_S1 * d23;
                
                double ABBAplusBABA = ABBA + BABA;
                if (ABBAplusBABA != 0) {
                    testTrioInfos[i].windowABBAs.push_back(ABBA);  testTrioInfos[i].windowBABAs.push_back(BABA);
                    testTrioInfos[i].windowF_d_denoms.push_back(testTrioInfos[i].interimF_d_denom);
                    testTrioInfos[i].windowF_dM_denoms.push_back(testTrioInfos[i].interimF_dM_denom);
                    testTrioInfos[i].window_d_f_nums.push_back(dfNum); testTrioInfos[i].window_d_f_denoms.push_back(dfDenom);
                    testTrioInfos[i].windowInformativeSitesCords.push_back(atoi(coord.c_str()));
                    testTrioInfos[i].windowABBAs.pop_front(); testTrioInfos[i].windowBABAs.pop_front();
                    testTrioInfos[i].windowF_d_denoms.pop_front(); testTrioInfos[i].windowF_dM_denoms.pop_front();
                    testTrioInfos[i].windowInformativeSitesCords.pop_front();
                    testTrioInfos[i].window_d_f_nums.pop_front(); testTrioInfos[i].window_d_f_denoms.pop_front();
                    testTrioInfos[i].interimF_d_denom = 0; testTrioInfos[i].interimF_dM_denom = 0;
                    testTrioInfos[i].usedVars++;
                
                    if ((testTrioInfos[i].usedVars > opt::windowSize) && (testTrioInfos[i].usedVars % opt::windowStep == 0)) {
                        double windowABBAtotal = vector_sum(testTrioInfos[i].windowABBAs); double windowBABAtotal = vector_sum(testTrioInfos[i].windowBABAs);
                        double windowF_d_denom = vector_sum(testTrioInfos[i].windowF_d_denoms); double windowF_dM_denom = vector_sum(testTrioInfos[i].windowF_dM_denoms);
                        double wDnum = windowABBAtotal - windowBABAtotal; double wDdenom = windowABBAtotal + windowBABAtotal;
                        double w_d_f_num = vector_sum(testTrioInfos[i].window_d_f_nums);
                        double w_d_f_denom = vector_sum(testTrioInfos[i].window_d_f_denoms);
                        if ((atoi(coord.c_str()) - testTrioInfos[i].windowInformativeSitesCords[0]) > 0) {
                            *outFiles[i] << std::fixed << chr << "\t" << testTrioInfos[i].windowInformativeSitesCords[0] << "\t" << coord << "\t" << wDnum/wDdenom << "\t" << wDnum/windowF_d_denom << "\t" << wDnum/windowF_dM_denom << "\t" << w_d_f_num/w_d_f_denom << std::endl;
                        }
                    }
                }
            }
           // durationCalculation = ( clock() - startCalculation ) / (double) CLOCKS_PER_SEC;
            delete c;
        }
    }
    
    for (int i = 0; i != testTrios.size(); i++) {
        testTrioInfos[i].mergeABBA_BABA_SiteCoordsOverChoms(); testTrioInfos[i].testIfSitesUniformlyDistributed();
        
        std::cout << testTrios[i][0] << "\t" << testTrios[i][1] << "\t" << testTrios[i][2] << std::endl;
        std::cout << "D=" << (double)(testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal)/(testTrioInfos[i].ABBAtotal+testTrioInfos[i].BABAtotal) << std::endl;
        std::cout << "f_d=" << (double)(testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal)/testTrioInfos[i].F_d_denom << "\t" << (testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal) << "/" << testTrioInfos[i].F_d_denom << std::endl;
        std::cout << "f_dM=" << (double)(testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal)/testTrioInfos[i].F_dM_denom << "\t" << (testTrioInfos[i].ABBAtotal-testTrioInfos[i].BABAtotal) << "/" << testTrioInfos[i].F_dM_denom << std::endl;
        std::cout << "ABBA_KSpval = " << testTrioInfos[i].ABBA_KSpval << std::endl;
        std::cout << "BABA_KSpval = " << testTrioInfos[i].BABA_KSpval << std::endl;
        std::cout << std::endl;
    }
}


// Entry point of the Dinvestigate subprogram: parse the command line, run the analysis, report success.
int abbaBabaMain(int argc, char** argv) {
    // The options have to be parsed first: doAbbaBaba() reads its settings from the opt:: namespace.
    parseAbbaBabaOptions(argc, argv);

    doAbbaBaba();

    return 0;
}

// Compare `positions` with random draws from a discrete uniform distribution on
// [0, largest position] using ks_test() and return the resulting p-value.
// At least 10000 uniform values are drawn (more if there are more positions).
// Returns NaN when there are no positions (or no non-negative one), because the
// reference distribution cannot be defined in that case.
static double ksPvalueAgainstUniform(const std::vector<int>& positions, std::mt19937& rng) {
    if (positions.empty()) return std::numeric_limits<double>::quiet_NaN();

    // Use the true maximum rather than the last element: the merged vector is not guaranteed to be sorted
    const int maxPos = *std::max_element(positions.begin(), positions.end());
    if (maxPos < 0) return std::numeric_limits<double>::quiet_NaN(); // uniform_int_distribution requires a <= b

    std::uniform_int_distribution<int> uniform(0, maxPos); // guaranteed unbiased
    int numUniformSamples = (int)positions.size(); if (numUniformSamples < 10000) { numUniformSamples = 10000; }

    std::list<int64_t> uniformVals;
    for (int i = 0; i < numUniformSamples; i++) {
        uniformVals.push_back(uniform(rng));
    }
    std::list<int64_t> observedVals(positions.begin(), positions.end());

    return ks_test(uniformVals, observedVals, std::cerr, false);
}

// Test whether the strong ABBA sites and the strong BABA sites are spread uniformly along the
// linearised coordinates held in linearABBApos / linearBABApos (filled by
// mergeABBA_BABA_SiteCoordsOverChoms()). The p-values are stored in ABBA_KSpval and BABA_KSpval;
// a p-value is NaN when the corresponding site list is empty.
void TestTrioInfo::testIfSitesUniformlyDistributed() {
    // Take care of the splits by random sampling with replacement:
    std::random_device rd;     // only used once to initialise (seed) engine
    std::mt19937 rng(rd());    // random-number engine used (Mersenne-Twister in this case)

    ABBA_KSpval = ksPvalueAgainstUniform(linearABBApos, rng);
    BABA_KSpval = ksPvalueAgainstUniform(linearBABApos, rng);
}

 


void TestTrioInfo::mergeABBA_BABA_SiteCoordsOverChoms() {
    int totalNumABBAsites = 0;
    for(std::map<string,std::vector<int>>::iterator it = ABBAsitePositionsPerChomosome.begin(); it != ABBAsitePositionsPerChomosome.end(); it++) {
        totalNumABBAsites = totalNumABBAsites + (int)it->second.size();
    } linearABBApos.reserve(totalNumABBAsites);
    
    int linearPosSoFar = 0;
    for(std::map<string,std::vector<int>>::iterator it = ABBAsitePositionsPerChomosome.begin(); it != ABBAsitePositionsPerChomosome.end(); it++) {
        for (std::vector<int>::size_type i = 0; i < it->second.size(); i++) {
            linearABBApos.push_back(it->second[i] + linearPosSoFar);
        }
    }
    
    int totalNumBABAsites = 0;
    for(std::map<string,std::vector<int>>::iterator it = BABAsitePositionsPerChomosome.begin(); it != BABAsitePositionsPerChomosome.end(); it++) {
        totalNumBABAsites = totalNumBABAsites + (int)it->second.size();
    } linearBABApos.reserve(totalNumBABAsites);
    
    linearPosSoFar = 0;
    for(std::map<string,std::vector<int>>::iterator it = BABAsitePositionsPerChomosome.begin(); it != BABAsitePositionsPerChomosome.end(); it++) {
        for (std::vector<int>::size_type i = 0; i < it->second.size(); i++) {
            linearBABApos.push_back(it->second[i] + linearPosSoFar);
        }
    }
    
}

// Parse the Dinvestigate command line into the opt:: settings.
//   argc, argv - the arguments as received by the subprogram
// Prints the usage message and exits when -h is given (EXIT_SUCCESS), or when an option is
// unknown or the number of positional arguments is not exactly three (EXIT_FAILURE).
// Also exits with EXIT_FAILURE when the -w argument is not "SIZE,STEP" with two positive integers.
void parseAbbaBabaOptions(int argc, char** argv) {
    bool die = false;
    std::vector<string> windowSizeStep;
    // getopt_long() returns an int; keeping it in a char would make the comparison with -1
    // always true on platforms where plain char is unsigned, and the loop would never end.
    for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
    {
        std::istringstream arg(optarg != NULL ? optarg : "");
        switch (c)
        {
            case '?': die = true; break;
            case 'w':
                windowSizeStep = split(arg.str(), ',');
                if(windowSizeStep.size() != 2) {std::cerr << "The -w option requires two arguments, separated by a comma ','\n"; exit(EXIT_FAILURE);}
                opt::windowSize = atoi(windowSizeStep[0].c_str());
                opt::windowStep = atoi(windowSizeStep[1].c_str());
                // A zero STEP would be used as a modulus and a zero SIZE would leave the sliding windows empty
                if (opt::windowSize <= 0 || opt::windowStep <= 0) {
                    std::cerr << "The SIZE and STEP values of the -w option must both be positive integers\n";
                    exit(EXIT_FAILURE);
                }
                break;
            case 'n': arg >> opt::runName; break;
            case 'g': opt::useGenotypeProbabilities = true; break;
            case 'h':
                std::cout << ABBA_USAGE_MESSAGE;
                exit(EXIT_SUCCESS);
        }
    }

    if (argc - optind < 3) {
        std::cerr << "missing arguments\n";
        die = true;
    }
    else if (argc - optind > 3)
    {
        std::cerr << "too many arguments\n";
        die = true;
    }

    if (die) {
        std::cout << "\n" << ABBA_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }

    // Parse the input filenames
    opt::vcfFile = argv[optind++];
    opt::setsFile = argv[optind++];
    opt::testTriosFile = argv[optind++];
}


================================================
FILE: D.h
================================================
//
//  D.h
//  Dsuite
//
//  Created by Milan Malinsky on 11/04/2019.
//

#ifndef D_h
#define D_h

#include "Dsuite_utils.h"

// Accumulators for one tested trio (P1, P2, P3): genome-wide totals, the sliding-window
// buffers used for the windowed statistics, and the positions of strong ABBA/BABA sites.
class TestTrioInfo {
public:
    // Every sliding-window buffer starts with `windowSize` zero-valued entries and every
    // counter/total starts at zero. ABBA_KSpval and BABA_KSpval are not set here; they are
    // assigned by testIfSitesUniformlyDistributed().
    TestTrioInfo(int windowSize)
        : windowABBAs(windowSize),
          windowBABAs(windowSize),
          windowF_d_denoms(windowSize),
          windowF_dM_denoms(windowSize),
          window_d_f_nums(windowSize),
          window_d_f_denoms(windowSize),
          windowInformativeSitesCords(windowSize),
          interimF_d_denom(0),
          interimF_dM_denom(0),
          ABBAtotal(0),
          BABAtotal(0),
          F_d_denom(0),
          F_dM_denom(0),
          F_G_denom(0),
          F_G_num(0),
          usedVars(0)
    {
    }

    // string P1; string P2; string P3;

    // Positions of sites with a strong ABBA pattern, per chromosome and merged over chromosomes
    std::map<string,std::vector<int>> ABBAsitePositionsPerChomosome;
    std::vector<int> linearABBApos;
    // Positions of sites with a strong BABA pattern, per chromosome and merged over chromosomes
    std::map<string,std::vector<int>> BABAsitePositionsPerChomosome;
    std::vector<int> linearBABApos;

    // Sliding-window buffers, one entry per informative site currently in the window
    std::deque<double> windowABBAs;
    std::deque<double> windowBABAs;
    std::deque<double> windowF_d_denoms;
    std::deque<double> windowF_dM_denoms;
    std::deque<double> window_d_f_nums;
    std::deque<double> window_d_f_denoms;
    std::deque<int> windowInformativeSitesCords;

    // Denominator contributions collected since the last site that entered the window
    double interimF_d_denom;
    double interimF_dM_denom;
    //double D1; double D2; double D3; double D1_p; double D2_p; double D3_p;

    // Genome-wide totals
    double ABBAtotal;
    double BABAtotal;
    double F_d_denom;
    double F_dM_denom;
    double F_G_denom;
    double F_G_num;
    int usedVars;   // number of informative sites pushed into the window so far

    // p-values written by testIfSitesUniformlyDistributed()
    double ABBA_KSpval;
    double BABA_KSpval;

    void mergeABBA_BABA_SiteCoordsOverChoms();
    void testIfSitesUniformlyDistributed();

};


void parseAbbaBabaOptions(int argc, char** argv);
int abbaBabaMain(int argc, char** argv);
#endif /* D_h */


================================================
FILE: Dmin.cpp
================================================
//
//  Dmin.cpp
//  Dsuite
//
//  Created by Milan Malinsky on 02/04/2019.
//

#include "Dmin.h"
#include "Dsuite_common.h"

#define SUBPROGRAM "Dtrios"

#define DEBUG 0
#define MIN_SETS 3

// Help text for the Dtrios subprogram. The bare identifiers between the string literals
// (stdInInfo, regionOption, treeOption, outOption) are spliced into the literal
// concatenation; they are defined outside this file.
static const char *DMIN_USAGE_MESSAGE =
"Usage: " PROGRAM_BIN " " SUBPROGRAM " [OPTIONS] INPUT_FILE.vcf SETS.txt\n"
"Calculate the D (ABBA/BABA) and f4-ratio statistics for all trios of species in the dataset (the outgroup being fixed)\n"
"The results are as defined in Patterson et al. 2012 (equivalent to Durand et al. 2011 when the Outgroup is fixed for the ancestral allele)\n"
"The SETS.txt should have two columns: SAMPLE_ID    SPECIES_ID\n"
"The outgroup (can be multiple samples) should be specified by using the keyword Outgroup in place of the SPECIES_ID\n"
"\n"
stdInInfo
"       -h, --help                              display this help and exit\n"
"       -k, --JKnum                             (default=20) the number of Jackknife blocks to divide the dataset into; should be at least 20 for the whole dataset\n"
"       -j, --JKwindow                          (default=NA) Jackknife block size in number of informative SNPs (as used in v0.2)\n"
"                                               when specified, this is used in place of the --JKnum option\n"
regionOption    // -r
treeOption      // -t
outOption       // -o
"       -n, --run-name                          (optional) run-name will be included in the output file name after the PREFIX\n"
"       --no-f4-ratio                           (optional) don't calculate the f4-ratio\n"
"       -l NUMLINES                             (optional) the number of lines in the VCF input - required if reading the VCF via a unix pipe\n"
"       -g, --use-genotype-probabilities        (optional) use probabilities (GP tag) or calculate them from likelihoods (GL or PL tags) using a Hardy-Weinberg prior\n"
"                                               the probabilities are used to estimate allele frequencies in each population/species\n"
"       -p, --pool-seq=MIN_DEPTH                (optional) VCF contains pool-seq data; i.e., each 'individual' is a population\n"
"                                               allele frequencies are then estimated from the AD (Allelic Depth) field, as long as there are MIN_DEPTH reads\n"
"                                               e.g MIN_DEPTH=5 may be reasonable; when there are fewer reads, the allele frequency is set to missing\n"
"       -c, --no-combine                        (optional) do not output the \"_combine.txt\" and \"_combine_stderr.txt\" files\n"
"       --ABBAclustering                        (optional) Test whether strong ABBA-informative sites cluster along the genome\n"
//"                                               TYPE can be: 1 - clustering within a vector of all segregating sites\n"
//"                                                            2 - clustering within a vector of strong ABBA and BABA sites\n"
// "                                               TYPE=2 is less sensitive, but is robust to mutation rate variation\n"
"\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";


// Identifiers used as the `val` field of long options that have no short equivalent.
enum { OPT_NO_F4, OPT_KS_TEST };
// Short-option string for getopt_long(); a ':' after a letter marks an option that takes an argument.
static const char* shortopts = "hr:n:t:j:fk:l:o:gcp:";

// Long-option table for getopt_long(). Each entry's has_arg field has to agree with the
// matching short option: "no-combine" is a plain switch ('c' carries no ':' in shortopts and
// the usage text lists no argument), so it is declared with no_argument. Declaring it with
// required_argument would make --no-combine swallow the next command-line token.
static const struct option longopts[] = {
    { "run-name",   required_argument, NULL, 'n' },
    { "no-combine",   no_argument, NULL, 'c' },
    { "out-prefix",   required_argument, NULL, 'o' },
    { "region",   required_argument, NULL, 'r' },
    { "tree",   required_argument, NULL, 't' },
    { "JKwindow",   required_argument, NULL, 'j' },
    { "JKnum",   required_argument, NULL, 'k' },
    { "help",   no_argument, NULL, 'h' },
    { "no-f4-ratio",   no_argument, NULL, OPT_NO_F4 },
    { "use-genotype-probabilities", no_argument, NULL, 'g'},
    { "pool-seq", required_argument, NULL, 'p'},
    { "KS-test-for-homoplasy", no_argument , NULL, OPT_KS_TEST},
    { "ABBAclustering", no_argument , NULL, OPT_KS_TEST},
    { NULL, 0, NULL, 0 }
};

// Settings of the Dtrios subprogram, read by DminMain().
// NOTE(review): the option parser that fills these is not visible in this part of the file;
// the option letters named below are taken from the usage message and should be confirmed there.
namespace opt
{
    static string vcfFile;                  // VCF input; the literal value "stdin" makes DminMain() read std::cin
    static string setsFile;                 // SETS.txt file handed to SetInformation
    static string treeFile = "";            // optional tree file; only opened when non-empty
    static string runName = "";             // goes into the output file root
    static string providedOutPrefix = "";   // goes into the output file root
    static int jkWindowSize = 0;            // presumably -j/--JKwindow - TODO confirm
    static int jkNum = 20;                  // presumably -k/--JKnum - TODO confirm
    static int regionStart = -1;            // goes into the output file root
    static int regionLength = -1;           // goes into the output file root and into assignNumLinesToAnalyse()
    static int providedNumLines = -1;       // passed to assignNumLinesToAnalyse()
    static bool fStats = true;              // presumably cleared by --no-f4-ratio - TODO confirm
    static bool KStest = false;             // presumably set by --ABBAclustering - TODO confirm
    static bool useGenotypeProbabilities = false; // presumably -g - TODO confirm
    static bool poolSeq = false;            // presumably -p/--pool-seq - TODO confirm
    static int poolMinDepth;                // presumably the MIN_DEPTH given with -p - TODO confirm
    static bool combine = true;             // when true, DminMain() creates the "_combine.txt" and "_combine_stderr.txt" files
}


// Entry point of the trio analysis.
//
// Parses the command line, reads the population sets, enumerates every combination of three
// populations, and then streams through the VCF once. For each biallelic SNP it accumulates, per
// trio, the ABBA / BABA / BBAA pattern totals, the jackknife-block sums and (unless f-statistics
// are switched off) the f_G denominators. After the VCF has been read it writes one line per trio to
// "<root>_BBAA.txt" and "<root>_Dmin.txt", to "<root>_tree.txt" when a tree was supplied, and to
// "<root>_combine.txt" / "<root>_combine_stderr.txt" when opt::combine is set.
//
// Returns 0 on completion; exits the process if the tree file cannot be opened.
int DminMain(int argc, char** argv) {
    parseDminOptions(argc, argv);
    string line; // for reading the input files
    string outFileRoot = prepareOutFileRootString(opt::providedOutPrefix, opt::runName, opt::setsFile, opt::regionStart, opt::regionLength);
    std::ofstream* outFileTree = nullptr;
    std::map<string,std::vector<int>> treeTaxonNamesToLoc; std::vector<int> treeLevels;
    if (opt::treeFile != "") {
        // Only the first line of the tree file is needed, so the input stream is a local that closes itself
        std::ifstream treeFile(opt::treeFile.c_str());
        if (!treeFile.good()) { std::cerr << "The file " << opt::treeFile << " could not be opened. Exiting..." << std::endl; exit(1);}
        outFileTree = new std::ofstream(outFileRoot + "_tree.txt");
        getline(treeFile, line);
        assignTreeLevelsAndLinkToTaxa(line,treeTaxonNamesToLoc,treeLevels);
    }
    
    int VCFlineCount = assignNumLinesToAnalyse(opt::providedNumLines, opt::regionLength, opt::vcfFile);
    
    std::istream* vcfFile;
    if (opt::vcfFile == "stdin") {
        vcfFile = &std::cin;
    } else {
        vcfFile = createReader(opt::vcfFile.c_str());
    }
    
    // Get the sample sets
    SetInformation setInfo(opt::setsFile, MIN_SETS, OutgroupRequired);

    std::ofstream* outFileBBAA = new std::ofstream(outFileRoot+"_BBAA.txt"); assertFileOpen(*outFileBBAA, outFileRoot+"_BBAA.txt");
    std::ofstream* outFileDmin = new std::ofstream(outFileRoot+"_Dmin.txt"); assertFileOpen(*outFileDmin, outFileRoot+"_Dmin.txt");
    // The two "combine" files are only created on request; the pointers stay null otherwise
    std::ofstream* outFileCombine = nullptr; std::ofstream* outFileCombineStdErr = nullptr;
    if (opt::combine) {
        outFileCombine = new std::ofstream(outFileRoot+"_combine.txt");
        assertFileOpen(*outFileCombine, outFileRoot+"_combine.txt");
        outFileCombineStdErr = new std::ofstream(outFileRoot+"_combine_stderr.txt");
        assertFileOpen(*outFileCombineStdErr, outFileRoot+"_combine_stderr.txt");
    }
    
    int nCombinations = nChoosek((int)setInfo.populations.size(),3);
    if (opt::fStats) std::cerr << "Going to calculate D and f4-ratio values for " << nCombinations << " trios" << std::endl;
    else std::cerr << "Going to calculate D values for " << nCombinations << " trios" << std::endl;
    
    if (opt::treeFile != "") { // Check that the tree contains all the populations/species
        setInfo.checkIfTreeNamesMatch(treeTaxonNamesToLoc);
    }
    
    // first, get all combinations of three sets (species):
    std::vector<std::vector<string>> trios; trios.resize(nCombinations);
    std::vector<std::vector<int>> triosInt; triosInt.resize(nCombinations);
    std::vector<bool> v(setInfo.populations.size()); std::fill(v.begin(), v.begin() + 3, true); // prepare a selection vector
    int pNum = 0;
    do {
        for (int i = 0; i < (int)v.size(); ++i) {
            if (v[i]) { trios[pNum].push_back(setInfo.populations[i]); triosInt[pNum].push_back(i); }
        } pNum++;
    } while (std::prev_permutation(v.begin(), v.end())); // Getting all permutations of the selection vector - so it selects all combinations
    std::cerr << "Done permutations" << std::endl;
    
    // Create objects to hold the results for each trio
    std::vector<TrioDinfo> trioInfos(nCombinations);
    
    // And need to prepare the vectors to hold allele frequency values:
    std::vector<double> allPs(setInfo.populations.size(),0.0);
    std::vector<double> allSplit1Ps(setInfo.populations.size(),0.0); std::vector<int> allSplit1Counts(setInfo.populations.size(),0);
    std::vector<double> allSplit2Ps(setInfo.populations.size(),0.0); std::vector<int> allSplit2Counts(setInfo.populations.size(),0);
    std::vector<double> allCorrectionFactors(setInfo.populations.size(),0);
    
    int totalVariantNumber = 0;
    std::vector<std::string> fields;
    // Find out how often to report progress, based on the number of trios
    int reportProgressEvery; if (nCombinations < 1000) reportProgressEvery = 100000;
    else if (nCombinations < 100000) reportProgressEvery = 10000;
    else reportProgressEvery = 1000;
    clock_t start = clock();
    int JKblockSizeBasedOnNum = 0;
    
    while (getline(*vcfFile, line)) {
        line.erase(std::remove(line.begin(), line.end(), '\r'), line.end()); // Deal with any left over \r from files prepared on Windows
        if (line[0] == '#' && line[1] == '#') {
            if (opt::regionStart == -1) { VCFlineCount--; } continue;
        } else if (line[0] == '#' && line[1] == 'C') {
            if (opt::regionStart == -1) { VCFlineCount--; }
            // Guard the division: a jackknife block count of 0 must not crash the run
            JKblockSizeBasedOnNum = (opt::jkNum != 0) ? (VCFlineCount/opt::jkNum)-1 : 0;
            printInitialMessageTriosQuartets(opt::regionLength, VCFlineCount, JKblockSizeBasedOnNum, opt::jkWindowSize, opt::jkNum);
            fields = split(line, '\t');
            std::vector<std::string> sampleNames(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());
            setInfo.linkSetsAndVCFpositions(sampleNames);
        } else {
            totalVariantNumber++;
            if (opt::regionStart != -1) {
                if (totalVariantNumber < opt::regionStart)
                    continue;
                if (totalVariantNumber > (opt::regionStart+opt::regionLength)) {
                    std::cerr << "DONE" << std::endl; break;
                }
            }
            // Close a jackknife block every JKblockSizeBasedOnNum variants, but only when no fixed window
            // size was requested. The block size is 0 for very short inputs (or when no header line was
            // seen), and a modulo by zero is undefined, so that case is excluded before the '%'.
            if (opt::jkWindowSize == 0 && JKblockSizeBasedOnNum != 0 && totalVariantNumber % JKblockSizeBasedOnNum == 0) {
                for (int i = 0; i != (int)trios.size(); i++) {
                    trioInfos[i].addRegionDs(P3isTrios2); trioInfos[i].addRegionDs(P3isTrios1); trioInfos[i].addRegionDs(P3isTrios0);
                }
            }
            
            if (totalVariantNumber % reportProgressEvery == 0) reportProgessVCF(totalVariantNumber, VCFlineCount, start);
            
            fields = split(line, '\t'); checkGenotypesExist(fields, totalVariantNumber);
            std::vector<std::string> genotypes(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());

            // Only consider biallelic SNPs
            const string& refAllele = fields[3]; const string& altAllele = fields[4];
            if (refAllele.length() > 1 || altAllele.length() > 1 || altAllele == "*") continue;
            
            double p_O;
            if (opt::fStats)  {
                // f-statistics need the allele frequencies of two subsets ("splits") of every population
                GeneralSetCountsWithSplits* c = new GeneralSetCountsWithSplits(setInfo.popToPosMap, (int)genotypes.size());
                c->getSplitCountsNew(genotypes, setInfo.posToPopMap);
                
                if (opt::useGenotypeProbabilities) {
                    int likelihoodsOrProbabilitiesTagPosition = c->checkForGenotypeLikelihoodsOrProbabilities(fields);
                    if (likelihoodsOrProbabilitiesTagPosition == LikelihoodsProbabilitiesAbsent) {
                        // Fall back to called genotypes for this and all following variants
                        printMissingLikelihoodsWarning(fields[0], fields[1]);
                        opt::useGenotypeProbabilities = false;
                    } else c->getAFsFromGenotypeLikelihoodsOrProbabilitiesWithSplits(genotypes,setInfo.posToPopMap,likelihoodsOrProbabilitiesTagPosition, atoi(fields[1].c_str()));
                }
                
                if (opt::poolSeq) {
                    int ADtagPos = c->findADtagPosition(fields);
                    c->getAFsFromADtagWithSplits(genotypes, setInfo.popToPosMap, ADtagPos, opt::poolMinDepth);
                }
                
                p_O = c->setDAFs.at("Outgroup"); if (p_O == -1) { delete c; continue; } // We need to make sure that the outgroup is defined
                
                if (opt::useGenotypeProbabilities) {
                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {
                        try {
                            allPs[i] = c->setDAFsFromLikelihoods.at(setInfo.populations[i]);
                            allSplit1Ps[i] = c->setDAFsplit1fromLikelihoods.at(setInfo.populations[i]);
                            allSplit2Ps[i] = c->setDAFsplit2fromLikelihoods.at(setInfo.populations[i]);
                            allSplit1Counts[i] = c->setAlleleCountsSplit1fromLikelihoods.at(setInfo.populations[i]);
                            allSplit2Counts[i] = c->setAlleleCountsSplit2fromLikelihoods.at(setInfo.populations[i]);
                            if(allSplit1Ps[i] < 0) {
                                // Diagnostic dump for a negative split frequency
                                std::cerr << line << std::endl;
                                std::cerr << "setInfo.populations[i] " << setInfo.populations[i] << std::endl;
                                std::cerr << "allPs[i] " << allPs[i] << std::endl;
                                std::cerr << "allSplit1Ps[i] " << allSplit1Ps[i] << std::endl;
                                std::cerr << "allSplit2Ps[i] " << allSplit2Ps[i] << std::endl;
                            }
                        } catch (const std::out_of_range&) { std::cerr << "Counts are missing some info for " << setInfo.populations[i] << std::endl; }
                    }
                } else if (opt::poolSeq) {
                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {
                        try {
                            allPs[i] = c->setPoolDAFs.at(setInfo.populations[i]);
                            allSplit1Ps[i] = c->setPoolDAFsplit1.at(setInfo.populations[i]);
                            allSplit2Ps[i] = c->setPoolDAFsplit2.at(setInfo.populations[i]);
                            allSplit1Counts[i] = 1; allSplit2Counts[i] = 1;
                        } catch (const std::out_of_range&) { std::cerr << "Counts are missing some info for " << setInfo.populations[i] << std::endl; }
                    }
                } else {
                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {
                        try {
                            allPs[i] = c->setDAFs.at(setInfo.populations[i]);
                            allSplit1Ps[i] = c->setDAFsplit1.at(setInfo.populations[i]);
                            allSplit2Ps[i] = c->setDAFsplit2.at(setInfo.populations[i]);
                            allSplit1Counts[i] = c->setAlleleCountsSplit1.at(setInfo.populations[i]);
                            allSplit2Counts[i] = c->setAlleleCountsSplit2.at(setInfo.populations[i]);
                            allCorrectionFactors[i] = c->setCorrectionFactors.at(setInfo.populations[i]);
                        } catch (const std::out_of_range&) { std::cerr << "Counts are missing some info for " << setInfo.populations[i] << std::endl; }
                    }
                }
                delete c;
            } else {
                // Without f-statistics only the whole-population allele frequencies are needed
                GeneralSetCounts* c2 = new GeneralSetCounts(setInfo.popToPosMap, (int)genotypes.size());
                c2->getSetVariantCounts(genotypes, setInfo.posToPopMap);
                if (opt::useGenotypeProbabilities) {
                    int likelihoodsOrProbabilitiesTagPosition = c2->checkForGenotypeLikelihoodsOrProbabilities(fields);
                    if (likelihoodsOrProbabilitiesTagPosition == LikelihoodsProbabilitiesAbsent) {
                        printMissingLikelihoodsWarning(fields[0], fields[1]);
                        opt::useGenotypeProbabilities = false;
                    } else c2->getAFsFromGenotypeLikelihoodsOrProbabilities(genotypes,setInfo.posToPopMap,likelihoodsOrProbabilitiesTagPosition);
                }
                
                if (opt::poolSeq) {
                    int ADtagPos = c2->findADtagPosition(fields);
                    c2->getAFsFromADtag(genotypes,setInfo.popToPosMap,ADtagPos, opt::poolMinDepth);
                }
                
                p_O = c2->setDAFs.at("Outgroup"); if (p_O == -1) { delete c2; continue; } // We need to make sure that the outgroup is defined
                if (opt::useGenotypeProbabilities) {
                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {
                        try { allPs[i] = c2->setDAFsFromLikelihoods.at(setInfo.populations[i]); }
                        catch (const std::out_of_range&) { std::cerr << "Counts are missing some info for " << setInfo.populations[i] << std::endl; }
                    }
                } else if (opt::poolSeq) {
                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {
                        try { allPs[i] = c2->setPoolDAFs.at(setInfo.populations[i]); }
                        catch (const std::out_of_range&) { std::cerr << "Counts are missing some info for " << setInfo.populations[i] << std::endl; }
                    }
                } else {
                    for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {
                        try { allPs[i] = c2->setDAFs.at(setInfo.populations[i]); }
                        catch (const std::out_of_range&) { std::cerr << "Counts are missing some info for " << setInfo.populations[i] << std::endl; }
                    }
                }
                delete c2;
            }
            // The genotype strings are no longer needed; release them before the loop over all trios
            genotypes.clear(); genotypes.shrink_to_fit();
            
            // Now calculate the D stats:
            for (int i = 0; i != (int)trios.size(); i++) {
                const std::vector<int>& trio = triosInt[i];
                TrioDinfo& info = trioInfos[i];
                
                const double p_S1 = allPs[trio[0]];
                if (p_S1 == -1) continue;  // If any member of the trio has entirely missing data, just move on to the next trio
                const double p_S2 = allPs[trio[1]];
                if (p_S2 == -1) continue;
                const double p_S3 = allPs[trio[2]];
                if (p_S3 == -1) continue;
                if (p_S1 == 0 && p_S2 == 0 && p_S3 == 0) continue; // Checking if the SNP is variable in the trio
                if (p_S1 == 1 && p_S2 == 1 && p_S3 == 1) continue; // Checking if the SNP is variable in the trio
                
                double ABBA = (1-p_S1)*p_S2*p_S3*(1-p_O);
                double BABA = p_S1*(1-p_S2)*p_S3*(1-p_O);
                double BBAA = p_S1*p_S2*(1-p_S3)*(1-p_O);
                
                if (p_O != 0) {
                    // The outgroup also carries the other allele: add the mirrored patterns
                    const double BAAB = p_S1*(1-p_S2)*(1-p_S3)*p_O;
                    const double ABAB = (1-p_S1)*p_S2*(1-p_S3)*p_O;
                    const double AABB = (1-p_S1)*(1-p_S2)*p_S3*p_O;
                    
                    ABBA = ABBA + BAAB; BABA = BABA + ABAB; BBAA = BBAA + AABB;
                }
                
                info.ABBAtotal += ABBA; info.BABAtotal += BABA; info.BBAAtotal += BBAA;
                
                if (ABBA > 0.5 && (ABBA + BABA) == 0) {
                    std::cerr << "ABBA : " << ABBA << std::endl;
                    std::cerr << "BABA : " << BABA << std::endl;
                    std::cerr << "(ABBA + BABA): " << (ABBA + BABA) << std::endl;
                }
                // One running numerator/denominator pair per possible arrangement of the trio
                if ((ABBA + BABA) != 0) { info.usedVars[0]++; info.totalUsedVars[0]++;
                    info.localD1num += ABBA - BABA; info.localD1denom += ABBA + BABA; }
                if ((ABBA + BBAA) != 0) { info.usedVars[1]++; info.totalUsedVars[1]++;
                    info.localD2num += ABBA - BBAA; info.localD2denom += ABBA + BBAA; }
                if ((BBAA + BABA) != 0) { info.usedVars[2]++; info.totalUsedVars[2]++;
                    info.localD3num += BBAA - BABA; info.localD3denom += BBAA + BABA; }
                
                if (opt::KStest) {
                    // Record the positions of "strong" sites (pattern weight above 0.5), both as the
                    // variant number and as the index among the strong sites only
                    if (ABBA > 0.5) {
                        info.numStrongVars[0]++; info.numStrongVars[1]++;
                        info.linearStrongABBApos[0].push_back(totalVariantNumber);
                        info.linearStrongABBAposStrongSitesOnly[0].push_back(info.numStrongVars[0]);
                        info.linearStrongABBApos[1].push_back(totalVariantNumber);
                        info.linearStrongABBAposStrongSitesOnly[1].push_back(info.numStrongVars[1]);
                    }
                    if (BABA > 0.5) {
                        info.numStrongVars[0]++; info.numStrongVars[2]++;
                        info.linearStrongBABApos[0].push_back(totalVariantNumber);
                        info.linearStrongBABAposStrongSitesOnly[0].push_back(info.numStrongVars[0]);
                        info.linearStrongBABApos[2].push_back(totalVariantNumber);
                        info.linearStrongBABAposStrongSitesOnly[2].push_back(info.numStrongVars[2]);
                    }
                    if (BBAA > 0.5) {
                        info.numStrongVars[1]++; info.numStrongVars[2]++;
                        info.linearStrongABBApos[2].push_back(totalVariantNumber);
                        info.linearStrongABBAposStrongSitesOnly[2].push_back(info.numStrongVars[2]);
                        info.linearStrongBABApos[1].push_back(totalVariantNumber);
                        info.linearStrongBABAposStrongSitesOnly[1].push_back(info.numStrongVars[1]);
                    }
                }
                
                if (opt::fStats) {
                    // f_G denominators, using the two split frequencies of each population
                    const double p_S3a = allSplit1Ps[trio[2]]; const double p_S3b = allSplit2Ps[trio[2]];
                    const double p_S2a = allSplit1Ps[trio[1]]; const double p_S2b = allSplit2Ps[trio[1]];
                    const double p_S1a = allSplit1Ps[trio[0]]; const double p_S1b = allSplit2Ps[trio[0]];
                    
                    assert(p_S1a >= 0); assert(p_S1b >= 0);
                    assert(p_S2a >= 0); assert(p_S2b >= 0);
                    assert(p_S3a >= 0); assert(p_S3b >= 0);
                    
                    info.F_G_denom1 += fG_Denom_perVariant(p_S1,p_S3a,p_S3b,p_O);
                    info.F_G_denom1_reversed += fG_Denom_perVariant(p_S2,p_S3a,p_S3b,p_O);
                    info.F_G_denom2 += fG_Denom_perVariant(p_S1,p_S2a,p_S2b,p_O);
                    info.F_G_denom2_reversed += fG_Denom_perVariant(p_S3,p_S2a,p_S2b,p_O);
                    info.F_G_denom3 += fG_Denom_perVariant(p_S3,p_S1a,p_S1b,p_O);
                    info.F_G_denom3_reversed += fG_Denom_perVariant(p_S2,p_S1a,p_S1b,p_O);
                    
                    if (p_O != 0) {
                        // Same contributions with the allele roles swapped
                        info.F_G_denom1 += fG_Denom_perVariant(1-p_S1,1-p_S3a,1-p_S3b,1-p_O);
                        info.F_G_denom1_reversed += fG_Denom_perVariant(1-p_S2,1-p_S3a,1-p_S3b,1-p_O);
                        info.F_G_denom2 += fG_Denom_perVariant(1-p_S1,1-p_S2a,1-p_S2b,1-p_O);
                        info.F_G_denom2_reversed += fG_Denom_perVariant(1-p_S3,1-p_S2a,1-p_S2b,1-p_O);
                        info.F_G_denom3 += fG_Denom_perVariant(1-p_S3,1-p_S1a,1-p_S1b,1-p_O);
                        info.F_G_denom3_reversed += fG_Denom_perVariant(1-p_S2,1-p_S1a,1-p_S1b,1-p_O);
                    }
                }
                
                // With a fixed window size, a block is closed per arrangement once it holds enough used variants
                if (opt::jkWindowSize > 0) {
                    if (info.usedVars[0] == opt::jkWindowSize) { info.addRegionDs(P3isTrios2); }
                    if (info.usedVars[1] == opt::jkWindowSize) { info.addRegionDs(P3isTrios1); }
                    if (info.usedVars[2] == opt::jkWindowSize) { info.addRegionDs(P3isTrios0); }
                }
            }
        }
    }
    std::cerr << "Done processing VCF. Preparing output files..." << '\n';
    
    string header = makeHeader(false, opt::fStats, opt::KStest);
    *outFileBBAA << header << std::endl; *outFileDmin << header << std::endl;
    if (opt::treeFile != "") *outFileTree << header << std::endl;
    
    int exceptionCount = 0; // only the first 10 failures are reported individually
    for (int i = 0; i != (int)trios.size(); i++) {
        // Get the D values
        try {
            trioInfos[i].calculateFinalDs();
        } catch (const char* msg) {
            exceptionCount++;
            if (exceptionCount <= 10) {
                std::cerr << msg << std::endl;
                std::cerr << "Could not calculate p-values for the trio: " << trios[i][0] << " " << trios[i][1] << " " << trios[i][2] << std::endl;
                if (opt::jkWindowSize > 0) std::cerr << "You should probably decrease the the jackknife block size (-j option)" << std::endl;
                else std::cerr << "it looks like there aren't enough ABBA-BABA informative variants for this trio" << std::endl;
                std::cerr << std::endl;
            }
            trioInfos[i].D1_p = nan(""); trioInfos[i].D2_p = nan(""); trioInfos[i].D3_p = nan("");
        }
        
        // Find which topology is in agreement with the counts of BBAA, BABA, and ABBA
        trioInfos[i].assignBBAAarrangement();
        std::vector<string> BBAAoutVec = trioInfos[i].makeOutVec(trios[i], opt::fStats, opt::KStest, trioInfos[i].BBAAarrangement);
        print_vector(BBAAoutVec,*outFileBBAA);
        
        // Find Dmin:
        trioInfos[i].assignDminArrangement();
        std::vector<string> DminOutVec = trioInfos[i].makeOutVec(trios[i], opt::fStats, opt::KStest, trioInfos[i].DminArrangement);
        print_vector(DminOutVec,*outFileDmin);
        
        // Find which arrangement of trios is consistent with the input tree (if provided):
        if (opt::treeFile != "") {
            int loc1 = treeTaxonNamesToLoc[trios[i][0]][0];
            int loc2 = treeTaxonNamesToLoc[trios[i][1]][0];
            int loc3 = treeTaxonNamesToLoc[trios[i][2]][0];
            trioInfos[i].treeArrangement = trioInfos[i].assignTreeArrangement(treeLevels, loc1, loc2, loc3);
            std::vector<string> treeOutVec = trioInfos[i].makeOutVec(trios[i], opt::fStats, opt::KStest, trioInfos[i].treeArrangement);
            print_vector(treeOutVec,*outFileTree);
        }
        
        // Output a simple file that can be used for combining multiple local runs:
        if (opt::combine) {
            *outFileCombine << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2] << "\t" << trioInfos[i].BBAAtotal << "\t" << trioInfos[i].BABAtotal << "\t" << trioInfos[i].ABBAtotal;
            if (opt::fStats) {
                *outFileCombine << "\t" << trioInfos[i].F_G_denom1 << "\t" << trioInfos[i].F_G_denom2 << "\t" << trioInfos[i].F_G_denom3;
                *outFileCombine << "\t" << trioInfos[i].F_G_denom1_reversed << "\t" << trioInfos[i].F_G_denom2_reversed << "\t" << trioInfos[i].F_G_denom3_reversed;
                *outFileCombine << std::endl;
            } else {
                *outFileCombine << std::endl;
            }
            print_vector(trioInfos[i].regionDs[0], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << "\t"; print_vector(trioInfos[i].regionDs[1], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << "\t";
            print_vector(trioInfos[i].regionDs[2], *outFileCombineStdErr, ',',false); *outFileCombineStdErr << std::endl;
        }
    }
    if (exceptionCount > 10) {
        std::cerr << "..." << std::endl;
        std::cerr << "p-value could not be calculated for " << exceptionCount << " trios" << std::endl;
        if (opt::jkWindowSize > 0) std::cerr << "You should probably decrease the the jackknife block size (-j option)" << std::endl;
        else std::cerr << "it looks like there aren't enough ABBA-BABA informative variants for these trios" << std::endl;
        std::cerr << "If this was a run for a subset of the genome (e.g. one chromosome), you may still get p-values for these trios from DtriosCombine" << std::endl;
        std::cerr << std::endl;
    }
    
    // Close and release the output streams allocated above (deleting a null pointer is a no-op)
    delete outFileBBAA; delete outFileDmin; delete outFileTree;
    delete outFileCombine; delete outFileCombineStdErr;
    return 0;
    
}



// Parse the command line into the file-local opt:: settings.
//
// Exactly two positional arguments are expected after the options: the VCF file and the SETS file.
// The usage message is printed and the process exits with EXIT_FAILURE on an unknown option, a
// malformed --region value, the incompatible -p/-g combination, a wrong number of positional
// arguments, or a "stdin" VCF without a line count (-l). With -h the usage message is printed and
// the process exits with EXIT_SUCCESS.
void parseDminOptions(int argc, char** argv) {
    bool die = false; string regionArgString; std::vector<string> regionArgs;
    // getopt_long() returns an int; keeping it in a char breaks the comparison with -1 on platforms
    // where plain char is unsigned and would truncate long-only option values.
    for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
    {
        std::istringstream arg(optarg != NULL ? optarg : "");
        switch (c)
        {
            case '?': die = true; break;
            case 'n': arg >> opt::runName; break;
            case 't': arg >> opt::treeFile; break;
            case 'j': arg >> opt::jkWindowSize; break;
            case 'k': arg >> opt::jkNum; break;
            case OPT_NO_F4: opt::fStats = false; break;
            case OPT_KS_TEST: opt::KStest = true; break;
            case 'c': opt::combine = false; break;
            case 'g': opt::useGenotypeProbabilities = true; break;
            case 'l': arg >> opt::providedNumLines; break;
            case 'o': arg >> opt::providedOutPrefix; break;
            case 'p': opt::poolSeq = true; arg >> opt::poolMinDepth; break;
            case 'r': arg >> regionArgString; regionArgs = split(regionArgString, ',');
                if (regionArgs.size() != 2) {
                    std::cerr << "the --region argument should be two numbers separated by a comma\n";
                    die = true;
                } else {
                    opt::regionStart = (int)stringToDouble(regionArgs[0]); opt::regionLength = (int)stringToDouble(regionArgs[1]);
                }
                // Always leave the case here: a malformed value must not fall through into -h,
                // which would print the help text and exit reporting success.
                break;
            case 'h':
                std::cout << DMIN_USAGE_MESSAGE;
                exit(EXIT_SUCCESS);
        }
    }
    
    int maxNumArgs = 2; int minNumArgs = 2; // if (opt::poolSeq) { minNumArgs = 1; }
    
    if (opt::poolSeq && opt::useGenotypeProbabilities) {
        std::cerr << "Error: The -p and -g options are not compatible. Please check your command line. Exiting ....\n";
        die = true;
    }
    
    if (argc - optind < minNumArgs) {
        std::cerr << "missing arguments\n";
        die = true;
    }
    else if (argc - optind > maxNumArgs)
    {
        std::cerr << "too many arguments\n";
        die = true;
    }
    
    // Stop before touching argv[optind] when the positional arguments are not both present
    if (die) {
        std::cout << "\n" << DMIN_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }
    
    // Parse the input filenames
    opt::vcfFile = argv[optind++];
    opt::setsFile = argv[optind++];
    
    if (opt::vcfFile == "stdin" && opt::providedNumLines <= 0) {
        std::cerr << "If you want to read the VCF via a pipe, you need to specify the number of lines in the input via the -l option\n";
        std::cerr << "See the example above\n";
        die = true;
    }
    
    if (die) {
        std::cout << "\n" << DMIN_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }
}



================================================
FILE: Dmin.h
================================================
//
//  Dmin.h
//  Dsuite
//
//  Created by Milan Malinsky on 02/04/2019.
//

#ifndef Dmin_h
#define Dmin_h
#include "Dsuite_utils.h"


// Parses the command line (options followed by the VCF and SETS file names) into the
// subprogram's opt:: settings; prints the usage message and exits on invalid input or -h.
void parseDminOptions(int argc, char** argv);
// Entry point of the subprogram configured by parseDminOptions().
// NOTE(review): presumably returns 0 when the run completes - confirm against the definition in Dmin.cpp.
int DminMain(int argc, char** argv);



#endif /* Dmin_h */


================================================
FILE: Dmin_combine.cpp
================================================
//
//  Dmin_combine.cpp
//  Dsuite
//
//  Created by Milan Malinsky on 11/04/2019.
//

#include "Dmin_combine.h"
#include "Dsuite_common.h"

#define SUBPROGRAM "DtriosCombine"

#define DEBUG 1

// Help text of the DtriosCombine subprogram; printed for -h and, after a blank line, whenever the
// command line is rejected by parseDminCombineOptions().
// PROGRAM_BIN and PACKAGE_BUGREPORT are macros defined outside this file.
static const char *DMINCOMBINE_USAGE_MESSAGE =
"Usage: " PROGRAM_BIN " " SUBPROGRAM " [OPTIONS] DminFile1 DminFile2 DminFile3 ....\n"
"Combine the BBAA, ABBA, and BABA counts from multiple files (e.g per-chromosome) and output the overall D stats,\n"
"p-values and f4-ratio values\n"
"\n"
"       -h, --help                              display this help and exit\n"
"       -o, --out-prefix=OUT_FILE_PREFIX        (optional) the prefix for the files where the results should be written\n"
"                                               output will be put in OUT_FILE_PREFIX_combined_BBAA.txt, OUT_FILE_PREFIX_combined_Dmin.txt, OUT_FILE_PREFIX_combined_tree.txt etc.\n"
"                                               by default, the prefix is \"out\"\n"
"       -n, --run-name                          (optional) run-name will be included in the output file name after the PREFIX\n"
"       -t , --tree=TREE_FILE.nwk               (optional) a file with a tree in the newick format specifying the relationships between populations/species\n"
"                                               D and f4-ratio values for trios arranged according to the tree will be output in a file with _tree.txt suffix\n"
"       -s , --subset=start,length              (optional) only process a subset of the trios\n"
"\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";


// Short options accepted by DtriosCombine; a letter followed by ':' takes an argument.
static const char* shortopts = "hn:t:s:o:";

// Long spellings of the same options; each entry maps to the short-option character that
// parseDminCombineOptions() switches on. The all-zero entry terminates the table.
static const struct option longopts[] = {
    { "subset",   required_argument, NULL, 's' },
    { "out-prefix",   required_argument, NULL, 'o' },
    { "run-name",   required_argument, NULL, 'n' },
    { "tree",   required_argument, NULL, 't' },
    { "help",   no_argument, NULL, 'h' },
    { NULL, 0, NULL, 0 }
};

// Command-line settings of DtriosCombine, filled in by parseDminCombineOptions().
// Every member is static (file-local): other source files define their own namespace opt with
// same-purpose settings, so nothing here should have external linkage.
namespace opt
{
    // Input prefixes given as positional arguments; "_combine.txt" and "_combine_stderr.txt"
    // (optionally with ".gz") are appended to each to find the files to read
    static std::vector<string> dminFiles;
    // Prefix of the output files (-o)
    static string providedOutPrefix = "out";
    // Optional run name used when building the output file names (-n)
    static string runName = "";
    // Optional tree file (-t); empty means no "_combined_tree.txt" output
    static string treeFile = "";
    // First trio to process and number of trios to process (-s start,length); -1 means all trios
    static int subsetStart = -1;
    static int subsetLength = -1;
}


// Entry point of the DtriosCombine subprogram.
//
// For every prefix given on the command line, two files are read (plain text or .gz):
//   PREFIX_combine.txt         one trio per line: three names, the BBAA/BABA/ABBA totals and,
//                              optionally, six f4-ratio denominators (6 or 12 tab-separated columns)
//   PREFIX_combine_stderr.txt  one trio per line: three tab-separated lists of comma-separated
//                              regional D values
// All files are consumed in lockstep, one line (= one trio) at a time. The totals are summed over the
// files, the regional D values are pooled, and the resulting statistics are written to
// outFileRoot + "_combined_BBAA.txt", "_combined_Dmin.txt" and, if a tree was supplied,
// "_combined_tree.txt".
//
// Returns 0 when all trios (or the requested subset) have been processed; exits the process when an
// input file is missing or malformed.
int DminCombineMain(int argc, char** argv) {
    parseDminCombineOptions(argc, argv);
    const bool KStestPossible = false;
    
    string line; // for reading the input files
    
    
    string outFileRoot = prepareOutFileRootString(opt::providedOutPrefix, opt::runName, "", -1, -1);
    
    // Open both input files of every prefix; the uncompressed name is tried before the .gz one
    std::vector<std::istream*> dminstdErrFiles; std::vector<std::istream*> dminBBAAscoreFiles;
    for (size_t i = 0; i < opt::dminFiles.size(); i++) {
        std::istream* dminBBAAscoreFile;
        if (file_exists(opt::dminFiles[i] + "_combine.txt")) {
            dminBBAAscoreFile = createReader((opt::dminFiles[i] + "_combine.txt").c_str());
        } else if(file_exists(opt::dminFiles[i] + "_combine.txt.gz")) {
            dminBBAAscoreFile = createReader((opt::dminFiles[i] + "_combine.txt.gz").c_str());
        } else {
            std::cerr << "Can't find the file: " << opt::dminFiles[i] + "_combine.txt" << " or " << opt::dminFiles[i] + "_combine.txt.gz. Exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }
        dminBBAAscoreFiles.push_back(dminBBAAscoreFile);
        std::istream* dminstdErrFile;
        if (file_exists(opt::dminFiles[i] + "_combine_stderr.txt")) {
            dminstdErrFile = createReader((opt::dminFiles[i] + "_combine_stderr.txt").c_str());
        } else if(file_exists(opt::dminFiles[i] + "_combine_stderr.txt.gz")) {
            dminstdErrFile = createReader((opt::dminFiles[i] + "_combine_stderr.txt.gz").c_str());
        } else {
            std::cerr << "Can't find the file: " << opt::dminFiles[i] + "_combine_stderr.txt" << " or " << opt::dminFiles[i] + "_combine_stderr.txt.gz. Exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }
        dminstdErrFiles.push_back(dminstdErrFile);
        std::cerr << "Reading file " << opt::dminFiles[i] << std::endl;
    }
    
    // Only the first line of the tree file is needed, so the stream lives on the stack and closes itself
    std::ofstream* outFileTree = NULL;
    std::map<string,std::vector<int>> treeTaxonNamesToLoc; std::vector<int> treeLevels;
    if (opt::treeFile != "") {
        std::ifstream treeStream(opt::treeFile.c_str());
        if (!treeStream.good()) { std::cerr << "The file " << opt::treeFile << " could not be opened. Exiting..." << std::endl; exit(1);}
        outFileTree = new std::ofstream(outFileRoot + "_combined_tree.txt");
        getline(treeStream, line);
        assignTreeLevelsAndLinkToTaxa(line,treeTaxonNamesToLoc,treeLevels);
    }
    std::ofstream* outFileBBAA = new std::ofstream(outFileRoot + "_combined_BBAA.txt"); std::ofstream* outFileDmin = new std::ofstream(outFileRoot + "_combined_Dmin.txt");
    
    string s1; string s2; string s3;
    bool allDone = false; bool fIncluded = false;
    int processedTriosNumber = 0; int exceptionCount = 0;
    
    // Peek at the first line of the first counts file: 12 columns mean that the f4-ratio denominators are present
    string firstCountsLine;
    bool firstCountsLinePending = static_cast<bool>(getline(*dminBBAAscoreFiles[0], firstCountsLine));
    if (split(firstCountsLine, '\t').size() == 12) fIncluded = true;
    string header = makeHeader(false,fIncluded,KStestPossible);
    *outFileBBAA << header << std::endl; *outFileDmin << header << std::endl;
    if (opt::treeFile != "") *outFileTree << header << std::endl;
    
    // Fetch the next line of the given counts file. The peeked line of file 0 is handed out again
    // instead of rewinding the stream, because seekg() fails on streams that are not seekable.
    auto nextCountsLine = [&](size_t fileIndex, string& out) -> bool {
        if (fileIndex == 0 && firstCountsLinePending) {
            firstCountsLinePending = false; out = firstCountsLine;
            return true;
        }
        return static_cast<bool>(getline(*dminBBAAscoreFiles[fileIndex], out));
    };
    // Columns every counts line must have before it is indexed below
    const size_t minCountColumns = fIncluded ? 12 : 6;
    
    do {
        TrioDinfo info; processedTriosNumber++;
        if (processedTriosNumber % 10000 == 0) { std::cerr << "Processed " << processedTriosNumber << " trios" << std::endl; }
        
        if (opt::subsetStart != -1) {
            if (processedTriosNumber < opt::subsetStart) {
                // Before the requested subset: consume this trio's line from every file and move on
                for (size_t i = 0; i < dminBBAAscoreFiles.size(); i++) { nextCountsLine(i, line); }
                for (size_t i = 0; i < dminstdErrFiles.size(); i++) { getline(*dminstdErrFiles[i], line); }
                continue;
            }
            if (processedTriosNumber >= (opt::subsetStart+opt::subsetLength)) {
                std::cerr << "DONE" << std::endl; break;
            }
        }
        
        
        // Sum the pattern counts (and f4-ratio denominators) of this trio over all input files
        for (size_t i = 0; i < dminBBAAscoreFiles.size(); i++) {
            if (nextCountsLine(i, line)) {
                std::vector<string> patternCounts = split(line, '\t');
                if (patternCounts.size() < minCountColumns) {
                    // Without this check the indexing below would read past the end of the vector,
                    // e.g. when only some of the inputs were produced with f4-ratio columns
                    std::cerr << "Line " << processedTriosNumber << " of the combine file for " << opt::dminFiles[i] << " has " << patternCounts.size() << " columns, but at least " << minCountColumns << " were expected. Exiting..." << std::endl;
                    exit(EXIT_FAILURE);
                }
                assert(patternCounts.size() == 6 || patternCounts.size() == 12);

                if (i == 0) {
                    s1 = patternCounts[0]; s2 = patternCounts[1]; s3 = patternCounts[2];
                } else {
                    // Every file must list the same trio on the same line
                    assert(s1 == patternCounts[0]); assert(s2 == patternCounts[1]); assert(s3 == patternCounts[2]);
                }
                info.BBAAtotal += stringToDouble(patternCounts[3]);
                info.BABAtotal += stringToDouble(patternCounts[4]);
                info.ABBAtotal += stringToDouble(patternCounts[5]);
                if (fIncluded) {
                    info.F_G_denom1 += stringToDouble(patternCounts[6]);
                    info.F_G_denom2 += stringToDouble(patternCounts[7]);
                    info.F_G_denom3 += stringToDouble(patternCounts[8]);
                    info.F_G_denom1_reversed += stringToDouble(patternCounts[9]);
                    info.F_G_denom2_reversed += stringToDouble(patternCounts[10]);
                    info.F_G_denom3_reversed += stringToDouble(patternCounts[11]);
                }
            } else {
                allDone = true; break;
            }
        }
        
        // Pool the regional D values of this trio over all input files, dropping NaN entries
        for (size_t i = 0; i < dminstdErrFiles.size(); i++) {
            if (getline(*dminstdErrFiles[i], line)) {
                std::vector<string> localDs = split2(line, "\t");
                //assert(localDs.size() == 3 || localDs.size() == 0);
                if (localDs.size() == 3) {
                    for (int k = 0; k < 3; k++) {
                        std::vector<string> regionD_strings = split(localDs[k], ',');
                        for (size_t j = 0; j < regionD_strings.size(); j++) {
                            double localD = stringToDouble(regionD_strings[j]);
                            if (!std::isnan(localD)) info.regionDs[k].push_back(localD);
                        }
                    }
                } else {
                    print_vector(localDs,std::cerr); exit(EXIT_FAILURE);
                }
            } else {
                allDone = true; break;
            }
        }
        
        
        if (!allDone) {
            try {
                info.calculateFinalDs();
            } catch (const char* msg) {
                exceptionCount++;
                // Only the first ten failures are reported individually; the rest are summarised at the end
                if (exceptionCount <= 10) {
                    std::cerr << msg << std::endl;
                    std::cerr << "Could not calculate p-values for the trio: " << s1 << " " << s2 << " " << s3 << std::endl;
                    std::cerr << "You should probably decrease the the jackknife block size (-j option)" << std::endl;
                    std::cerr << std::endl;
                }
                info.D1_p = nan(""); info.D2_p = nan(""); info.D3_p = nan("");
            }
            
            std::vector<string> trio; trio.push_back(s1); trio.push_back(s2); trio.push_back(s3);
            // Find which topology is in agreement with the counts of BBAA, BABA, and ABBA
            info.assignBBAAarrangement();
            std::vector<string> BBAAoutVec = info.makeOutVec(trio, fIncluded,KStestPossible, info.BBAAarrangement);
            print_vector(BBAAoutVec,*outFileBBAA);
           
            // Find Dmin:
            info.assignDminArrangement();
            std::vector<string> DminOutVec = info.makeOutVec(trio, fIncluded, KStestPossible,info.DminArrangement);
            print_vector(DminOutVec,*outFileDmin);
            
            if (opt::treeFile != "") {
                // operator[] would insert an empty vector for a name missing from the tree, and
                // reading element 0 of it is undefined behaviour - so verify the names first
                for (size_t t = 0; t < trio.size(); t++) {
                    std::map<string,std::vector<int>>::const_iterator taxon = treeTaxonNamesToLoc.find(trio[t]);
                    if (taxon == treeTaxonNamesToLoc.end() || taxon->second.empty()) {
                        std::cerr << "The population/species " << trio[t] << " could not be found in the tree file " << opt::treeFile << ". Exiting..." << std::endl;
                        exit(EXIT_FAILURE);
                    }
                }
                int loc1 = treeTaxonNamesToLoc[s1][0]; int loc2 = treeTaxonNamesToLoc[s2][0]; int loc3 = treeTaxonNamesToLoc[s3][0];
                info.treeArrangement = info.assignTreeArrangement(treeLevels, loc1, loc2, loc3);
                std::vector<string> treeOutVec = info.makeOutVec(trio, fIncluded, KStestPossible, info.treeArrangement);
                print_vector(treeOutVec,*outFileTree);
            }
        }
 
    } while(!allDone);
    
    if (exceptionCount > 10) {
        std::cerr << "..." << std::endl;
        std::cerr << "p-value could not be calculated for " << exceptionCount << " trios" << std::endl;
        std::cerr << "You should definitely decrease the the jackknife block size!!!" << std::endl;
        std::cerr << std::endl;
    }
    
    // Destroying the output streams flushes and closes the result files (deleting NULL is a no-op)
    delete outFileBBAA; delete outFileDmin; delete outFileTree;
    
    return 0;
    
}



void parseDminCombineOptions(int argc, char** argv) {
    bool die = false; string subsetArgString; std::vector<string> subsetArgs;
    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
    {
        std::istringstream arg(optarg != NULL ? optarg : "");
        switch (c)
        {
            case '?': die = true; break;
            case 'n': arg >> opt::runName; break;
            case 't': arg >> opt::treeFile; break;
            case 'o': arg >> opt::providedOutPrefix; break;
            case 's': arg >> subsetArgString; subsetArgs = split(subsetArgString, ',');
                opt::subsetStart = (int)stringToDouble(subsetArgs[0]); opt::subsetLength = (int)stringToDouble(subsetArgs[1]);  break;
            case 'h':
                std::cout << DMINCOMBINE_USAGE_MESSAGE;
                exit(EXIT_SUCCESS);
        }
    }
    
    
    int nFilenames = argc - optind;
    if (nFilenames < 1) {
        std::cerr << "missing arguments\n";
        die = true;
    }
    
    if (die) {
        std::cout << "\n" << DMINCOMBINE_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }
    
    // Parse the input filenames
    while (optind < argc) {
        opt::dminFiles.push_back(argv[optind++]);
    }
}


================================================
FILE: Dmin_combine.h
================================================
//
//  Dmin_combine.h
//  Dsuite
//
//  Created by Milan Malinsky on 11/04/2019.
//

#ifndef Dmin_combine_h
#define Dmin_combine_h

#include "Dsuite_utils.h"

// Parses the DtriosCombine command line: options first, then one or more input prefixes.
// Prints the usage message and exits on invalid input or -h.
void parseDminCombineOptions(int argc, char** argv);
// Entry point of DtriosCombine: sums the per-trio counts of all input prefixes and writes the
// combined BBAA, Dmin and (optionally) tree-arranged result files. Returns 0 on completion.
int DminCombineMain(int argc, char** argv);

#endif /* Dmin_combine_h */


================================================
FILE: Dquartets.cpp
================================================
//
//  Dquartets.cpp
//  DsuiteXcode
//
//  Created by Milan Malinsky on 14/07/2020.
//

#include "Dquartets.h"
#include "Dsuite_common.h"

#define SUBPROGRAM "Dquartets"

#define DEBUG 0
#define MIN_SETS 4

// Help text of the Dquartets subprogram.
// stdInInfo, regionOption, treeOption and outOption are macros defined outside this file that
// splice in help text shared between subprograms (the trailing comments name the option each covers).
// NOTE(review): the text below contains the typos "definded" and "posible"; left untouched here
// because it is user-visible runtime output.
static const char *DQUARTS_USAGE_MESSAGE =
"Usage: " PROGRAM_BIN " " SUBPROGRAM " [OPTIONS] INPUT_FILE.vcf SETS.txt\n"
"Calculate the D (ABBA/BABA) and f4-ratio (f_G) statistics for all quartets of species in the dataset (there is no outgroup)\n"
"The results are as definded in Patterson et al. 2012\n"
"The SETS.txt should have two columns: SAMPLE_ID    SPECIES_ID\n"
"\n"
stdInInfo
"       -h, --help                              display this help and exit\n"
"       -k, --JKnum                             (default=20) the number of Jackknife blocks to divide the dataset into; should be at least 20 for the whole dataset\n"
"       -j, --JKwindow                          (default=NA) Jackknife block size in number of informative SNPs (as used in v0.2)\n"
"                                               when specified, this is used in place of the --JKnum option\n"
regionOption    // -r
treeOption      // -t
outOption       // -o
"       -n, --run-name                          (optional; default=quartets) run-name will be included in the output file name after the PREFIX\n"
"       --no-f4-ratio                           (optional) don't calculate the f4-ratio\n"
"       -l NUMLINES                             (optional) the number of lines in the VCF input - required if reading the VCF via a unix pipe\n"
"       -a, --allF4-ratios                      (optional) output F4 ratios for all posible arrangements\n"
"\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";

// Value getopt_long() returns for the long-only option --no-f4-ratio (it has no single-letter form).
// As the first enumerator without an initialiser its value is 0.
enum { OPT_NO_F4 };
// Short options accepted by Dquartets; a letter followed by ':' takes an argument.
// NOTE(review): 'f' and 'p' are accepted here but have no entry in longopts or in the usage
// message - confirm whether the option parser still handles them.
static const char* shortopts = "hr:n:t:j:fpk:l:o:a";

// Long options, each mapped to the value getopt_long() returns for it. The all-zero entry
// terminates the table.
static const struct option longopts[] = {
    { "run-name",   required_argument, NULL, 'n' },
    { "out-prefix",   required_argument, NULL, 'o' },
    { "region",   required_argument, NULL, 'r' },
    { "tree",   required_argument, NULL, 't' },
    { "JKwindow",   required_argument, NULL, 'j' },
    { "JKnum",   required_argument, NULL, 'k' },
    { "help",   no_argument, NULL, 'h' },
    { "no-f4-ratio",   no_argument, NULL, OPT_NO_F4 },
    { "allF4-ratios",   no_argument, NULL, 'a' },
    { NULL, 0, NULL, 0 }
};

// Command-line settings of the Dquartets subprogram with their defaults; all file-local.
namespace opt
{
    // Input VCF; the special value "stdin" makes DquartetsMain read from std::cin
    static string vcfFile;
    // File assigning samples to sets; handed to SetInformation
    static string setsFile;
    // Optional tree file; empty means no "_tree.txt" output
    static string treeFile = "";
    // Run name used when building the output file names
    static string runName = "quartets";
    // Output prefix passed to prepareOutFileRootString()
    static string providedOutPrefix = "";
    // Jackknife block size; while 0, regional Ds are added every (VCF line count / jkNum) - 1 variants instead
    static int jkWindowSize = 0;
    // Number of jackknife blocks the VCF line count is divided by
    static int jkNum = 20;
    // First variant and number of variants to analyse; -1 means no region restriction
    static int regionStart = -1;
    static int regionLength = -1;
    // User-supplied number of VCF lines, passed to assignNumLinesToAnalyse()
    static int providedNumLines = -1;
    // Whether the f4-ratio denominators are computed in addition to the D statistics
    static bool fStats = true;
    // NOTE(review): presumably set by -a/--allF4-ratios; not used in the part of the file visible here
    static bool allF4 = false;
}


int DquartetsMain(int argc, char** argv) {
    parseDquartetsOptions(argc, argv);
    string line; // for reading the input files
    string outFileRoot = prepareOutFileRootString(opt::providedOutPrefix, opt::runName, opt::setsFile, opt::regionStart, opt::regionLength);
    
    std::istream* treeFile; std::ofstream* outFileTree;
    std::map<string,std::vector<int>> treeTaxonNamesToLoc; std::vector<int> treeLevels;
    if (opt::treeFile != "") {
        treeFile = new std::ifstream(opt::treeFile.c_str());
        if (!treeFile->good()) { std::cerr << "The file " << opt::treeFile << " could not be opened. Exiting..." << std::endl; exit(1);}
        outFileTree = new std::ofstream(outFileRoot+ "_" + opt::runName + "_tree.txt");
        getline(*treeFile, line);
        assignTreeLevelsAndLinkToTaxa(line,treeTaxonNamesToLoc,treeLevels);
        //for (std::map<string,std::vector<int>>::iterator it = treeTaxonNamesToLoc.begin(); it != treeTaxonNamesToLoc.end(); ++it) {
        //    std::cout << "{" << it->first << "}\n";
        // }
    }
    
    int VCFlineCount = assignNumLinesToAnalyse(opt::providedNumLines, opt::regionLength, opt::vcfFile);
    
    std::istream* vcfFile;
    if (opt::vcfFile == "stdin") { vcfFile = &std::cin; }
    else { vcfFile = createReader(opt::vcfFile.c_str()); }
    
    // Get the sample sets
    SetInformation setInfo(opt::setsFile, MIN_SETS, OutgroupNotRequired);
    
    std::ofstream* outFileBBAA = new std::ofstream(outFileRoot+"_BBAA.txt"); assertFileOpen(*outFileBBAA, outFileRoot+"_BBAA.txt");
    std::ofstream* outFileDmin = new std::ofstream(outFileRoot+"_Dmin.txt"); assertFileOpen(*outFileDmin, outFileRoot+"_Dmin.txt");
    std::ofstream* outFileCombine = new std::ofstream(outFileRoot+"_combine.txt"); assertFileOpen(*outFileCombine, outFileRoot+"_combine.txt");
    std::ofstream* outFileCombineStdErr = new std::ofstream(outFileRoot+"_combine_stderr.txt");
    assertFileOpen(*outFileCombineStdErr, outFileRoot+"_combine_stderr.txt");

    
    int nCombinations = nChoosek((int)setInfo.populations.size(),4);
    if (opt::fStats) std::cerr << "Going to calculate D and f4-ratio values for " << nCombinations << " quartets" << std::endl;
    else std::cerr << "Going to calculate D values for " << nCombinations << " quartets" << std::endl;
    
    if (opt::treeFile != "") { // Check that the tree contains all the populations/species
        setInfo.checkIfTreeNamesMatch(treeTaxonNamesToLoc);
    }
    
    // first, get all combinations of four sets (species):
    std::vector<std::vector<string>> quartets; quartets.resize(nCombinations);
    std::vector<std::vector<int>> quartetsInt; quartetsInt.resize(nCombinations);
    std::vector<bool> v(setInfo.populations.size()); std::fill(v.begin(), v.begin() + 4, true); // prepare a selection vector
    int pNum = 0;
    do {
        for (int i = 0; i < v.size(); ++i) {
            if (v[i]) { quartets[pNum].push_back(setInfo.populations[i]); quartetsInt[pNum].push_back(i); }
        } pNum++;
    } while (std::prev_permutation(v.begin(), v.end())); // Getting all permutations of the selection vector - so it selects all combinations
    std::cerr << "Done permutations" << std::endl;
    
    // Create objects to hold the results for each quartet
    std::vector<QuartetDinfo> quartetInfos(nCombinations); for (int i = 0; i < nCombinations; i++) {
        QuartetDinfo info; quartetInfos[i] = info;
    }
    
    // If a tree was supplied, check the tree arrangement for each trio...
    if (opt::treeFile != "") {
        for (int i = 0; i != quartets.size(); i++) {
            int loc1 = treeTaxonNamesToLoc[quartets[i][0]][0];
            int loc2 = treeTaxonNamesToLoc[quartets[i][1]][0];
            int loc3 = treeTaxonNamesToLoc[quartets[i][2]][0];
            int loc4 = treeTaxonNamesToLoc[quartets[i][3]][0];
            quartetInfos[i].treeArrangement = quartetInfos[i].assignQuartetTreeArrangement(treeLevels, loc1, loc2, loc3,loc4);
        }
    }
    
    // And need to prepare the vectors to hold allele frequency values:
    std::vector<double> allPs(setInfo.populations.size(),0.0);
    std::vector<double> allSplit1Ps(setInfo.populations.size(),0.0); std::vector<int> allSplit1Counts(setInfo.populations.size(),0);
    std::vector<double> allSplit2Ps(setInfo.populations.size(),0.0); std::vector<int> allSplit2Counts(setInfo.populations.size(),0);
    
    int totalVariantNumber = 0;
    std::vector<string> sampleNames; std::vector<std::string> fields;
    // Find out how often to report progress, based on the number of trios
    int reportProgressEvery; if (nCombinations < 1000) reportProgressEvery = 100000;
    else if (nCombinations < 100000) reportProgressEvery = 10000;
    else reportProgressEvery = 1000;
    clock_t start; clock_t startGettingCounts; clock_t startCalculation;
    double durationOverall; double durationGettingCounts; double durationCalculation;
    int JKblockSizeBasedOnNum = 0;
    
    while (getline(*vcfFile, line)) {
        line.erase(std::remove(line.begin(), line.end(), '\r'), line.end()); // Deal with any left over \r from files prepared on Windows
        if (line[0] == '#' && line[1] == '#') {
            VCFlineCount--; continue;
        } else if (line[0] == '#' && line[1] == 'C') {
            VCFlineCount--; JKblockSizeBasedOnNum = (VCFlineCount/opt::jkNum)-1;
            printInitialMessageTriosQuartets(opt::regionLength, VCFlineCount, JKblockSizeBasedOnNum, opt::jkWindowSize, opt::jkNum);
            fields = split(line, '\t');
            std::vector<std::string> sampleNames(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());
            setInfo.linkSetsAndVCFpositions(sampleNames);
            start = clock();
            //  std::cerr << " " << std::endl;
            //  std::cerr << "Outgroup at pos: "; print_vector_stream(speciesToPosMap["Outgroup"], std::cerr);
        } else {
            totalVariantNumber++;
            if (opt::regionStart != -1) {
                if (totalVariantNumber < opt::regionStart)
                    continue;
                if (totalVariantNumber > (opt::regionStart+opt::regionLength)) {
                    std::cerr << "DONE" << std::endl; break;
                }
            }
            if (totalVariantNumber % JKblockSizeBasedOnNum == 0 && opt::jkWindowSize == 0) {
                for (int i = 0; i != quartets.size(); i++) {
                    quartetInfos[i].addRegionDs(P3isTrios2); quartetInfos[i].addRegionDs(P3isTrios1); quartetInfos[i].addRegionDs(P3isTrios0);
                }
            }
            if (totalVariantNumber % reportProgressEvery == 0) {
                durationOverall = ( clock() - start ) / (double) CLOCKS_PER_SEC;
                std::cerr << "Processed " << totalVariantNumber << " variants (" << ((double)totalVariantNumber/VCFlineCount)*100 << "%) in " << durationOverall << "secs" << std::endl;
                //std::cerr << "GettingCounts " << durationGettingCounts << " calculation " << durationCalculation << "secs" << std::endl;
            }
            fields = split(line, '\t');
            std::vector<std::string> genotypes(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());

            // Only consider biallelic SNPs
            string refAllele = fields[3]; string altAllele = fields[4];
            if (refAllele.length() > 1 || altAllele.length() > 1 || altAllele == "*") {
                refAllele.clear(); refAllele.shrink_to_fit(); altAllele.clear(); altAllele.shrink_to_fit();
                genotypes.clear(); genotypes.shrink_to_fit(); continue;
            }
            
            startGettingCounts = clock();
            if (opt::fStats)  {
                GeneralSetCountsWithSplits* c = new GeneralSetCountsWithSplits(setInfo.popToPosMap, (int)genotypes.size());
                c->getSplitCountsNew(genotypes, setInfo.posToPopMap);
                for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {
                    try {
                        allPs[i] = c->setAAFs.at(setInfo.populations[i]);
                        allSplit1Ps[i] = c->setAAFsplit1.at(setInfo.populations[i]);
                        allSplit2Ps[i] = c->setAAFsplit2.at(setInfo.populations[i]);
                        allSplit1Counts[i] = c->setAlleleCountsSplit1.at(setInfo.populations[i]);
                        allSplit2Counts[i] = c->setAlleleCountsSplit2.at(setInfo.populations[i]);
                       // std::cerr << "species[i] " << species[i] << "; allPs[i] " << allPs[i] << " ; c->setDAFs[species[i]] " << c->setDAFs[0] << std::endl;
                    } catch (const std::out_of_range& oor) {
                        std::cerr << "Counts are missing some info for " << setInfo.populations[i] << std::endl;
                    }
                }
                delete c;
            } else {
                GeneralSetCounts* c = (GeneralSetCountsWithSplits*) new GeneralSetCounts(setInfo.popToPosMap, (int)genotypes.size());
                c->getSetVariantCounts(genotypes, setInfo.posToPopMap);
                for (std::vector<std::string>::size_type i = 0; i != setInfo.populations.size(); i++) {
                    allPs[i] = c->setAAFs.at(setInfo.populations[i]);
                 //   std::cerr << "species[i] " << species[i] << "; allPs[i] " << allPs[i] << std::endl;
                }
                delete c;
            }
            genotypes.clear(); genotypes.shrink_to_fit();
            durationGettingCounts = ( clock() - startGettingCounts ) / (double) CLOCKS_PER_SEC;
            
            startCalculation = clock();
            // Now calculate the D stats:
            double p_S1; double p_S2; double p_S3; double p_S4; double ABBA; double BABA; double BBAA; double BAAB; double ABAB; double AABB;
            for (int i = 0; i != quartets.size(); i++) {
                p_S1 = allPs[quartetsInt[i][0]];
             //   std::cerr << "p_S1 " << p_S1 << std::endl;
                if (p_S1 == -1) continue;  // If any member of the trio has entirely missing data, just move on to the next trio
                p_S2 = allPs[quartetsInt[i][1]];
             //   std::cerr << "p_S2 " << p_S2 << std::endl;
                if (p_S2 == -1) continue;
                p_S3 = allPs[quartetsInt[i][2]];
             //   std::cerr << "p_S3 " << p_S3 << std::endl;
                if (p_S3 == -1) continue;
                p_S4 = allPs[quartetsInt[i][3]];
             //   std::cerr << "p_S4 " << p_S4 << std::endl;
                if (p_S4 == -1) continue;
                
                if (p_S1 == 0 && p_S2 == 0 && p_S3 == 0) continue; // Checking if the SNP is variable in the trio
                if (p_S1 == 0 && p_S2 == 0 && p_S4 == 0) continue; // Checking if the SNP is variable in the trio
                if (p_S1 == 0 && p_S3 == 0 && p_S4 == 0) continue; // Checking if the SNP is variable in the trio
                if (p_S2 == 0 && p_S3 == 0 && p_S4 == 0) continue; // Checking if the SNP is variable in the trio
                
                if (p_S1 == 1 && p_S2 == 1 && p_S3 == 1) continue; // Checking if the SNP is variable in the trio
                if (p_S1 == 1 && p_S2 == 1 && p_S4 == 1) continue; // Checking if the SNP is variable in the trio
                if (p_S1 == 1 && p_S3 == 1 && p_S4 == 1) continue; // Checking if the SNP is variable in the trio
                if (p_S2 == 1 && p_S3 == 1 && p_S4 == 1) continue; // Checking if the SNP is variable in the trio
                
                if (p_S4 != 1) {
                    ABBA = (1-p_S1)*p_S2*p_S3*(1-p_S4); quartetInfos[i].ABBAtotal += ABBA;
                    BABA = p_S1*(1-p_S2)*p_S3*(1-p_S4); quartetInfos[i].BABAtotal += BABA;
                    BBAA = p_S1*p_S2*(1-p_S3)*(1-p_S4); quartetInfos[i].BBAAtotal += BBAA;
                    if ((ABBA + BABA) != 0) { quartetInfos[i].usedVars[0]++; quartetInfos[i].localD1num += ABBA - BABA; quartetInfos[i].localD1denom += ABBA + BABA; }
                    if ((ABBA + BBAA) != 0) { quartetInfos[i].usedVars[1]++; quartetInfos[i].localD2num += ABBA - BBAA; quartetInfos[i].localD2denom += ABBA + BBAA; }
                    if ((BBAA + BABA) != 0) { quartetInfos[i].usedVars[2]++; quartetInfos[i].localD3num += BBAA - BABA; quartetInfos[i].localD3denom += BBAA + BABA; }
                }
                if (p_S4 != 0) {
                    BAAB = p_S1*(1-p_S2)*(1-p_S3)*p_S4; quartetInfos[i].ABBAtotal += BAAB;
                    ABAB = (1-p_S1)*p_S2*(1-p_S3)*p_S4; quartetInfos[i].BABAtotal += ABAB;
                    AABB = (1-p_S1)*(1-p_S2)*p_S3*p_S4; quartetInfos[i].BBAAtotal += AABB;
                    if (BAAB + ABAB != 0)  { quartetInfos[i].localD1num += BAAB - ABAB; quartetInfos[i].localD1denom += BAAB + ABAB; }
                    if (BAAB + AABB != 0)  { quartetInfos[i].localD2num += BAAB - AABB; quartetInfos[i].localD2denom += BAAB + AABB; }
                    if (AABB + ABAB != 0)  { quartetInfos[i].localD3num += AABB - ABAB; quartetInfos[i].localD3denom += AABB + ABAB; }
                }
                
                if (opt::fStats) {
                    
                    double p_S1a = allSplit1Ps[quartetsInt[i][0]]; double p_S1b = allSplit2Ps[quartetsInt[i][0]];
                    double p_S2a = allSplit1Ps[quartetsInt[i][1]]; double p_S2b = allSplit2Ps[quartetsInt[i][1]];
                    double p_S3a = allSplit1Ps[quartetsInt[i][2]]; double p_S3b = allSplit2Ps[quartetsInt[i][2]];
                    double p_S4a = allSplit1Ps[quartetsInt[i][3]]; double p_S4b = allSplit2Ps[quartetsInt[i][3]];
                    
                    /* Orientation 1: F4(P1, P2; P3, P4)
                     ----------------------------------
                     These are the different denominators with 'a' and 'b' being the subsamples
                     1) F4(P1, P3a; P3b, P4) ----- F_G_denom1 --- (p_S1,p_S3a,p_S3b,p_S4)
                     2) F4(P1, P2a; P2b, P4) ----- F_G_denom2 --- (p_S1,p_S2a,p_S2b,p_S4)
                     3) F4(P1a, P2; P3, P1b)
                     4) F4(P4a, P2; P3, P4b) */
                    quartetInfos[i].F_G_denoms[0] += f4_perVariant(p_S1,p_S3a,p_S3b,p_S4);
                    quartetInfos[i].F_G_denoms[1] += f4_perVariant(p_S1,p_S2a,p_S2b,p_S4);
                    quartetInfos[i].F_G_denoms[2] += f4_perVariant(p_S1a,p_S2,p_S3,p_S1b);
                    quartetInfos[i].F_G_denoms[3] += f4_perVariant(p_S4a,p_S2,p_S3,p_S4b);
                    
                    /* Orientation 1b: F4(P2, P1; P3, P4)
                    ----------------------------------   Same as Orientation 3
                    5) F4(P2, P1a; P1b, P4)
                    6) F4(P2, P3a; P3b, P4)
                    7) F4(P2a, P1; P3, P2b)
                    8) F4(P4a, P1; P3, P4b) */
                    quartetInfos[i].F_G_denoms[4] += f4_perVariant(p_S2,p_S1a,p_S1b,p_S4);
                    quartetInfos[i].F_G_denoms[5] += f4_perVariant(p_S2,p_S3a,p_S3b,p_S4);
                    quartetInfos[i].F_G_denoms[6] += f4_perVariant(p_S2a,p_S1,p_S3,p_S2b);
                    quartetInfos[i].F_G_denoms[7] += f4_perVariant(p_S4a,p_S1,p_S3,p_S4b);
                    
                    /* Orientation 2: F4(P1, P3; P2, P4)
                     ----------------------------------
                     9) F4(P1, P3a; P3b, P4) - a duplicate of 1)
                     10) F4(P1, P2a; P2b, P4) - a duplicate of 2)
                     11) F4(P1a, P3; P2, P1b)
                     12) F4(P4a, P3; P2, P4b) */
                    quartetInfos[i].F_G_denoms[8] += f4_perVariant(p_S1,p_S3a,p_S3b,p_S4);
                    quartetInfos[i].F_G_denoms[9] += f4_perVariant(p_S1,p_S2a,p_S2b,p_S4);
                    quartetInfos[i].F_G_denoms[10] += f4_perVariant(p_S1a,p_S3,p_S2,p_S1b);
                    quartetInfos[i].F_G_denoms[11] += f4_perVariant(p_S4a,p_S3,p_S2,p_S4b);
                    
                    /* Orientation 2b: F4(P3, P1; P2, P4)
                     ----------------------------------
                     13) F4(P3, P1a; P1b, P4) ---- F_G_denom3 ---   (p_S3,p_S1a,p_S1b,p_S4)
                     14) F4(P3, P2a; P2b, P4) ---- F_G_denom2_reversed --- (p_S3,p_S2a,p_S2b,p_S4)
                     15) F4(P3a, P1; P2, P3b) ----
                     16) F4(P4a, P1; P2, P4b) ---- */
                    quartetInfos[i].F_G_denoms[12] += f4_perVariant(p_S3,p_S1a,p_S1b,p_S4);
                    quartetInfos[i].F_G_denoms[13] += f4_perVariant(p_S3,p_S2a,p_S2b,p_S4);
                    quartetInfos[i].F_G_denoms[14] += f4_perVariant(p_S3a,p_S1,p_S2,p_S3b);
                    quartetInfos[i].F_G_denoms[15] += f4_perVariant(p_S4a,p_S1,p_S2,p_S4b);
                    
                    /* Orientation 3: F4(P1, P4; P2, P3)
                     ---------------------------------- Same as Orientation 1b
                     17) F4(P1, P4a; P4b, P3) - a duplicate of 8)
                     18) F4(P1a, P4; P2, P1b) - a duplicate of 5) ---- F_G_denom3_reversed --- (p_S2,p_S1a,p_S1b,p_S4)
                     19) F4(P1, P2a; P2b, P3) - a duplicate of 7)
                     20) F4(P3a, P4; P2, P3b) - a duplicate of 6) ---- F_G_denom1_reversed --- (p_S2,p_S3a,p_S3b,p_S4) */
                    quartetInfos[i].F_G_denoms[16] += f4_perVariant(p_S1,p_S4a,p_S4b,p_S3);
                    quartetInfos[i].F_G_denoms[17] += f4_perVariant(p_S1a,p_S4,p_S2,p_S1b);
                    quartetInfos[i].F_G_denoms[18] += f4_perVariant(p_S1,p_S2a,p_S2b,p_S3);
                    quartetInfos[i].F_G_denoms[19] += f4_perVariant(p_S3a,p_S4,p_S2,p_S3b);
                    
                    /* Orientation 3b: F4(P4, P1; P2, P3)
                     ----------------------------------
                     21) F4(P4, P1a; P1b, P3) - a duplicate of 13)
                     22) F4(P4, P2a; P2b, P3) - a duplicate of 14)
                     23) F4(P3a, P1; P2, P3b) - a duplicate of 15)
                     24) F4(P4a, P1; P2, P4b) - a duplicate of 16) */
                    quartetInfos[i].F_G_denoms[20] += f4_perVariant(p_S4,p_S1a,p_S1b,p_S3);
                    quartetInfos[i].F_G_denoms[21] += f4_perVariant(p_S4,p_S2a,p_S2b,p_S3);
                    quartetInfos[i].F_G_denoms[22] += f4_perVariant(p_S3a,p_S1,p_S2,p_S3b);
                    quartetInfos[i].F_G_denoms[23] += f4_perVariant(p_S4a,p_S1,p_S2,p_S4b);
                    
                   // Original version
                    quartetInfos[i].F_G_denom1 += f4_perVariant(p_S1,p_S3a,p_S3b,p_S4);
                    quartetInfos[i].F_G_denom1_reversed += f4_perVariant(p_S2,p_S3a,p_S3b,p_S4);
                    quartetInfos[i].F_G_denom2 += f4_perVariant(p_S1,p_S2a,p_S2b,p_S4);
                    quartetInfos[i].F_G_denom2_reversed += f4_perVariant(p_S3,p_S2a,p_S2b,p_S4);
                    quartetInfos[i].F_G_denom3 += f4_perVariant(p_S3,p_S1a,p_S1b,p_S4);
                    quartetInfos[i].F_G_denom3_reversed += f4_perVariant(p_S2,p_S1a,p_S1b,p_S4);
                }
                
                // std::cerr << "trioInfos[i].localD1num" << trioInfos[i].localD1denom << std::endl;
                if (opt::jkWindowSize > 0) {
                    if (quartetInfos[i].usedVars[0] == opt::jkWindowSize) { quartetInfos[i].addRegionDs(P3isTrios2); }
                    if (quartetInfos[i].usedVars[1] == opt::jkWindowSize) { quartetInfos[i].addRegionDs(P3isTrios1); }
                    if (quartetInfos[i].usedVars[2] == opt::jkWindowSize) { quartetInfos[i].addRegionDs(P3isTrios0); }
                }
                // } */
            }
            durationCalculation = ( clock() - startCalculation ) / (double) CLOCKS_PER_SEC;
        }
    }
    std::cerr << "Done processing VCF. Preparing output files..." << '\n';
    
    string header = makeHeader(true, opt::fStats,false);
    *outFileDmin << header << std::endl;
    if(opt::allF4) {
        header += "\tF_G_denom1\tF_G_denom2\tF_G_denom3\tF_G_denom4";
    }
    if (opt::treeFile != "") *outFileTree << header << std::endl;
    *outFileBBAA << header << std::endl;
    
    int exceptionCount = 0;
    for (int i = 0; i != quartets.size(); i++) { //
        // Get the D values
        try {
            /*std::cerr << "Here..." << '\n';
            std::cerr << "quartetInfos[i]." << quartetInfos[i].ABBAtotal << '\n';
            std::cerr << "quartetInfos[i]." << quartetInfos[i].BBAAtotal << '\n';
            std::cerr << "quartetInfos[i]." << quartetInfos[i].BABAtotal << '\n'; */
            quartetInfos[i].calculateFinalDs();
        } catch (const char* msg) {
            exceptionCount++;
            if (exceptionCount <= 10) {
                std::cerr << msg << std::endl;
                std::cerr << "Could not calculate p-values for the quartet: " << quartets[i][0] << " " << quartets[i][1] << " " << quartets[i][2] << " " << quartets[i][3]<< std::endl;
                if (opt::jkWindowSize > 0) std::cerr << "You should probably decrease the the jackknife block size (-j option)" << std::endl;
                else std::cerr << "it looks like there aren't enough ABBA-BABA informative variants for this quartet" << std::endl;
                std::cerr << std::endl;
            }
            quartetInfos[i].D1_p = nan(""); quartetInfos[i].D2_p = nan(""); quartetInfos[i].D3_p = nan("");
        }
       // std::cerr << "Here..." << '\n';
        
        // Find which topology is in agreement with the counts of BBAA, BABA, and ABBA
        quartetInfos[i].assignBBAAarrangement();
        std::vector<string> BBAAoutVec = quartetInfos[i].makeOutVec(quartets[i], opt::fStats, quartetInfos[i].BBAAarrangement, opt::allF4);
       // std::cerr << "quartetInfos[i].BBAAarrangement: " << quartetInfos[i].BBAAarrangement << std::endl;
        print_vector(BBAAoutVec,*outFileBBAA);
        
        // Find Dmin:
        quartetInfos[i].assignDminArrangement();
       // std::cerr << "quartetInfos[i].DminArrangement " << quartetInfos[i].DminArrangement << '\n';
        std::vector<string> DminOutVec = quartetInfos[i].makeOutVec(quartets[i], opt::fStats, quartetInfos[i].DminArrangement);
        print_vector(DminOutVec,*outFileDmin);
        
        // Find which arrangement of trios is consistent with the input tree (if provided):
        if (opt::treeFile != "") {
       //     std::cerr << "quartetInfos[i].treeArrangement " << quartetInfos[i].treeArrangement << '\n';
            std::vector<string> treeOutVec = quartetInfos[i].makeOutVec(quartets[i], opt::fStats, quartetInfos[i].treeArrangement, opt::allF4);
            print_vector(treeOutVec,*outFileTree);
        }
        
        // Output a simple file that can be used for combining multiple local runs:
        *outFileCombine << quartets[i][0] << "\t" << quartets[i][1] << "\t" << quartets[i][2] << "\t" << quartetInfos[i].BBAAtotal << "\t" << quartetInfos[i].BABAtotal << "\t" << quartetInfos[i].ABBAtotal;
        if (opt::fStats) {
            *outFileCombine << "\t" << quartetInfos[i].F_G_denom1 << "\t" << quartetInfos[i].F_G_denom2 << "\t" << quartetInfos[i].F_G_denom3;
            *outFileCombine << "\t" << quartetInfos[i].F_G_denom1_reversed << "\t" << quartetInfos[i].F_G_denom2_reversed << "\t" << quartetInfos[i].F_G_denom3_reversed;
            *outFileCombine << std::endl;
        } else {
            *outFileCombine << std::endl;
        }
        print_vector(quartetInfos[i].regionDs[0], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << "\t"; print_vector(quartetInfos[i].regionDs[1], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << "\t";
        print_vector(quartetInfos[i].regionDs[2], *outFileCombineStdErr, ',',false); *outFileCombineStdErr << std::endl;
        
        //std::cerr << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2] << "\t" << D1 << "\t" << D2 << "\t" << D3 << "\t" << BBAAtotals[i] << "\t" << BABAtotals[i] << "\t" << ABBAtotals[i] << std::endl;
    }
    if (exceptionCount > 10) {
        std::cerr << "..." << std::endl;
        std::cerr << "p-value could not be calculated for " << exceptionCount << " quartets" << std::endl;
        if (opt::jkWindowSize > 0) std::cerr << "You should probably decrease the the jackknife block size (-j option)" << std::endl;
        else std::cerr << "it looks like there aren't enough ABBA-BABA informative variants for these quartets" << std::endl;
       // std::cerr << "If this was a run for a subset of the genome (e.g. one chromosome), you may still get p-values for these quartets from DtriosCombine" << std::endl;
        std::cerr << std::endl;
    }
    return 0;
    
}


void parseDquartetsOptions(int argc, char** argv) {
    bool die = false; string regionArgString; std::vector<string> regionArgs;
    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
    {
        std::istringstream arg(optarg != NULL ? optarg : "");
        switch (c)
        {
            case '?': die = true; break;
            case 'n': arg >> opt::runName; break;
            case 't': arg >> opt::treeFile; break;
            case 'j': arg >> opt::jkWindowSize; break;
            case 'k': arg >> opt::jkNum; break;
            case 'o': arg >> opt::providedOutPrefix; break;
            case OPT_NO_F4: opt::fStats = false; break;
            case 'l': arg >> opt::providedNumLines; break;
            case 'a': opt::allF4 = true; break;
            case 'r': arg >> regionArgString; regionArgs = split(regionArgString, ',');
                opt::regionStart = (int)stringToDouble(regionArgs[0]); opt::regionLength = (int)stringToDouble(regionArgs[1]);  break;
            case 'h':
                std::cout << DQUARTS_USAGE_MESSAGE;
                exit(EXIT_SUCCESS);
        }
    }
    
    if (argc - optind < 2) {
        std::cerr << "missing arguments\n";
        die = true;
    }
    else if (argc - optind > 2)
    {
        std::cerr << "too many arguments\n";
        die = true;
    }
    
    if (die) {
        std::cout << "\n" << DQUARTS_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }
    
    // Parse the input filenames
    opt::vcfFile = argv[optind++];
    opt::setsFile = argv[optind++];
    
    if (opt::vcfFile == "stdin" && opt::providedNumLines <= 0) {
        std::cerr << "If you want to read the VCF via a pipe, you need to specify the number of lines in the input via the -l option\n";
        std::cerr << "See the example above\n";
        die = true;
    }
    
    if (die) {
        std::cout << "\n" << DQUARTS_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }
}


================================================
FILE: Dquartets.h
================================================
//
//  Dquartets.h
//  DsuiteXcode
//
//  Created by Milan Malinsky on 14/07/2020.
//

#ifndef Dquartets_h
#define Dquartets_h
#include "Dsuite_utils.h"

// Parses the Dquartets command line (options plus the VCF and SETS file names)
void parseDquartetsOptions(int argc, char** argv);
// Entry point of the Dquartets subcommand; receives the command line with the program name stripped
int DquartetsMain(int argc, char** argv);

#endif /* Dquartets_h */


================================================
FILE: Dsuite.cpp
================================================
//
//  Dsuite.cpp
//  Dsuite
//
//  Created by Milan Malinsky on 02/04/2019.
//

#include <iostream>
#include "Dsuite_utils.h"
#include "Dmin.h"
#include "D.h"
#include "Dmin_combine.h"
#include "Dsuite_fBranch.h"
#include "Dquartets.h"

#define AUTHOR "Milan Malinsky"
#define PACKAGE_VERSION "0.5 r58"


// Text printed by main() for the "version" / "--version" commands
static const char *VERSION_MESSAGE =
"Dsuite software Version " PACKAGE_VERSION "\n"
"Written by Milan Malinsky.\n"
"\n";

// Top-level help text: printed by main() when no command is given or for "help" / "--help" / "-h".
// PROGRAM_BIN and PACKAGE_BUGREPORT are not defined in this file (presumably they come from Dsuite_utils.h).
static const char *USAGE_MESSAGE =
"Program: " PROGRAM_BIN "\n"
"Version: " PACKAGE_VERSION "\n"
"Contact: " AUTHOR " [" PACKAGE_BUGREPORT "]\n"
"Usage: " PROGRAM_BIN " <command> [options]\n\n"
"Commands:\n"
"           Dtrios                  Calculate D (ABBA-BABA) and f4-ratio statistics for all possible trios of populations/species\n"
"           DtriosCombine           Combine results from Dtrios runs across genomic regions (e.g. per-chromosome)\n"
"           Dinvestigate            Follow up analyses for trios with significantly elevated D:\n"
"                                   calculates f_d, f_dM, and d_f in windows along the genome\n"
"           Fbranch                 Calculate D and f statistics for branches on a tree that relates the populations/species\n"
"\n"
"Experimental:\n"
"           Dquartets               Calculate D (ABBA-BABA) and f4-ratio statistics for all possible quartets of populations/species\n"
"                                   (no outgroup specified)\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";

// Program entry point: dispatches on the first command-line word to the matching subcommand.
// The subcommand receives the command line without the program name (argc - 1, argv + 1).
// Returns 1 for an unrecognised command and 0 otherwise.
int main(int argc, char **argv) {
    
    // Without a command there is nothing to run: show the general help
    if (argc <= 1) {
        std::cout << USAGE_MESSAGE;
        return 0;
    }
    
    const std::string command(argv[1]);
    
    const bool helpRequested = (command == "help" || command == "--help" || command == "-h");
    if (helpRequested) {
        std::cout << USAGE_MESSAGE;
        return 0;
    }
    
    const bool versionRequested = (command == "version" || command == "--version");
    if (versionRequested) {
        std::cout << VERSION_MESSAGE;
        return 0;
    }
    
    // Arguments as seen by the subcommand
    const int subArgc = argc - 1;
    char** subArgv = argv + 1;
    
    if (command == "Dinvestigate") {
        abbaBabaMain(subArgc, subArgv);
    } else if (command == "Dtrios") {
        DminMain(subArgc, subArgv);
    } else if (command == "DtriosCombine") {
        DminCombineMain(subArgc, subArgv);
    } else if (command == "Fbranch") {
        fBranchMain(subArgc, subArgv);
    } else if (command == "Dquartets") {
        DquartetsMain(subArgc, subArgv);
    } else {
        std::cerr << "Unrecognized command: " << command << "\n";
        return 1;
    }
    return 0;
}



================================================
FILE: Dsuite_common.cpp
================================================
//
//  Dsuite_common.cpp
//  DsuiteXcode
//
//  Created by Milan Malinsky on 21/07/2020.
//

#include "Dsuite_common.h"



void SetInformation::linkSetsAndVCFpositions(const std::vector<std::string>& sampleNames) {
    // print_vector_stream(sampleNames, std::cerr);
    for (std::vector<std::string>::size_type i = 0; i != sampleNames.size(); i++) {
        try { posToPopMap[i] = IDsToPopMap.at(sampleNames[i]); } catch (const std::out_of_range& oor) {
            std::cerr << "WARNING: The sample " << sampleNames[i] << " is in the VCF but not assigned in the SETS.txt file" << std::endl;
        }
    }
    // Iterate over all the keys in the map to find the samples in the VCF:
    // Give an error if no sample is found for a species:
    for(std::map<string, std::vector<string>>::const_iterator it = popToIDsMap.begin(); it != popToIDsMap.end(); ++it) {
        string sp =  it->first;
        //std::cerr << "sp " << sp << std::endl;
        std::vector<string> IDs = it->second;
        std::vector<size_t> spPos = locateSet(sampleNames, IDs); 
        if (spPos.empty()) {
            std::cerr << "Did not find any samples in the VCF for \"" << sp << "\"" << std::endl;
            assert(!spPos.empty());
        }
        popToPosMap[sp] = spPos;
    }
}

void SetInformation::checkIfTreeNamesMatch(std::map<string,std::vector<int>>& treeTaxonNamesToLoc) {
        for (int i = 0; i != populations.size(); i++) {
            try { treeTaxonNamesToLoc.at(populations[i]);
            } catch (const std::out_of_range& oor) {
                std::cerr << "Out of Range error: " << oor.what() << '\n';
                std::cerr << "species[i]: " << populations[i] << '\n';
                std::cerr << CHECK_TREE_ERROR_MSG << '\n';
                exit(1);
    }}
}


// Builds the tab-separated header line of the output tables.
//   quartet        - add a P4 column after P1..P3
//   includeFstats  - add the F4HEADER column
//   includeKSstats - add the clustering_sensitive and clustering_robust columns
// The BBAA/ABBA/BABA columns always come last.
string makeHeader(bool quartet, bool includeFstats, bool includeKSstats) {
    string columns("P1\tP2\tP3");
    if (quartet) {
        columns += "\tP4";
    }
    columns += "\tDstatistic\tZ-score\tp-value";
    if (includeFstats) {
        columns += "\t";
        columns += F4HEADER;
    }
    if (includeKSstats) {
        columns += "\tclustering_sensitive\tclustering_robust";
    }
    columns += "\tBBAA\tABBA\tBABA";
    return columns;
}

// Composes the common prefix of the output file names.
// The root is providedPrefix, or the SETS file name passed through stripExtension() when no
// prefix was provided. Without a region (regionStart == -1) the run name, if any, is appended
// after an underscore; with a region, the run name and the first and one-past-last variant
// numbers of the region are appended, each after an underscore.
string prepareOutFileRootString(const string& providedPrefix, const string& runName, const string& setsFileName, const int regionStart, const int regionLength) {
    string outRoot;
    if (providedPrefix.empty()) {
        outRoot = stripExtension(setsFileName);
    } else {
        outRoot = providedPrefix;
    }
    
    const bool wholeFile = (regionStart == -1);
    if (!wholeFile) {
        return outRoot+"_"+runName+"_"+numToString(regionStart)+"_"+numToString(regionStart+regionLength);
    }
    if (runName.empty()) {
        return outRoot;
    }
    return outRoot + "_" + runName;
}

// Warns on std::cerr that the variant at (chr, pos) carries no genotype likelihood/probability
// field and that called genotypes are used instead.
//   chr - chromosome/scaffold name of the variant, as text
//   pos - position of the variant, as text
void printMissingLikelihoodsWarning(const std::string& chr, const std::string& pos) {
    std::cerr << "WARNING: Could not find genotype likelihoods/probabilities (GP, PL, or GL fields) for variant at " << chr << " " << pos << std::endl;
    std::cerr << "WARNING: Did you really mean to use the -g option? Reverting to using called genotypes." << std::endl;
}

// Reports on std::cerr that `duplicate` occurs more than once in the tree, then exits with status 1.
void duplicateTreeValueError(const string& duplicate) {
    std::cerr << "ERROR: Duplicate value in the tree \"" << duplicate << "\"\n"
              << "Exiting\n";
    exit(1);
}

// Prints the start-up summary on std::cerr: the number of variants to be analysed and,
// when no jackknife window size was given (jkWindowSizeOpt == 0), the block size that
// will be used to obtain jkNumOpt jackknife blocks.
void printInitialMessageTriosQuartets(const int regionLengthOpt, const int VCFlineCount, const int JKblockSizeBasedOnNum, const int jkWindowSizeOpt, const int jkNumOpt) {
    const bool regionOnly = (regionLengthOpt > 0);
    const char* const intro = regionOnly ? "The VCF region to be analysed contains " : "The VCF contains ";
    std::cerr << intro << VCFlineCount << " variants\n";
    
    // An explicit window size means the block size is not derived from the block number
    if (jkWindowSizeOpt != 0) return;
    std::cerr << "Going to use block size of " << JKblockSizeBasedOnNum
              << " variants to get " << jkNumOpt << " Jackknife blocks\n";
}

// Scans a one-line tree description and records the nesting level of every character and the
// location of every taxon name.
//
//   treeLine  - the tree text; modified in place: everything from a ':' up to (not including)
//               the next ',' or ')' is removed first
//   taxaToLoc - for each taxon name, receives the index of its first and of its last character
//               in the stripped treeLine
//   levels    - resized to treeLine.length(); levels[i] is the bracket depth at character i
//               ('(' counts at its new depth, ')' at the depth after closing)
//
// A taxon name ends at the next ',' or ')'. A name that is already present in taxaToLoc is
// reported through duplicateTreeValueError(), whichever of the two delimiters terminates it.
void assignTreeLevelsAndLinkToTaxa(string& treeLine, std::map<string,std::vector<int>>& taxaToLoc, std::vector<int>& levels) {
    // First take care of any branch lengths
    std::regex branchLengths(":.*?(?=,|\\))");
    treeLine = std::regex_replace(treeLine,branchLengths,"");
    //std::cerr << line << std::endl;

    // Now process the tree
    levels.assign(treeLine.length(),0); int currentLevel = 0;
    string currentTaxonName = "";
    int lastBegin = 0;
    const int treeLength = static_cast<int>(treeLine.length());
    for (int i = 0; i < treeLength; ++i) {
        const char c = treeLine[i];
        if (c == '(') {
            currentLevel++; levels[i] = currentLevel;
        } else if (c == ')' || c == ',') {
            if (c == ')') currentLevel--;
            levels[i] = currentLevel;
            if (currentTaxonName != "") {
                // The duplicate check has to run for both delimiters; checking only at ')'
                // lets a repeated name followed by ',' silently append a second location pair
                if (taxaToLoc.count(currentTaxonName) == 1) { duplicateTreeValueError(currentTaxonName); }
                taxaToLoc[currentTaxonName].push_back(lastBegin);
                taxaToLoc[currentTaxonName].push_back(i-1);
                currentTaxonName = "";
            }
        } else {
            // Any other character belongs to a taxon name; remember where the name starts
            if (currentTaxonName == "")
                lastBegin = i;
            levels[i] = currentLevel;
            currentTaxonName += c;
        }
    }
}

// Determines how many VCF lines the analysis covers.
// Precedence: a positive providedNumLinesOpt, then a positive regionLengthOpt, and otherwise
// the number of newline characters counted by reading the whole input opened via createReader().
int assignNumLinesToAnalyse(const int providedNumLinesOpt, const int regionLengthOpt,const string& vcfFileOpt) {
    if (providedNumLinesOpt > 0) {
        return providedNumLinesOpt;
    }
    if (regionLengthOpt > 0) {
        return regionLengthOpt;
    }
    
    // Neither was supplied: read through the file once to see how big it is.
    // NOTE(review): the reader is not released here; who owns the stream returned by
    // createReader() is not visible from this function - confirm.
    std::istream* reader = createReader(vcfFileOpt.c_str());
    // Whitespace skipping must be switched off, otherwise the iterator never yields '\n'
    reader->unsetf(std::ios_base::skipws);
    std::istream_iterator<char> firstChar(*reader);
    std::istream_iterator<char> endOfInput;
    return (int)std::count(firstChar, endOfInput, '\n');
}


================================================
FILE: Dsuite_common.h
================================================
//
//  Dsuite_common.h
//  DsuiteXcode
//
//  Created by Milan Malinsky on 21/07/2020.
//

#ifndef Dsuite_common_h
#define Dsuite_common_h

// Help text explaining how to pipe a VCF into Dsuite through standard input (requires the -l option)
#define stdInInfo   "Use 'stdin' for the VCF file when piping from another program into Dsuite via standard input\n" \
                    "in this case it is necessary to provide the number of lines in the filtered VCF via the -l option\n" \
                    "For example, to filter the VCF for overall minimum depth of at least 1000 across all samples:\n" \
                    "NUMLINES=$(bcftools view -i 'INFO/DP>1000' INPUT_FILE.vcf | wc -l)  # to get NUMLINES\n" \
                    "bcftools view -i 'INFO/DP>1000' INPUT_FILE.vcf | Dsuite Dtrios -l $NUMLINES stdin SETS.txt\n" \
                    "\n"

// Help text for the -r/--region option (presumably concatenated into the subcommands' usage messages)
#define regionOption    "       -r, --region=start,length               (optional) only process a subset of the VCF file; both \"start\" and \"length\" indicate variant numbers\n" \
                        "                                               e.g. --region=20001,10000 will process variants from 20001 to 30000\n"

// Help text for the -t/--tree option (presumably concatenated into the subcommands' usage messages)
#define treeOption      "       -t, --tree=TREE_FILE.nwk                (optional) a file with a tree in the newick format specifying the relationships between populations/species\n" \
                        "                                               D and f4-ratio values for trios arranged according to the tree will be output in a file with _tree.txt suffix\n"

// Help text for the -o/--out-prefix option (presumably concatenated into the subcommands' usage messages)
#define outOption       "       -o, --out-prefix=OUT_FILE_PREFIX        (optional) the prefix for the files where the results should be written\n" \
                        "                                               output will be put in OUT_FILE_PREFIX_BBAA.txt, OUT_FILE_PREFIX_Dmin.txt, OUT_FILE_PREFIX_tree.txt etc.\n" \
                        "                                               by default, the prefix is taken from the name of the SETS.txt file\n"

#include "Dsuite_utils.h"

// Fatal error: fewer than minPopulations sets are available. Prints the message on std::cerr
// and exits with EXIT_FAILURE.
inline void notEnoughPopulationsError(const int minPopulations) {
    std::cerr << "ERROR: You need at least " << minPopulations;
    std::cerr << " sets (populations/species) for this analysis." << std::endl;
    exit(EXIT_FAILURE);
}

// Fatal error: the SETS file named setsFileName does not specify the "Outgroup".
// Prints the message on std::cerr and exits with EXIT_FAILURE.
inline void outgroupNeededError(const string& setsFileName) {
    std::cerr << "ERROR: The file " << setsFileName;
    std::cerr << " needs to specify the \"Outgroup\"" << std::endl;
    exit(EXIT_FAILURE);
}

// Warns on std::cerr that the "Outgroup" given in the SETS file named setsFileName gets no
// special treatment in the Dquartets analysis. Non-fatal: execution continues.
inline void outgroupNotUsedInQuartetsWarning(const std::string& setsFileName) {
    std::cerr << "WARNING: You specified the \"Outgroup\" in " << setsFileName << ". This is needed in Dtrios, but will be ignored in Dquartets - the \"Outgroup\" will be treated as any other population. It must also be present in the tree if you are supplying one." << std::endl;
}

// Fatal error: line lineNum of the SETS file does not consist of two tab-separated columns.
// Prints the message on std::cerr and exits with EXIT_FAILURE.
inline void wrongNumberOfColumnsError(const string& setsFileName, int lineNum) {
    std::cerr << "ERROR: Please fix the format of the " << setsFileName << " file." << std::endl
              << "Line " << lineNum << " does not have two columns separated by a tab." << std::endl;
    exit(EXIT_FAILURE);
}

// Fatal error: line lineNum of the SETS file is empty.
// Prints the message on std::cerr and exits with EXIT_FAILURE.
inline void lineEmptyError(const string& setsFileName, int lineNum) {
    std::cerr << "ERROR: Please fix the format of the " << setsFileName << " file." << std::endl
              << "Line " << lineNum << " is empty." << std::endl;
    exit(EXIT_FAILURE);
}

// Sample-to-population assignments read from the SETS file, together with the positions of
// the samples among the VCF sample columns (filled in later by linkSetsAndVCFpositions).
class SetInformation {
public:
    
    // Reads the SETS file: one "SAMPLE_ID<tab>POPULATION_ID" pair per line.
    //   setsFileName        - path of the SETS file
    //   minPopulations      - minimum number of populations required by the analysis,
    //                         not counting the ones named "Outgroup" or "xxx"
    //   outgroupRequirement - compared against OutgroupRequired / OutgroupNotRequired
    // An empty line, a line without exactly two tab-separated columns, too few populations
    // or a missing required Outgroup end the program through the corresponding error helper.
    SetInformation(const string& setsFileName, const int minPopulations, const int outgroupRequirement) {
        
        // Automatic storage: the stream is closed when the constructor returns
        // (a heap-allocated stream that is never deleted would leak the file handle)
        std::ifstream setsFile(setsFileName.c_str());
        assertFileOpen(setsFile, setsFileName);
        
        string line; int l = 0; bool outgroupSpecified = false;
        while (getline(setsFile, line)) {
            line.erase(std::remove(line.begin(), line.end(), '\r'), line.end()); // Deal with any left over \r from files prepared on Windows
            
            l++; if (line == "") lineEmptyError(setsFileName,l);
            
            std::vector<string> ID_Pop = split(line, '\t');
            
            if (ID_Pop.size() != 2) wrongNumberOfColumnsError(setsFileName,l);
            if (ID_Pop[1] == "Outgroup") { outgroupSpecified = true; }
            
            popToIDsMap[ID_Pop[1]].push_back(ID_Pop[0]);
            IDsToPopMap[ID_Pop[0]] = ID_Pop[1];
        }
        
        // "Outgroup" and "xxx" stay in the maps but are left out of the list of populations
        for(std::map<string,std::vector<string>>::iterator it = popToIDsMap.begin(); it != popToIDsMap.end(); ++it) {
            if ((it->first) != "Outgroup" && it->first != "xxx") {
                populations.push_back(it->first);
            }
        } std::cout << "There are " << populations.size() << " sets (populations/species) excluding the Outgroup" << std::endl;
        
        if (populations.size() < minPopulations) notEnoughPopulationsError(minPopulations);
        
        // Provide error/warning messages depending on which analysis is run and the presence/absence of Outgroup in the SETS file
        if (outgroupRequirement == OutgroupNotRequired && outgroupSpecified) outgroupNotUsedInQuartetsWarning(setsFileName);
        if (outgroupRequirement == OutgroupRequired && !outgroupSpecified) outgroupNeededError(setsFileName);
    }
    
    
    
    std::vector<string> populations;                        // population names except "Outgroup" and "xxx", in the key order of popToIDsMap
    std::map<string, string> IDsToPopMap;                   // sample ID -> population
    std::map<string, std::vector<string>> popToIDsMap;      // population -> sample IDs, in file order
    std::map<string, std::vector<size_t>> popToPosMap;      // population -> result of locateSet() for its samples (set by linkSetsAndVCFpositions)
    std::map<size_t, string> posToPopMap;                   // index into the VCF sample names -> population (set by linkSetsAndVCFpositions)

    // Fills popToPosMap and posToPopMap from the sample names found in the VCF
    void linkSetsAndVCFpositions(const std::vector<std::string>& sampleNames);
    // Exits the program if any entry of `populations` is not a key of treeTaxonNamesToLoc
    void checkIfTreeNamesMatch(std::map<string,std::vector<int>>& treeTaxonNamesToLoc);
};


// NOTE(review): no definition of process_SETS_file is visible in Dsuite_common.cpp - possibly a leftover declaration; confirm.
void process_SETS_file(std::ifstream* setsFile, const string fName, std::map<string, std::vector<string>>& speciesToIDsMap, std::map<string, string>& IDsToSpeciesMap, int outgroupRequirement);
// Builds the tab-separated header line of the output tables; the flags add the P4, f4 and clustering columns
string makeHeader(bool quartet, bool includeFstats, bool includeKSstats);
// Composes the common prefix of the output file names from the prefix/SETS file name, run name and region
string prepareOutFileRootString(const string& providedPrefix, const string& runName, const string& setsFileName, const int regionStart, const int regionLength);
// Warns on std::cerr that the variant at (chr, pos) has no genotype likelihood/probability field
void printMissingLikelihoodsWarning(const string& chr, const string& pos);
// Prints the number of variants and, if jkWindowSizeOpt == 0, the jackknife block size on std::cerr
void printInitialMessageTriosQuartets(const int regionLengthOpt, const int VCFlineCount, const int JKblockSizeBasedOnNum, const int jkWindowSizeOpt, const int jkNumOpt);
// Reports a taxon name that occurs twice in the tree and exits with status 1
void duplicateTreeValueError(const string& duplicate);
// Strips branch lengths from treeLine, then records per-character bracket depth (levels) and the first/last index of each taxon name (taxaToLoc)
void assignTreeLevelsAndLinkToTaxa(string& treeLine, std::map<string,std::vector<int>>& taxaToLoc, std::vector<int>& levels);
// Returns providedNumLinesOpt if positive, else regionLengthOpt if positive, else the number of newlines in the VCF file
int assignNumLinesToAnalyse(const int providedNumLinesOpt, const int regionLengthOpt,const string& vcfFileOpt);

// Progress report on std::cout: the number of variants processed so far and the processor
// time, in seconds, measured by std::clock() since startTime.
inline void reportProgessVCF(const int variantsProcessed, const std::clock_t startTime) {
    const std::clock_t ticksUsed = std::clock() - startTime;
    const double secondsUsed = ticksUsed / (double) CLOCKS_PER_SEC;
    std::cout << "Processed " << variantsProcessed << " variants in ";
    std::cout << secondsUsed << "secs" << std::endl;
}

// Progress report on std::cerr: the number of variants processed so far, that number as a
// percentage of VCFlineCount, and the processor time, in seconds, measured by std::clock()
// since startTime.
inline void reportProgessVCF(const int variantsProcessed, const int VCFlineCount, const std::clock_t startTime) {
    const std::clock_t ticksUsed = std::clock() - startTime;
    const double secondsUsed = ticksUsed / (double) CLOCKS_PER_SEC;
    const double percentDone = ((double)variantsProcessed/VCFlineCount)*100;
    std::cerr << "Processed " << variantsProcessed << " variants (" << percentDone << "%) in ";
    std::cerr << secondsUsed << "secs" << std::endl;
}

#endif /* Dsuite_common_h */


================================================
FILE: Dsuite_fBranch.cpp
================================================
//
//  Dsuite_fBranch.cpp
//  DsuiteXcode
//
//  Created by Milan Malinsky on 11/11/2019.
//

#include "Dsuite_fBranch.h"
#define SUBPROGRAM "Fbranch"

#define DEBUG 0

// Help text of the Fbranch subcommand.
// NOTE(review): the description of -p/--pthresh ends mid-sentence ("... is less than"); what
// happens to scores on either side of the threshold is not visible here - complete the text
// once confirmed against the implementation.
static const char *BRANCHSCORE_USAGE_MESSAGE =
"Usage: " PROGRAM_BIN " " SUBPROGRAM " [OPTIONS] TREE_FILE.nwk FVALS_tree.txt\n"
"Implements the 'f-branch' type calculations developed by Hannes Svardal for Malinsky et al., 2018, Nat. Ecol. Evo.\n"
"Uses the f4-ratio (f_G) values produced by Dsuite Dtrios (or DtriosCombine) with the --tree option; this is the output of Dtrios with the \"_tree.txt\" suffix\n"
"To use  Fbranch, the tree in TREE_FILE.nwk must be rooted with the Outgroup.\n"
"Output to stdout\n"
"\n"
"       -p, --pthresh                           (default=0.01) fb scores whose associated p-value is less than \n"
"       -Z, --Zb-matrix                         (optional)  output the equivalent of fb-statistic, but with Z-scores to assess statistical significance\n"
"                                               this will be printed below the f-branch matrix\n"
"       -P, --Pb-matrix                         (optional)  output the equivalent of fb-statistic, but with p-values to assess statistical significance\n"
"                                               this will be printed below the f-branch matrix\n"
"       -h, --help                              display this help and exit\n"
"\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";

//enum { OPT_F_JK };

// Short options accepted by getopt_long. Every short flag advertised in the usage message and handled
// in parseFbranchOptions has to be listed here, otherwise getopt_long reports it as unknown ('?').
// 'p' is followed by a colon because it takes an argument.
static const char* shortopts = "hp:ZP";

//static const int JK_WINDOW = 5000;

// Long-option table handed to getopt_long. Each row gives the long name, whether the option takes
// an argument, and the character getopt_long returns for it (the flag pointer is unused).
// The all-zero row terminates the table.
static const struct option longopts[] = {
    { "Zb-matrix", no_argument,       nullptr, 'Z' },
    { "Pb-matrix", no_argument,       nullptr, 'P' },
    { "pthresh",   required_argument, nullptr, 'p' },
    { "help",      no_argument,       nullptr, 'h' },
    { nullptr,     0,                 nullptr, 0   }
};

// Settings of the Fbranch subprogram. parseFbranchOptions fills them in from the command line;
// the initial values below are the defaults used when an option is not given.
namespace opt
{
    // Positional arguments: the tree file first, then the file with the f4-ratio values
    static string treeFile;
    static string DvalsFile;

    // -p/--pthresh: p-value threshold applied to the f4-ratio values
    static double pthresh{0.01};

    // -Z/--Zb-matrix and -P/--Pb-matrix: print the additional Z-score / p-value matrices
    static bool printZb{false};
    static bool printPb{false};
}


int fBranchMain(int argc, char** argv) {
    parseFbranchOptions(argc, argv);
    std::istream* treeFile = new std::ifstream(opt::treeFile.c_str());
    if (!treeFile->good()) { std::cerr << "The file " << opt::treeFile << " could not be opened. Exiting..." << std::endl; exit(EXIT_FAILURE);}
    std::istream* DvalsFile = new std::ifstream(opt::DvalsFile.c_str());
    if (!DvalsFile->good()) { std::cerr << "The file " << opt::DvalsFile << " could not be opened. Exiting..." << std::endl; exit(EXIT_FAILURE);}
    if (opt::DvalsFile.substr(opt::DvalsFile.size()-9) != "_tree.txt") { std::cerr << "The name of the input file with the f4-ratio values should end in \"_tree.txt\".\nPlease make sure you run Dtrios with the --tree option and then feed the correct file into Fbranch. Exiting..." << std::endl; exit(EXIT_FAILURE); }
    std::map<string,std::vector<std::vector<string>>> acToBmap;
    string line; int l = 0;
    getline(*DvalsFile, line); // get the header
    std::vector<string> headerVec = split(line, '\t');
    int indexFg = -1; int indexZ = -1;
    if (headerVec[4] == "Z-score") { indexZ = 4; }
    if (headerVec[5] == F4HEADER || headerVec[5] == "f_G") { indexFg = 5; } else if (headerVec[6] == F4HEADER || headerVec[6] == "f_G") { indexFg = 6; }
    while (getline(*DvalsFile, line)) {
        line.erase(std::remove(line.begin(), line.end(), '\r'), line.end()); // Deal with any left over \r from files prepared on Windows
        l++; if (line == "") { std::cerr << "Please fix the format of the " << opt::DvalsFile << " file.\nLine " << l << " is empty. Exiting..." << std::endl; exit(EXIT_FAILURE); }
        std::vector<string> speciesAndVals = split(line, '\t');
        if (speciesAndVals.size() < 6 || indexFg == -1) {
            std::cerr << "Please fix the format of the " << opt::DvalsFile << " file." << std::endl;
            std::cerr << "Looks like the file does not contain f4-ratio statistics. Exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }
        double f4ratio = stringToDouble(speciesAndVals[indexFg]); double Zscore = stringToDouble(speciesAndVals[indexZ]);
        double pval = 2 * (1 - normalCDF(Zscore));
        std::vector<string> bAndValLine;  bAndValLine.push_back(speciesAndVals[1]);
        if (pval < opt::pthresh) bAndValLine.push_back(speciesAndVals[indexFg]); else bAndValLine.push_back("0"); // Set non-significant f4-ratio statistics to 0
        if (indexZ != -1) bAndValLine.push_back(speciesAndVals[indexZ]);
        std::vector<string> aAndValLine;  aAndValLine.push_back(speciesAndVals[0]); aAndValLine.push_back("0");
        if (indexZ != -1) aAndValLine.push_back("0");
        acToBmap[speciesAndVals[0]+","+speciesAndVals[2]].push_back(bAndValLine);
        acToBmap[speciesAndVals[1]+","+speciesAndVals[2]].push_back(aAndValLine);
    }
    string treeString; getline(*treeFile, treeString);
    Tree* testTree = new Tree(treeString);
    testTree->updateProgenyIds();
    testTree->fillSisterBranches();
    for (std::vector<Branch*>::iterator b = testTree->branches.begin(); b != testTree->branches.end(); b++) {
        if ((*b)->parentId != "treeOrigin") {
            std::vector<string> Bs = (*b)->progenyIds;
            std::vector<string> As = (*b)->sisterBranch->progenyIds;
            //if((*b)->id == "b5") { print_vector(Bs, std::cout); }
            std::vector<double> Bmins; std::vector<double> vals;
            std::vector<double> ZBmins; std::vector<double> Zvals;
            for (std::vector<string>::iterator C = testTree->allSpecies.begin(); C != testTree->allSpecies.end(); C++) {
                for (std::vector<string>::iterator A = As.begin(); A != As.end(); A++) {
                    std::vector<std::vector<string>> bAndVal; std::vector<std::vector<string>> aAndVal;
                    try { bAndVal = acToBmap.at(*A+","+*C); } catch (const std::out_of_range& oor) {}
                    for (int i = 0; i < bAndVal.size(); i++) {
                        if (std::count(Bs.begin(), Bs.end(), bAndVal[i][0])) {
                            vals.push_back(stringToDouble(bAndVal[i][1]));
                            if (indexZ != -1) {
                                Zvals.push_back(stringToDouble(bAndVal[i][2]));
                               // std::cerr << "bAndVal[i]: "; print_vector(bAndVal[i],std::cerr);
                            }
                        }
                        //if((*b)->id == "b5") { std::cout << *A << "\t" << bAndVal[i][0] << "\t" << bAndVal[i][1] << "\tbAndVal.size():\t" << bAndVal.size() << "\ti:\t" << i << std::endl;
                            //
                        //}
                    }
                    if (!vals.empty()) { Bmins.push_back(*std::min_element(vals.begin(),vals.end())); vals.clear(); }
                    if (!Zvals.empty()) { ZBmins.push_back(*std::min_element(Zvals.begin(),Zvals.end())); Zvals.clear(); }
                    //
                }
                double fbC = NAN; double ZfbC = NAN;
                if (!Bmins.empty()) { fbC = median(Bmins.begin(),Bmins.end()); Bmins.clear(); }
                if (!ZBmins.empty()) { ZfbC = median(ZBmins.begin(),ZBmins.end()); ZBmins.clear(); }
                /* else { // There is no positive value; just find if any value is possible for this ABC combination
                    bool ACpossible = false;
                    for (std::vector<string>::iterator B = Bs.begin(); B != Bs.end(); B++) {
                        std::vector<std::vector<string>> bAndVal; std::vector<std::vector<string>> aAndVal;
                        try { bAndVal = acToBmap.at(*B+","+*C); } catch (const std::out_of_range& oor) {}
                        for (int i = 0; i < bAndVal.size(); i++) {
                            if (std::count(As.begin(), As.end(), bAndVal[i][0])) {
                                ACpossible = true; break;
                            }
                        }
                    }
                    if (ACpossible) fbC = 0;
                } */
                (*b)->fbCvals.push_back(fbC);
                (*b)->ZfbCvals.push_back(ZfbC);
                (*b)->PfbCvals.push_back(2 * (1 - normalCDF(ZfbC)));
               // std::cerr << "Here: (*b)->progenyIds: "; print_vector((*b)->progenyIds,std::cerr);
               // std::cerr << "Here: (*b)->ZfbCvals.size() " << (*b)->ZfbCvals.size() << std::endl;
               // std::cerr << "Here: (*b)->ZfbCvals: "; print_vector((*b)->ZfbCvals,std::cerr);
            }
        }
    }
    
    // Generate output
    std::cout << "branch\tbranch_descendants\t"; print_vector(testTree->allSpecies, std::cout);
    for (std::vector<Branch*>::iterator b = testTree->branches.begin(); b != testTree->branches.end(); b++) {
        if ((*b)->parentId != "treeOrigin") {
            std::cout << (*b)->id << "\t"; print_vector((*b)->progenyIds, std::cout, ',', false);
            std::cout << "\t"; print_vector((*b)->fbCvals, std::cout);
            //std::cout << "Sister branch:\t" <<  (*b)->sisterBranch->id << std::endl;
            //std::cout << "This branch progeny:\t"; print_vector((*b)->progenyIds, std::cout);
            //std::cout << "Sister branch progeny:\t"; print_vector((*b)->sisterBranch->progenyIds, std::cout);
            //std::cout << "fbCs:\t"; print_vector((*b)->fbCvals, std::cout);
            //std::cout << std::endl;
        }
    }
    if (indexZ != -1 && opt::printZb) {
        std::cout << "\n";
        std::cout << "# Z-scores:\n";
        std::cout << "branch\tbranch_descendants\t"; print_vector(testTree->allSpecies, std::cout);
        for (std::vector<Branch*>::iterator b = testTree->branches.begin(); b != testTree->branches.end(); b++) {
            if ((*b)->parentId != "treeOrigin") {
                std::cout << (*b)->id << "\t"; print_vector((*b)->progenyIds, std::cout, ',', false);
                std::cout << "\t"; print_vector((*b)->ZfbCvals, std::cout);
            }
        }
    }
    if (indexZ != -1 && opt::printPb) {
        std::cout << "\n";
        std::cout << "# p-values:\n";
        std::cout << "branch\tbranch_descendants\t"; print_vector(testTree->allSpecies, std::cout);
        for (std::vector<Branch*>::iterator b = testTree->branches.begin(); b != testTree->branches.end(); b++) {
            if ((*b)->parentId != "treeOrigin") {
                std::cout << (*b)->id << "\t"; print_vector((*b)->progenyIds, std::cout, ',', false);
                std::cout << "\t"; print_vector((*b)->PfbCvals, std::cout);
            }
        }
    }
    return 0;
    
}

// Parse the command line of the Fbranch subprogram into the opt:: settings.
// Exactly two positional arguments are required after the options: the tree file and the file with
// the f4-ratio values. On -h/--help the usage message is printed and the program exits successfully;
// on any parsing problem the usage message is printed and the program exits with EXIT_FAILURE.
void parseFbranchOptions(int argc, char** argv) {
    bool die = false;
    // getopt_long returns an int. Keeping it in an int matters: on platforms where plain char is
    // unsigned, a char could never compare equal to -1 and the loop would not terminate.
    for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
    {
        std::istringstream arg(optarg != NULL ? optarg : "");
        switch (c)
        {
            case '?': die = true; break;
            case 'p':
                if (!(arg >> opt::pthresh)) {
                    std::cerr << "The -p/--pthresh option requires a numeric argument\n";
                    die = true;
                }
                break;
            case 'Z': opt::printZb = true; break;
            case 'P': opt::printPb = true; break;
            case 'h':
                std::cout << BRANCHSCORE_USAGE_MESSAGE;
                exit(EXIT_SUCCESS);
        }
    }

    if (argc - optind < 2) {
        std::cerr << "missing arguments\n";
        die = true;
    }
    else if (argc - optind > 2)
    {
        std::cerr << "too many arguments\n";
        die = true;
    }

    if (die) {
        std::cout << "\n" << BRANCHSCORE_USAGE_MESSAGE;
        exit(EXIT_FAILURE);
    }

    // Parse the input filenames
    opt::treeFile = argv[optind++];
    opt::DvalsFile = argv[optind++];
}


void Tree::updateProgenyIds() {
    // Determine the progeny of each branch (needed to know whether conditions are met, and for fossil constraints).
    // First of all, set progeniesComplete to 2 for all extinct and present branches.
    for (std::vector<Branch*>::iterator b = branches.begin(); b != branches.end(); b++) {
        if ((*b)->daughterId1 == "none") {
            (*b)->progeniesComplete = 2;
            (*b)->progenyIds.push_back((*b)->terminalSpeciesId);
        }
        // Set progenyPassedOn to true for the two root branches.
        if ((*b)->parentId == "treeOrigin") (*b)->progenyPassedOn = true;
    }
    bool allProgeniesComplete = false;
    while(!allProgeniesComplete) {
        std::vector<Branch*> newlyCompleted;
        for (std::vector<Branch*>::iterator b = branches.begin(); b != branches.end(); b++) {
            // Determine if the progeny of this branch is clear but has not been passed on to the parent yet.
            if ((*b)->progeniesComplete == 2 && (*b)->progenyPassedOn == false) {
                newlyCompleted.push_back(*b);
            }
        }
        if (newlyCompleted.size() == 0) allProgeniesComplete = true;
        for (std::vector<Branch*>::iterator b = newlyCompleted.begin(); b != newlyCompleted.end(); b++) {
            // Find parent, pass progeny+self on to parents progeny, add parent.progeniesComplete += 1, and change own progenyPassedOn to true.
            for (std::vector<Branch*>::iterator bb = branches.begin(); bb != branches.end(); bb++) {
                if ((*bb)->id == (*b)->parentId) {
                    (*b)->parentBranch = *bb;
                    (*bb)->progenyIds.insert((*bb)->progenyIds.end(), (*b)->progenyIds.begin(), (*b)->progenyIds.end() );
                    (*bb)->progeniesComplete++;
                    (*b)->progenyPassedOn = true;
                    break;
                }
            }
        }
    }
}

void Tree::fillSisterBranches() {
    for (std::vector<Branch*>::iterator b = branches.begin(); b != branches.end(); b++) {
        if ((*b)->parentId != "treeOrigin") {
            string sisterId;
            if ((*b)->parentBranch->daughterId1 != (*b)->id)
                sisterId = (*b)->parentBranch->daughterId1;
            else
                sisterId = (*b)->parentBranch->daughterId2;
            for (std::vector<Branch*>::iterator bb = branches.begin(); bb != branches.end(); bb++) {
                if ((*bb)->id == sisterId) {
                    (*b)->sisterBranch = *bb;
                    break;
                }
            }
        }
    }
}


================================================
FILE: Dsuite_fBranch.h
================================================
//
//  Dsuite_fBranch.h
//  DsuiteXcode
//
//  Created by Milan Malinsky on 11/11/2019.
//

#ifndef Dsuite_fBranch_h
#define Dsuite_fBranch_h

#include <stdio.h>
#include "Dsuite_utils.h"
#include "Dsuite_common.h"


int fBranchMain(int argc, char** argv);
void parseFbranchOptions(int argc, char** argv);


// One branch of the tree, identified by string ids.
// A branch is either terminal (it carries a species id and both daughter ids are "none") or internal
// (it carries two daughter ids and an empty species id).
class Branch {
public:
    // inId / inParentId: id of this branch and of its parent.
    // inTerminalSpeciesId: the species at the tip, or "unknown" for an internal branch.
    // inDaughterId1 / inDaughterId2: ids of the daughters; only used for an internal branch.
    Branch(std::string inId, std::string inParentId, std::string inDaughterId1, std::string inDaughterId2, std::string inTerminalSpeciesId) {
        id = inId;
        parentId = inParentId;
        if (inTerminalSpeciesId == "unknown") {
            daughterId1 = inDaughterId1;
            daughterId2 = inDaughterId2;
            terminalSpeciesId = "";
        } else {
            terminalSpeciesId = inTerminalSpeciesId;
            daughterId1 = "none";
            daughterId2 = "none";
        }
    }


    std::string id;
    std::string parentId;
    std::string daughterId1;
    std::string daughterId2;
    std::vector<std::string> progenyIds;   // terminal species below this branch
    std::string terminalSpeciesId;

    // Links to other branches. They start out null so that a branch for which they are never
    // assigned does not carry an indeterminate pointer.
    Branch* parentBranch = nullptr;
    Branch* sisterBranch = nullptr;

    // Per-species results, one entry per species
    std::vector<double> fbCvals;
    std::vector<double> ZfbCvals;
    std::vector<double> PfbCvals;

    // Bookkeeping for the progeny propagation
    int progeniesComplete = 0;      // reaches 2 when the progeny list is complete
    bool progenyPassedOn = false;   // true once the progeny list was handed to the parent
};


// A rooted, strictly bifurcating tree built from a Newick string.
class Tree {
public:
    // Parse treeString and fill `branches` and `allSpecies`.
    // Branch lengths are discarded. The string is then reduced by repeatedly replacing the leftmost
    // "(X,Y)" pair of sisters with a generated internal node label, recording both parent-child
    // relations; the last pair collapsed holds the two branches attached to the root. Branch ids
    // b0, b1, ... are assigned from the root towards the tips.
    // A tree that cannot be reduced this way (e.g. it contains a polytomy), or that contains no pair
    // of sisters at all, is reported on stderr and ends the program.
    Tree(string treeString) {
        // First take care of any branch lengths
        std::regex branchLengths(":.*?(?=,|\\))");
        string workingTreeCopy = std::regex_replace(treeString,branchLengths,"");

        // tmpBranchStartNodeId[k] is the parent node of the node tmpBranchEndNodeId[k]
        std::vector<string> tmpBranchEndNodeId;
        std::vector<string> tmpBranchStartNodeId;
        int numberOfInternalNodes = 0;
        std::regex sistersRegEx("\\(([a-zA-Z0-9.[:s:]_-]+),([a-zA-Z0-9.[:s:]_-]+)\\)");
        std::regex comma(",");
        std::smatch match;
        while (std::regex_search(workingTreeCopy,match,sistersRegEx)) {
            assert(match.size() == 3);
            string nodeId = "internalNode"+numToString(numberOfInternalNodes)+"X";
            tmpBranchStartNodeId.push_back(nodeId);
            tmpBranchStartNodeId.push_back(nodeId);
            if (std::count(tmpBranchEndNodeId.begin(),tmpBranchEndNodeId.end(),match[1])) duplicateTreeValueError(match[1]);
            else tmpBranchEndNodeId.push_back(match[1]);
            if (std::count(tmpBranchEndNodeId.begin(),tmpBranchEndNodeId.end(),match[2])) duplicateTreeValueError(match[2]);
            else tmpBranchEndNodeId.push_back(match[2]);

            // Replace the matched pair with the new node label. The match refers into
            // workingTreeCopy, so the new string is assembled completely before it is assigned.
            string collapsedTree = match.prefix().str() + nodeId + match.suffix().str();
            workingTreeCopy = collapsedTree;
            numberOfInternalNodes++;
        }
        if (std::regex_search(workingTreeCopy,comma)) {
            std::cerr << "ERROR: The tree string could not be parsed correctly! The remaining unparsed tree string is:"  << std::endl;
            std::cerr << workingTreeCopy << std::endl;
            exit(1);
        }
        // Without at least one pair of sisters there are no root branches to start from
        if (tmpBranchEndNodeId.size() < 2) {
            std::cerr << "ERROR: The tree string could not be parsed correctly! No pair of sister taxa was found in:"  << std::endl;
            std::cerr << treeString << std::endl;
            exit(1);
        }

        // Prepare arrays for temporary branch format.
        std::vector<string> tmp2BranchID;
        std::vector<string> tmp2BranchParentId;
        std::vector<string> tmp2BranchDaughterId1;
        std::vector<string> tmp2BranchDaughterId2;
        std::vector<string> tmp2BranchEndNodeId;

        // Prepare the first two branches in temporary format: the last and the second-to-last entry of
        // tmpBranchEndNodeId are the two oldest branches. A branch ending in an internal node still has
        // to receive its daughters ("unborn"); a branch ending in a species has none ("none").
        std::regex internalNodeRegEx("internalNode[0-9]+X");
        for (size_t fromEnd = 1; fromEnd <= 2; fromEnd++) {
            const string& endNode = tmpBranchEndNodeId[tmpBranchEndNodeId.size()-fromEnd];
            const char* daughterState = std::regex_match(endNode,internalNodeRegEx) ? "unborn" : "none";
            tmp2BranchID.push_back(fromEnd == 1 ? "b0" : "b1");
            tmp2BranchParentId.push_back("treeOrigin");
            tmp2BranchEndNodeId.push_back(endNode);
            tmp2BranchDaughterId1.push_back(daughterState); tmp2BranchDaughterId2.push_back(daughterState);
        }

        // Keep adding daughter branches until a complete pass over the branches adds nothing new.
        int branchIdCounter = 2;
        bool treeComplete = false;
        while (!treeComplete) {
            bool change = false;
            // The vectors grow inside the loop, so the size is re-read on every iteration
            for (size_t i = 0; i < tmp2BranchID.size(); i++) {
                // if a branch terminated with a speciation event in the past, then add the two daughter branches
                if (tmp2BranchDaughterId1[i] == "unborn" && tmp2BranchDaughterId2[i] == "unborn") {
                    // Find the two branches that have the same start node as this branch's end node.
                    for (size_t j = 0; j < tmpBranchStartNodeId.size(); j++) {
                        if (tmpBranchStartNodeId[j] == tmp2BranchEndNodeId[i]) {
                            const string newBranchId = "b"+numToString(branchIdCounter);
                            tmp2BranchID.push_back(newBranchId);
                            tmp2BranchParentId.push_back(tmp2BranchID[i]);
                            tmp2BranchEndNodeId.push_back(tmpBranchEndNodeId[j]);
                            if (std::regex_match(tmpBranchEndNodeId[j],internalNodeRegEx)) {
                                tmp2BranchDaughterId1.push_back("unborn");
                                tmp2BranchDaughterId2.push_back("unborn");
                            } else {
                                tmp2BranchDaughterId1.push_back("none");
                                tmp2BranchDaughterId2.push_back("none");
                            }
                            // Update daughter ids of temporary parent: the first daughter found fills
                            // slot 1, the second one slot 2.
                            if (tmp2BranchDaughterId1[i] == "unborn") {
                                tmp2BranchDaughterId1[i] = newBranchId;
                            } else {
                                tmp2BranchDaughterId2[i] = newBranchId;
                            }
                            // Increase the branchIdCounter
                            branchIdCounter += 1;
                            change = true;
                        }
                    }
                }
            }
            if (change == false) treeComplete = true;
        }

        // Create the Branch objects and, at the same time, collect the species of the terminal branches.
        for (size_t i = 0; i < tmp2BranchID.size(); i++) {
            string speciesId;
            if (std::regex_match(tmp2BranchEndNodeId[i], internalNodeRegEx)) {
                speciesId = "unknown";
            } else {
                speciesId = tmp2BranchEndNodeId[i];
                allSpecies.push_back(speciesId);
            }
            branches.push_back(new Branch(tmp2BranchID[i], tmp2BranchParentId[i], tmp2BranchDaughterId1[i], tmp2BranchDaughterId2[i], speciesId));

        }
    }

    std::vector<string> allSpecies;    // species of all terminal branches, in branch order
    std::vector<Branch*> branches;     // allocated with new in the constructor; this class does not free them
    void updateProgenyIds();
    void fillSisterBranches();

};

 


#endif /* Dsuite_fBranch_h */


================================================
FILE: Dsuite_utils.cpp
================================================
//
//  Dsuite_utils.cpp
//  Dsuite
//
//  Created by Milan Malinsky on 02/04/2019.
//

#include "Dsuite_utils.h"

// Cumulative distribution function of the standard normal distribution, Phi(x) aka N(x),
// evaluated through the complementary error function: Phi(x) = erfc(-x / sqrt(2)) / 2.
long double normalCDF(double x)
{
    const double scaledArgument = -x/std::sqrt(2);
    const long double complementaryError = erfcl(scaledArgument);
    return complementaryError/2;
}

// Per-variant f_d denominator term. The larger of p2 and p3 (p3 when they are equal) is called pD
// here and takes the place of both the second and the third frequency:
//   (1-p1)*pD*pD*(1-pO) - p1*(1-pD)*pD*(1-pO)
double Fd_Denom_perVariant(double p1, double p2, double p3, double pO) {
    const double pD = (p2 > p3) ? p2 : p3;
    return ((1-p1)*pD*pD*(1-pO)) - (p1*(1-pD)*pD*(1-pO));
}

// Per-variant f_G denominator term:
//   (1-p1)*p3a*p3b*(1-pO) - p1*(1-p3a)*p3b*(1-pO)
double fG_Denom_perVariant(double p1, double p3a, double p3b, double pO) {
    const double firstTerm = (1-p1)*p3a*p3b*(1-pO);
    const double secondTerm = p1*(1-p3a)*p3b*(1-pO);
    return firstTerm - secondTerm;
}

// Per-variant f4 term, as per Patterson et al. (2012): (p2 - p1) * (p3 - p4)
double f4_perVariant(double p1, double p2, double p3, double p4) {
    return (p2-p1)*(p3-p4);
}

// Per-variant f_dM denominator term. Which formula applies depends on the order of p1 and p2:
//   p1 <= p2: pD is the larger of p2 and p3 (p3 when equal) and the term is
//             (1-p1)*pD*pD*(1-pO) - p1*(1-pD)*pD*(1-pO)
//   p1 >  p2: pD is the larger of p1 and p3 (p3 when equal) and the term is
//             -( (1-pD)*p2*pD*(1-pO) - pD*(1-p2)*pD*(1-pO) )
double FdM_Denom_perVariant(double p1, double p2, double p3, double pO) {
    if (p1 <= p2) {
        const double pD = (p2 > p3) ? p2 : p3;
        return ((1-p1) * pD * pD * (1-pO)) - (p1 * (1-pD) * pD * (1-pO));
    }
    const double pD = (p1 > p3) ? p1 : p3;
    return -(((1-pD)*p2*pD*(1-pO)) - (pD*(1-p2)*pD*(1-pO)));
}



// Works only on biallelic markers
void GeneralSetCounts::getSetVariantCounts(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {
    
    getBasicCounts(genotypes, posToSpeciesMap);
    
    // If at least one of the outgroup individuals has non-missing data
    // Find out what is the "ancestral allele" - i.e. the one more common in the outgroup
    try {
        if (setAlleleCounts.at("Outgroup") > 0) {
            if ((double)setAltCounts.at("Outgroup")/setAlleleCounts.at("Outgroup") < 0.5) { AAint = AncestralAlleleRef; }
            else { AAint = AncestralAlleleAlt; }
        }
    } catch (std::out_of_range& e) { AAint = AncestralAlleleMissing; }
    
    // Now fill in the allele frequencies
    double totalAAF = 0; double totalDAF = 0; int numNonZeroCounts = 0;
    for(std::map<string,int>::iterator it = setAltCounts.begin(); it != setAltCounts.end(); ++it) {
        if (setAlleleCounts.at(it->first) > 0) {
            numNonZeroCounts++;
            double thisAAF = (double)setAltCounts.at(it->first)/setAlleleCounts.at(it->first);
            setAAFs[it->first] = thisAAF; totalAAF += thisAAF;
            if (AAint == 0) { // Ancestral allele seems to be the ref, so derived is alt
                setDAFs[it->first] = thisAAF; totalDAF += thisAAF;
            } else if (AAint == 1) { // Ancestral allele seems to be alt, so derived is ref
                setDAFs[it->first] = (1 - thisAAF); totalDAF += (1 - thisAAF);
            }
        }
    }
    averageAAF = totalAAF/numNonZeroCounts; averageDAF = totalDAF/numNonZeroCounts;
}

// Return the index of `tag` within `format`, or std::numeric_limits<int>::min() when the tag is
// not present.
int GeneralSetCounts::returnFormatTagPosition(std::vector<std::string>& format, const std::string& tag) {
    const std::vector<std::string>::iterator hit = std::find(format.begin(), format.end(), tag);
    if (hit == format.end()) return std::numeric_limits<int>::min();
    return (int)std::distance(format.begin(), hit);
}


int GeneralSetCounts::checkForGenotypeLikelihoodsOrProbabilities(const std::vector<std::string>& vcfLineFields) {
    std::vector<std::string> format = split(vcfLineFields[8], ':');
    if (format.size() == 1) return LikelihoodsProbabilitiesAbsent; // The GT tag must be present in the first place
    
    int likelihoodsOrProbabilitiesTagPosition = returnFormatTagPosition(format, "GP");
    if (likelihoodsOrProbabilitiesTagPosition != std::numeric_limits<int>::min()) { likelihoodsProbabilitiesType = LikelihoodsProbabilitiesGP; }
    else {
        likelihoodsOrProbabilitiesTagPosition = returnFormatTagPosition(format, "GL");
        if (likelihoodsOrProbabilitiesTagPosition != std::numeric_limits<int>::min()) { likelihoodsProbabilitiesType = LikelihoodsProbabilitiesGL; }
        else {
            likelihoodsOrProbabilitiesTagPosition = returnFormatTagPosition(format, "PL");
            if (likelihoodsOrProbabilitiesTagPosition != std::numeric_limits<int>::min()) { likelihoodsProbabilitiesType = LikelihoodsProbabilitiesPL; }
        }
    }
    return likelihoodsOrProbabilitiesTagPosition;
}

// Expected genotype value computed from three probabilities: entry 1 counts once and entry 2
// counts twice. The vector must hold at least three entries.
double getExpectedGenotype(const std::vector<double>& thisProbabilities) {
    const double weightOne = thisProbabilities[1];
    const double weightTwo = thisProbabilities[2];
    return weightOne + 2*weightTwo;
}

// Convert the first three entries of thisLikelihoods in place from a phred scale to a linear
// scale: value -> 10^(-value/10). The vector must hold at least three entries.
void transformFromPhred(std::vector<double>& thisLikelihoods) {
    for (std::vector<double>::size_type genotype = 0; genotype < 3; genotype++) {
        const double phredValue = thisLikelihoods[genotype];
        thisLikelihoods[genotype] = pow(10,-(phredValue/10.0));
    }
}

// Convert the first three entries of thisLikelihoods in place from VCF "GL" values to linear-scale
// likelihoods. GL values are log10-scaled likelihoods, so the likelihood is 10^GL. No division
// by 10 is involved; that factor belongs to the phred-scaled "PL" values handled by
// transformFromPhred. The vector must hold at least three entries.
void transformFromGL(std::vector<double>& thisLikelihoods) {
    for (std::vector<double>::size_type genotype = 0; genotype < 3; genotype++) {
        thisLikelihoods[genotype] = pow(10,thisLikelihoods[genotype]);
    }
}

// Turn three likelihoods into three probabilities for one sample of `species`: each likelihood
// is weighted by the matching entry of setHWEpriorsFromAAFfromGT[species], and the weighted
// values are divided by their sum.
std::vector<double> GeneralSetCounts::probabilitiesFromLikelihoods(const std::vector<double>& thisLikelihoods, const string& species) {
    const auto& priors = setHWEpriorsFromAAFfromGT[species];

    const double weighted0 = thisLikelihoods[0]*priors[0];
    const double weighted1 = thisLikelihoods[1]*priors[1];
    const double weighted2 = thisLikelihoods[2]*priors[2];
    const double total = weighted0 + weighted1 + weighted2;

    std::vector<double> posterior(3, 0.0);
    posterior[0] = weighted0/total;
    posterior[1] = weighted1/total;
    posterior[2] = weighted2/total;
    return posterior;
}
 
// Fill the genotype priors of every set from its allele frequency, assuming Hardy-Weinberg
// proportions: for an allele frequency AF the three genotypes (0, 1 or 2 copies of the allele)
// have the frequencies (1-AF)^2, 2*AF*(1-AF) and AF^2. The factor 2 of the middle class accounts
// for the two ways of carrying exactly one copy.
// Sets whose stored frequency is negative use the average frequency across sets instead.
void GeneralSetCounts::setHWEpriorsFromAFfromGT() {
    double AF;
    // Alternative allele frequencies
    for(std::map<string,double>::iterator it = setAAFs.begin(); it != setAAFs.end(); ++it) {
        if (it->second >= 0) AF = it->second; else AF = averageAAF; // This should be average of AFs across populations where it is known
        setHWEpriorsFromAAFfromGT[it->first][0] = (1-AF)*(1-AF);
        setHWEpriorsFromAAFfromGT[it->first][1] = 2*AF*(1-AF);
        setHWEpriorsFromAAFfromGT[it->first][2] = AF*AF;
    }
    // Derived allele frequencies
    for(std::map<string,double>::iterator it = setDAFs.begin(); it != setDAFs.end(); ++it) {
        if (it->second >= 0) AF = it->second; else AF = averageDAF; // This should be average of AFs across populations
        setHWEpriorsFromDAFfromGT[it->first][0] = (1-AF)*(1-AF);
        setHWEpriorsFromDAFfromGT[it->first][1] = 2*AF*(1-AF);
        setHWEpriorsFromDAFfromGT[it->first][2] = AF*AF;
    }
}




void GeneralSetCounts::getAFsFromGenotypeLikelihoodsOrProbabilities(const std::vector<std::string>& genotypeFields, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition) {
    if (likelihoodsProbabilitiesType == LikelihoodsProbabilitiesPL || likelihoodsProbabilitiesType == LikelihoodsProbabilitiesGL) {
        setHWEpriorsFromAFfromGT();
    }
    
    for (std::vector<std::string>::size_type i = 0; i < genotypeFields.size(); i++) {
        std::string species; try { species = posToSpeciesMap.at(i); } catch (const std::out_of_range& oor) {
            continue;
        }
       // std::cerr << genotypeFields[i] << std::endl;
        std::string thisLikelihoodsOrProbabilitiesString = split(genotypeFields[i], ':')[likelihoodsOrProbabilitiesTagPosition];
        if (thisLikelihoodsOrProbabilitiesString == ".") continue;
        
        else {
            setAlleleProbCounts.at(species) += 2;
            std::vector<double> thisLikelihoodsOrProbabilities = splitToDouble(thisLikelihoodsOrProbabilitiesString,',');
            std::vector<double> thisProbabilities;
            switch (likelihoodsProbabilitiesType)
            {
                case LikelihoodsProbabilitiesPL:
                    transformFromPhred(thisLikelihoodsOrProbabilities);
                   // print_vector(thisLikelihoodsOrProbabilities, std::cerr);
                    thisProbabilities = probabilitiesFromLikelihoods(thisLikelihoodsOrProbabilities,species);
                    break;
                case LikelihoodsProbabilitiesGL: transformFromGL(thisLikelihoodsOrProbabilities);
                    thisProbabilities = probabilitiesFromLikelihoods(thisLikelihoodsOrProbabilities,species);
                    break;
                case LikelihoodsProbabilitiesGP:
                    thisProbabilities = thisLikelihoodsOrProbabilities;
                    break;
            }
            if (setAAFsFromLikelihoods.at(species) == -1) setAAFsFromLikelihoods.at(species) = 0;
            setAAFsFromLikelihoods.at(species) += getExpectedGenotype(thisProbabilities);
        }
    }
    
    for(std::map<string,double>::iterator it = setAAFsFromLikelihoods.begin(); it != setAAFsFromLikelihoods.end(); ++it) {
        if (setAAFsFromLikelihoods.at(it->first) != -1) {
            double AF = it->second/setAlleleProbCounts.at(it->first);
            it->second = AF;
            if (AAint == AncestralAlleleRef) {
                setDAFsFromLikelihoods.at(it->first) = AF;
            } else if (AAint == AncestralAlleleAlt) {
                setDAFsFromLikelihoods.at(it->first) = (1 - AF);
            }
        }
    }
     
}

// Derives per-sample and per-set allele frequencies from the AD FORMAT entry (the pool-seq path).
//
// genotypeFields : the per-sample columns of one VCF line
// setsToPosMap   : set name -> indices of that set's samples within genotypeFields
// ADTagPosition  : index of the AD entry within each ':'-separated sample column
// minDepth       : a sample whose summed depth (ADs[0] + ADs[1]) is below this is not updated
//
// Writes individualPoolAAFs, setPoolAAFs and setPoolDAFs.
// Exits the program if an AD entry is "." or does not hold exactly two comma-separated values.
// -1.0 is treated as the "no data" marker throughout (presumably the value these containers
// are initialised with - TODO confirm against the constructor).
// NOTE(review): the VCF specification orders AD as ref,alt, so ADs[0] would be the reference
// depth; confirm that it is the intended numerator for the frequency stored here.
void GeneralSetCounts::getAFsFromADtag(const std::vector<std::string>& genotypeFields, const std::map<string, std::vector<size_t>>& setsToPosMap, const int ADTagPosition, const int minDepth) {
    for (std::vector<std::string>::size_type i = 0; i < genotypeFields.size(); i++) {
        const std::string thisADstring = split(genotypeFields[i], ':')[ADTagPosition];
        if (thisADstring == ".") {
            std::cerr << "The AD tag info appears to be missing: " << thisADstring << " ; Exiting ..." << std::endl;
            exit(1);
        }

        const std::vector<double> ADs = splitToDouble(thisADstring,',');
        if (ADs.size() != 2) {
            std::cerr << "This AD tag appears malformed: " << thisADstring << " ; Exiting ..." << std::endl;
            exit(1);
        }

        const int overallDepth = ADs[0] + ADs[1];
        // A depth of zero can never give a frequency, whatever minDepth is (avoids 0/0).
        if (overallDepth > 0 && overallDepth >= minDepth) {
            individualPoolAAFs[i] = ADs[0]/(overallDepth);
        }
    }

    for (std::map<string, std::vector<size_t>>::const_iterator it = setsToPosMap.begin(); it != setsToPosMap.end(); ++it) {
        const std::vector<size_t>& members = it->second;
        assert(!members.empty());

        double aaf = -1.0;
        if (members.size() == 1) {
            aaf = individualPoolAAFs[members[0]];
        } else {
            // Average only the samples that carry data.
            double sum = 0; size_t numUsable = 0;
            for (size_t k = 0; k < members.size(); k++) {
                const double individualAF = individualPoolAAFs[members[k]];
                if (individualAF != -1.0) { sum += individualAF; numUsable++; }
            }
            // With no usable sample the set stays "missing" instead of becoming 0/0.
            if (numUsable > 0) aaf = sum / (double)numUsable;
        }
        setPoolAAFs.at(it->first) = aaf;

        if (AAint == AncestralAlleleRef) {
            setPoolDAFs.at(it->first) = aaf;
        } else if (AAint == AncestralAlleleAlt && aaf != -1.0) {
            setPoolDAFs.at(it->first) = (1 - aaf);
        }
    }
}


// Same as getAFsFromADtag, but additionally fills two "split" frequencies per set, each the
// mean of a resample (with replacement) of that set's usable per-sample frequencies.
//
// genotypeFields : the per-sample columns of one VCF line
// setsToPosMap   : set name -> indices of that set's samples within genotypeFields
// ADTagPosition  : index of the AD entry within each ':'-separated sample column
// minDepth       : a sample whose summed depth (ADs[0] + ADs[1]) is below this is not updated
//
// Writes individualPoolAAFs, setPoolAAFs/setPoolAAFsplit1/setPoolAAFsplit2 and the matching DAF maps.
// Exits the program if an AD entry is "." or does not hold exactly two comma-separated values.
// -1.0 is treated as the "no data" marker (presumably the initial value - TODO confirm).
void GeneralSetCountsWithSplits::getAFsFromADtagWithSplits(const std::vector<std::string>& genotypeFields, const std::map<string, std::vector<size_t>>& setsToPosMap, const int ADTagPosition, const int minDepth) {
    // One engine per thread, seeded on first use; re-seeding a Mersenne Twister for every
    // set of every variant is needlessly expensive.
    static thread_local std::mt19937 rng(std::random_device{}());

    for (std::vector<std::string>::size_type i = 0; i < genotypeFields.size(); i++) {
        const std::string thisADstring = split(genotypeFields[i], ':')[ADTagPosition];
        if (thisADstring == ".") {
            std::cerr << "The AD tag info appears to be missing: " << thisADstring << " ; Exiting ..." << std::endl;
            exit(1);
        }

        const std::vector<double> ADs = splitToDouble(thisADstring,',');
        if (ADs.size() != 2) {
            std::cerr << "This AD tag appears malformed: " << thisADstring << " ; Exiting ..." << std::endl;
            exit(1);
        }

        const int overallDepth = ADs[0] + ADs[1];
        // A depth of zero can never give a frequency, whatever minDepth is (avoids 0/0).
        if (overallDepth > 0 && overallDepth >= minDepth) {
            individualPoolAAFs[i] = ADs[0]/(overallDepth);
        }
    }

    for (std::map<string, std::vector<size_t>>::const_iterator it = setsToPosMap.begin(); it != setsToPosMap.end(); ++it) {
        const std::vector<size_t>& members = it->second;
        assert(!members.empty());

        double aaf = -1.0; double aafSplit1 = -1.0; double aafSplit2 = -1.0;
        if (members.size() == 1) {
            aaf = aafSplit1 = aafSplit2 = individualPoolAAFs[members[0]];
        } else {
            // Collect this set's usable frequencies, skipping samples without data
            // (the same filtering getAFsFromADtag applies).
            std::vector<double> thisSetAFs;
            thisSetAFs.reserve(members.size());
            for (size_t k = 0; k < members.size(); k++) {
                const double individualAF = individualPoolAAFs[members[k]];
                if (individualAF != -1.0) thisSetAFs.push_back(individualAF);
            }

            if (!thisSetAFs.empty()) {
                const size_t numUsable = thisSetAFs.size();
                double sum = 0;
                for (size_t k = 0; k < numUsable; k++) sum += thisSetAFs[k];
                aaf = sum / (double)numUsable;

                // Take care of the splits by random sampling with replacement.
                // The draws index thisSetAFs, i.e. this set's own samples; indexing
                // individualPoolAAFs with the drawn number would pick from the first
                // samples of the VCF line regardless of which set they belong to.
                std::uniform_int_distribution<size_t> pick(0, numUsable - 1);
                double sumSplit1 = 0; double sumSplit2 = 0;
                for (size_t k = 0; k < numUsable; k++) {
                    sumSplit1 += thisSetAFs[pick(rng)];
                    sumSplit2 += thisSetAFs[pick(rng)];
                }
                aafSplit1 = sumSplit1 / (double)numUsable;
                aafSplit2 = sumSplit2 / (double)numUsable;
            }
            // With no usable sample all three values stay at the "missing" marker.
        }
        setPoolAAFs.at(it->first) = aaf;
        setPoolAAFsplit1.at(it->first) = aafSplit1;
        setPoolAAFsplit2.at(it->first) = aafSplit2;

        if (AAint == AncestralAlleleRef) {
            setPoolDAFs.at(it->first) = aaf;
            setPoolDAFsplit1.at(it->first) = aafSplit1;
            setPoolDAFsplit2.at(it->first) = aafSplit2;
        } else if (AAint == AncestralAlleleAlt && aaf != -1.0) {
            setPoolDAFs.at(it->first) = (1 - aaf);
            setPoolDAFsplit1.at(it->first) = (1 - aafSplit1);
            setPoolDAFsplit2.at(it->first) = (1 - aafSplit2);
        }
    }
}


// Only works for diploids for now!!!
//
// Computes, for every set, the allele frequency implied by the per-sample expected genotypes
// (sum of expected genotypes / (2 * number of samples)) plus two "split" frequencies obtained
// by resampling those expected genotypes with replacement.
//
// genotypeFields                         : the per-sample columns of one VCF line
// posToSpeciesMap                        : sample index -> set name
// likelihoodsOrProbabilitiesTagPosition  : index of the PL/GL/GP entry within each sample column
// pos                                    : currently unused (it was only read by debugging output)
//
// For PL and GL input, setHWEpriorsFromAFfromGT() is called first. Sets without any expected
// genotype are left untouched.
void GeneralSetCountsWithSplits::getAFsFromGenotypeLikelihoodsOrProbabilitiesWithSplits(const std::vector<std::string>& genotypeFields, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition, const int pos) {
    // One engine per thread, seeded on first use; re-seeding a Mersenne Twister for every
    // set of every variant is needlessly expensive.
    static thread_local std::mt19937 rng(std::random_device{}());

    if (likelihoodsProbabilitiesType == LikelihoodsProbabilitiesPL || likelihoodsProbabilitiesType == LikelihoodsProbabilitiesGL) {
        setHWEpriorsFromAFfromGT();
    }

    getBasicCountsFromLikelihoodsOrProbabilities(genotypeFields, posToSpeciesMap, likelihoodsOrProbabilitiesTagPosition);

    // Now fill in the allele frequencies
    for(std::map<string,std::vector<double>>::iterator it = setIndividualExpectedGenotypes.begin(); it != setIndividualExpectedGenotypes.end(); ++it) {
        if (it->first == "") {
            std::cerr << "it->first " << it->first << "\t"; print_vector(it->second, std::cerr); std::cerr << std::endl;
        }
        // Bind by reference: the vector is only read here.
        const std::vector<double>& expectedGenotypes = it->second;
        if (expectedGenotypes.empty()) continue;

        const size_t numIndividuals = expectedGenotypes.size();
        const double numAlleles = 2.0 * (double)numIndividuals;   // diploid assumption

        double sum = 0;
        for (size_t k = 0; k < numIndividuals; k++) sum += expectedGenotypes[k];
        const double thisAAF = sum / numAlleles;
        setAAFsFromLikelihoods.at(it->first) = thisAAF;

        // Take care of the splits by random sampling with replacement:
        std::uniform_int_distribution<size_t> pick(0, numIndividuals - 1);
        double sumSplit1 = 0; double sumSplit2 = 0;
        for (size_t k = 0; k < numIndividuals; k++) {
            sumSplit1 += expectedGenotypes[pick(rng)];
            sumSplit2 += expectedGenotypes[pick(rng)];
        }
        const double thisAAFsplit1 = sumSplit1 / numAlleles;
        const double thisAAFsplit2 = sumSplit2 / numAlleles;

        try {
            setAAFsplit1fromLikelihoods.at(it->first) = thisAAFsplit1; setAAFsplit2fromLikelihoods.at(it->first) = thisAAFsplit2;

            if (AAint == AncestralAlleleRef) { // Ancestral allele seems to be the ref, so derived is alt
                setDAFsFromLikelihoods.at(it->first) = thisAAF;
                setDAFsplit1fromLikelihoods.at(it->first) = thisAAFsplit1;
                setDAFsplit2fromLikelihoods.at(it->first) = thisAAFsplit2;
            } else if (AAint == AncestralAlleleAlt) { // Ancestral allele seems to be alt, so derived is ref
                setDAFsFromLikelihoods.at(it->first) = (1 - thisAAF);
                setDAFsplit1fromLikelihoods.at(it->first) = 1 - thisAAFsplit1;
                setDAFsplit2fromLikelihoods.at(it->first) = 1 - thisAAFsplit2;
            }
        } catch (std::out_of_range& e) { std::cerr << "The trouble was here" << it->first << std::endl; }
    }
}



// Works only on biallelic markers
void GeneralSetCounts::getSetVariantCountsSimple(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {
    // std::cerr << fields[0] << "\t" << fields[1] << std::endl;
    getBasicCounts(genotypes, posToSpeciesMap);
    
    // Now fill in the allele frequencies
    for(std::map<string,int>::iterator it = setAltCounts.begin(); it != setAltCounts.end(); ++it) {
        if (setAlleleCounts.at(it->first) > 0) {
            setAAFs[it->first] = (double)setAltCounts.at(it->first)/setAlleleCounts.at(it->first);
        }
    }
}

void GeneralSetCounts::getBasicCounts(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {
    // Go through the genotypes - only biallelic markers are allowed
    for (std::vector<std::string>::size_type i = 0; i != genotypes.size(); i++) {
        bool speciesDefined = true;
        std::string species; try { species = posToSpeciesMap.at(i); } catch (const std::out_of_range& oor) {
            speciesDefined = false;
        }
        // The first allele in this individual
        if (genotypes[i][0] == '1') { overall++; individualsWithVariant[i]++; }
        if (genotypes[i][2] == '1') { overall++; individualsWithVariant[i]++; }
        if (speciesDefined) {
            if (genotypes[i][0] == '1') {
                setAltCounts[species]++; setAlleleCounts[species]++;
            } else if (genotypes[i][0] == '0') {
                setAlleleCounts[species]++;
            }
            // The second allele in this individual
            if (genotypes[i][2] == '1') {
                setAltCounts[species]++; setAlleleCounts[species]++;
            } else if (genotypes[i][2] == '0') {
                setAlleleCounts[species]++;
            }
        }
    }
}

void GeneralSetCountsWithSplits::getBasicCountsWithSplitsNew(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {
    
    // Go through the genotypes - only biallelic markers are allowed
    for (std::vector<std::string>::size_type i = 0; i != genotypes.size(); i++) {
        bool speciesDefined = true;
        std::string species; try { species = posToSpeciesMap.at(i); } catch (const std::out_of_range& oor) {
            speciesDefined = false;
        }
        
        if (speciesDefined) {
            string onlyGenotypeCalls = split(genotypes[i], ':')[0];   // The string with 0/0, 0/1, 1/0, 1/1, or e.g. 0/0/1/1 for a tetraploid
            if (onlyGenotypeCalls[0] == '.') {
                continue;   // Ignore missing data
            }
            // Find ploidy
            int l = (int)onlyGenotypeCalls.length();
            int numGTs = (l/2)+1;
            setAlleleCounts[species] += numGTs;
            
            // Go through the genotypes and fill in the data structure "GeneralSetCountsWithSplits"
            for (std::vector<std::string>::size_type j = 0; j <= l; j = j+2) {
               // std::cerr << "genotypes[i][j]: " << genotypes[i][j] << std::endl;
                setGenotypes[species].push_back(genotypes[i][j] - '0');
                if (genotypes[i][j] == '1') {
                    overall++; individualsWithVariant[i]++;
                    setAltCounts[species]++;
                }
            }
            double individualAF = (double)individualsWithVariant[i]/numGTs;
            
            /* std::cerr << "onlyGenotypeCalls: " << onlyGenotypeCalls << std::endl;
            std::cerr << "individualsWithVariant[i]: " << individualsWithVariant[i] << std::endl;
            std::cerr << "numGTs: " << numGTs << std::endl;
            std::cerr << "individualAF: " << individualAF << std::endl;
            */
            setIndividualAFs[species].push_back(individualAF);
        }
    }
}

// For every sample that belongs to a set and has a likelihood/probability entry, converts
// that entry into genotype probabilities and appends the resulting expected genotype to
// setIndividualExpectedGenotypes[set]. Only biallelic markers are allowed.
//
// genotypes                              : the per-sample columns of one VCF line
// posToSpeciesMap                        : sample index -> set name; samples without an entry are skipped
// likelihoodsOrProbabilitiesTagPosition  : index of the PL/GL/GP entry within each sample column
//
// Entries equal to "." are skipped. Each used sample adds 2 to setAlleleProbCounts[set]
// (which must already contain the set, since .at() is used).
void GeneralSetCountsWithSplits::getBasicCountsFromLikelihoodsOrProbabilities(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition) {
    for (std::vector<string>::size_type i = 0; i != genotypes.size(); i++) {
        // A sample that is in no set is the normal case, so look it up without exceptions.
        const std::map<size_t, string>::const_iterator speciesIt = posToSpeciesMap.find(i);
        if (speciesIt == posToSpeciesMap.end()) continue;
        const string& species = speciesIt->second;

        const string thisLikelihoodsOrProbabilitiesString = split(genotypes[i], ':')[likelihoodsOrProbabilitiesTagPosition];
        if (thisLikelihoodsOrProbabilitiesString == ".") continue;

        setAlleleProbCounts.at(species) += 2;
        std::vector<double> thisLikelihoodsOrProbabilities = splitToDouble(thisLikelihoodsOrProbabilitiesString,',');
        std::vector<double> thisProbabilities;
        switch (likelihoodsProbabilitiesType)
        {
            case LikelihoodsProbabilitiesPL:
                transformFromPhred(thisLikelihoodsOrProbabilities);
                thisProbabilities = probabilitiesFromLikelihoods(thisLikelihoodsOrProbabilities,species);
                break;
            case LikelihoodsProbabilitiesGL:
                // Handle GL exactly as the non-split counterpart does; leaving this case
                // empty would compute the expected genotype from an empty vector.
                transformFromGL(thisLikelihoodsOrProbabilities);
                thisProbabilities = probabilitiesFromLikelihoods(thisLikelihoodsOrProbabilities,species);
                break;
            case LikelihoodsProbabilitiesGP:
                thisProbabilities = thisLikelihoodsOrProbabilities;
                break;
        }
        setIndividualExpectedGenotypes[species].push_back(getExpectedGenotype(thisProbabilities));
    }
}

void GeneralSetCountsWithSplits::getSplitCountsNew(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap) {
    
    getBasicCountsWithSplitsNew(genotypes, posToSpeciesMap);
    
    // If at least one of the outgroup individuals has non-missing data
    // Find out what is the "ancestral allele" - i.e. the one more common in the outgroup
    try {
        if (setAlleleCounts.at("Outgroup") > 0) {
            if ((double)vector_sum(setGenotypes.at("Outgroup"))/setGenotypes.at("Outgroup").size() < 0.5) { AAint = AncestralAlleleRef; }
            else { AAint = AncestralAlleleAlt; } 
        }
    } catch (std::out_of_range& e) { AAint = -1; }
    
    // Now fill in the allele frequencies
    double totalAAF = 0; int numNonZeroCounts = 0;
    for(std::map<string,std::vector<int>>::iterator it = setGenotypes.begin(); it != setGenotypes.end(); ++it) {
        if (it->first == "") {
            std::cerr << "it->first " << it->first << "\t"; print_vector(it->second, std::cerr); std::cerr << std::endl;
        }
        std::vector<int> thisSetGenotypes = setGenotypes.at(it->first);
        std::vector<double> thisSetIndividualAFs = setIndividualAFs.at(it->first);
        
        if (thisSetGenotypes.size() > 0) {
            numNonZeroCounts++;
            double thisAAF = (double)vector_sum(thisSetGenotypes)/thisSetGenotypes.size();
           // print_vector(thisSetGenotypes, std::cerr);
           // std::cerr << "thisAAF: " << thisAAF << std::endl;
            setAAFs[it->first] = thisAAF; totalAAF += thisAAF;
            
            // Take care of the splits by random sampling with replacement:
            std::random_device rd;     // only used once to initialise (seed) engine
            std::mt19937 rng(rd());    // random-number engine used (Mersenne-Twister in this case)
            std::uniform_int_distribution<int> uni(0,((int)thisSetGenotypes.size() - 1)); // guaranteed unbiased
            std::uniform_int_distribution<int> uniAFs(0,((int)thisSetIndividualAFs.size() - 1)); // guaranteed unbiased
            
         /*   std::vector<int> thisSetGenotypesSampledSplit1; std::vector<int> thisSetGenotypesSampledSplit2;
            for (int i = 0; i < thisSetGenotypes.size(); i++) {
                int random_pos_s1 = uni(rng);
                int random_pos_s2 = uni(rng);
                thisSetGenotypesSampledSplit1.push_back(thisSetGenotypes[random_pos_s1]);
                thisSetGenotypesSampledSplit2.push_back(thisSetGenotypes[random_pos_s2]);
            }
          */
            
            std::vector<double> thisSetIndividualAFsSampledSplit1; std::vector<double> thisSetIndividualAFsSampledSplit2;
            for (int i = 0; i < thisSetIndividualAFs.size(); i++) {
                int random_pos_s1 = uniAFs(rng);
                int random_pos_s2 = uniAFs(rng);
                thisSetIndividualAFsSampledSplit1.push_back(thisSetIndividualAFs[random_pos_s1]);
                thisSetIndividualAFsSampledSplit2.push_back(thisSetIndividualAFs[random_pos_s2]);
            }
            
          //  double thisAAFsplit1 = vector_average(thisSetGenotypesSampledSplit1);
          //  double thisAAFsplit2 = vector_average(thisSetGenotypesSampledSplit2);
            double thisAAFsplit1 = vector_average(thisSetIndividualAFsSampledSplit1);
            double thisAAFsplit2 = vector_average(thisSetIndividualAFsSampledSplit2);
            setAAFsplit1[it->first] = thisAAFsplit1; setAAFsplit2[it->first] = thisAAFsplit2;
            
            // Count correction as in admixtools
         //   double ya = vector_sum(thisSetGenotypes); double yb = thisSetGenotypes.size() - vector_sum(thisSetGenotypes);
         //   double yt = (double)thisSetGenotypes.size();
         //   double h = ya * yb / (yt * (yt - 1.0));
            //std::cerr << "it->first: " << it->first << std::endl;
            //std::cerr << "ya: " << ya << " ; yb: " << yb << " ; yt: " << yt << std::endl;
            //std::cerr << "h: " << h << " ; h / yt: " << h / yt << std::endl;
            
          //  setCorrectionFactors[it->first] = h / yt;
            
           // std::cerr << "it->first " << it->first << std::endl;
            try {
            if (AAint == AncestralAlleleRef) { // Ancestral allele seems to be the ref, so derived is alt
                setDAFs[it->first] = thisAAF;
                setDAFsplit1[it->first] = thisAAFsplit1; setDAFsplit2[it->first] = thisAAFsplit2;
            } else if (AAint == AncestralAlleleAlt) { // Ancestral allele seems to be alt, so derived is ref
                setDAFs[it->first] = (1 - thisAAF);
                setDAFsplit1[it->first] = 1 - thisAAFsplit1;
                setDAFsplit2[it->first] = 1 - thisAAFsplit2;
            }
                } catch (std::out_of_range& e) { std::cerr << "The trouble was here" << it->first << std::endl; }
        }
    }
    averageAAF = totalAAF/numNonZeroCounts;
    if (AAint == AncestralAlleleRef) averageDAF = averageAAF;
    else if (AAint == AncestralAlleleAlt) averageDAF = (1 - averageAAF);
}



// Returns the index of the "AD" entry within the ':'-separated FORMAT column
// (vcfLineFields[8]). Prints an error and exits the program if FORMAT has no AD entry.
//
// A FORMAT column with a single entry gets no special treatment: returning position 0
// for it would make callers read whatever that single entry is (typically GT) as if it
// were AD. If the single entry is AD, the regular lookup finds it at position 0 anyway.
int GeneralSetCounts::findADtagPosition(const std::vector<std::string>& vcfLineFields) {
    const std::vector<std::string> format = split(vcfLineFields[8], ':');

    const int ADTagPosition = returnFormatTagPosition(format, "AD");
    // std::numeric_limits<int>::min() is the "not found" marker tested for here.
    if (ADTagPosition == std::numeric_limits<int>::min()) {
        std::cerr << "Could not find the AD tag in the VCF file. This tag is requored to use the pool-seq option. Exiting ...." << std::endl;
        exit(1);
    }
    return ADTagPosition;
}


// Returns D = (ABBA - BABA) / (ABBA + BABA) for the given pattern totals.
// No check is made for a zero denominator.
double calculateOneDs(double ABBAtotal, double BABAtotal) {
    const double numerator = ABBAtotal - BABAtotal;
    const double denominator = ABBAtotal + BABAtotal;
    return numerator/denominator;
}



// Computes the three D values for the three possible arrangements of a trio:
//   [0] = (ABBA - BABA) / (ABBA + BABA)
//   [1] = (ABBA - BBAA) / (ABBA + BBAA)
//   [2] = (BBAA - BABA) / (BBAA + BABA)
// The returned pointer refers to a function-local static array of three doubles, so the
// values are overwritten by the next call and must not be freed by the caller.
// No check is made for zero denominators.
double* calculateThreeDs(double ABBAtotal, double BABAtotal, double BBAAtotal) {
    static double Ds[3];
    Ds[0] = (ABBAtotal - BABAtotal) / (ABBAtotal + BABAtotal);
    Ds[1] = (ABBAtotal - BBAAtotal) / (ABBAtotal + BBAAtotal);
    Ds[2] = (BBAAtotal - BABAtotal) / (BBAAtotal + BABAtotal);
    return Ds;
}


// Parses the leading number in s with stream extraction and returns it.
// Returns 0 when nothing can be parsed (empty string, non-numeric text).
double stringToDouble(std::string s) {
    // Start from a defined value: for an empty string the extraction fails before the
    // target is written, which would otherwise return an uninitialised double.
    double d = 0.0;
    std::stringstream ss(s); //turn the string into a stream
    ss >> d; //convert
    return d;
}


// Remove a single file extension from the filename:
// everything from the last '.' onwards is dropped; a name without '.' is returned unchanged.
std::string stripExtension(const std::string& filename)
{
    const size_t lastDot = filename.rfind('.');
    const bool hasSuffix = (lastDot != std::string::npos);
    return hasSuffix ? filename.substr(0, lastDot) : filename;
}


// Splits s at every occurrence of delim and appends the pieces to elems.
// Adjacent delimiters give empty pieces; a trailing delimiter does not add a final empty
// piece, and an empty s appends nothing.
void split(const std::string &s, char delim, std::vector<std::string> &elems) {
    std::istringstream pieces(s);
    for (std::string piece; std::getline(pieces, piece, delim); ) {
        elems.push_back(piece);
    }
}

// Convenience overload: returns the pieces of s in a new vector.
std::vector<std::string> split(const std::string &s, char delim) {
    std::vector<std::string> pieces;
    split(s, delim, pieces);
    return pieces;
}
    
    
// Splits s at every occurrence of delim, converts each piece with stringToDouble()
// and appends the results to elems.
void splitToDouble(const std::string &s, char delim, std::vector<double> &elems) {
    std::istringstream pieces(s);
    for (std::string piece; std::getline(pieces, piece, delim); ) {
        elems.push_back(stringToDouble(piece));
    }
}

std::vector<double> splitToDouble(const std::string &s, char delim) {
    std::vector<double> elems;
    splitToDouble(s, delim, elems);
    return elems;
}

// Splits s at every occurrence of the (possibly multi-character) delimiter delim.
// Always returns at least one element: text without the delimiter comes back as a single
// piece, and a trailing delimiter yields a final empty piece.
// An empty delimiter returns s unsplit, because searching for it would never advance.
std::vector<std::string> split2(std::string s, std::string delim) {
    std::vector<std::string> elems;
    if (delim.empty()) {
        elems.push_back(s);
        return elems;
    }

    // Walk through s with a moving start offset instead of erasing the consumed prefix,
    // which would shift the remaining characters on every piece.
    size_t start = 0;
    size_t pos = 0;
    while ((pos = s.find(delim, start)) != std::string::npos) {
        elems.push_back(s.substr(start, pos - start));
        start = pos + delim.length();
    }
    elems.push_back(s.substr(start));
    return elems;
}


// Returns, for every name in `set` that occurs in sample_names, the index of its first
// occurrence, in the order of `set`. A name that is not found is reported on std::cerr
// (followed by the full list of sample names) and contributes no index.
std::vector<size_t> locateSet(const std::vector<std::string>& sample_names, const std::vector<std::string>& set) {
    std::vector<size_t> setLocs;
    for (const std::string& wanted : set) {
        const std::vector<std::string>::const_iterator hit = std::find(sample_names.begin(), sample_names.end(), wanted);
        if (hit != sample_names.end()) {
            setLocs.push_back(static_cast<size_t>(std::distance(sample_names.begin(), hit)));
            continue;
        }
        std::cerr << "Did not find the sample: \"" << wanted << "\"" << std::endl;
        print_vector(sample_names, std::cerr,',');
    }
    return setLocs;
}


// Returns the last len characters of seq.
// seq must be at least len characters long (checked with assert).
std::string suffix(const std::string& seq, size_t len)
{
    const size_t total = seq.length();
    assert(total >= len);
    const size_t firstKept = total - len;
    return seq.substr(firstKept);
}

// Returns true if the filename has an extension indicating it is compressed,
// i.e. if it ends with GZIP_EXT.
bool isGzip(const std::string& filename)
{
    const size_t extensionLength = sizeof(GZIP_EXT) - 1;   // literal length without the terminating '\0'

    // Names shorter than the extension cannot end with it
    const bool longEnough = filename.length() >= extensionLength;
    if (!longEnough) return false;

    const size_t extensionStart = filename.length() - extensionLength;
    return filename.compare(extensionStart, extensionLength, GZIP_EXT) == 0;
}

// Ensure an input file stream is open:
// returns normally if fh is open, otherwise reports fn on std::cerr and exits with EXIT_FAILURE.
void assertFileOpen(std::ifstream& fh, const std::string& fn)
{
    if (fh.is_open()) return;

    std::cerr << "ERROR: Could not open " << fn << " for read\n";
    exit(EXIT_FAILURE);
}
// Ensure an output file stream is open:
// returns normally if fh is open, otherwise reports fn on std::cerr and exits with EXIT_FAILURE.
void assertFileOpen(std::ofstream& fh, const std::string& fn)
{
    if (fh.is_open()) return;

    std::cerr << "ERROR: Could not open " << fn << " for write\n";
    exit(EXIT_FAILURE);
}


// Ensure a gz stream is usable: returns normally if gh.good(), otherwise reports fn on
// std::cerr and exits with EXIT_FAILURE.
void assertGZOpen(gzstreambase& gh, const std::string& fn)
{
    if (gh.good()) return;

    std::cerr << "ERROR: Could not open " << fn << std::endl;
    exit(EXIT_FAILURE);
}

// Verifies that a split VCF line has at least one column beyond the first
// NUM_NON_GENOTYPE_COLUMNS. If not, prints an error naming variantNum together with the
// fields that were found, and exits with status 1.
void checkGenotypesExist(const std::vector<std::string>& fields, const int variantNum) {
    const bool hasGenotypeColumns = fields.size() > NUM_NON_GENOTYPE_COLUMNS;
    if (hasGenotypeColumns) return;

    std::cerr << "ERROR: Variant " << variantNum << " in the VCF appears to be truncated."  << std::endl;
    print_vector(fields, std::cerr);
    std::cerr << "Exiting..." << std::endl;
    exit(1);
}

// Open a file that may or may not be gzipped for reading
// The caller is responsible for freeing the handle
std::istream* createReader(const std::string& filename, std::ios_base::openmode mode)
{
    if(isGzip(filename))
    {
        igzstream* pGZ = new igzstream(filename.c_str(), mode);
        assertGZOpen(*pGZ, filename);
        return pGZ;
    }
    else
    {
        std::ifstream* pReader = new std::ifstream(filename.c_str(), mode);
        assertFileOpen(*pReader, filename);
        return pReader;
    }
}

// Open a file that may or may not be gzipped for writing
// The caller is responsible for freeing the handle
std::ostream* createWriter(const std::string& filename,
                           std::ios_base::openmode mode)
{
    if(isGzip(filename))
    {
        ogzstream* pGZ = new ogzstream(filename.c_str(), mode);
        assertGZOpen(*pGZ, filename);
        return pGZ;
    }
    else
    {
        std::ofstream* pWriter = new std::ofstream(filename.c_str(), mode);
        assertFileOpen(*pWriter, filename);
        return pWriter;
    }
}

// Returns true if a file with this name can be opened for reading.
bool file_exists(const std::string& name) {
    std::ifstream probe;
    probe.open(name.c_str());
    const bool opened = probe.good();
    return opened;
}


// Makes two independent draws with success probability p and sets splitA (first draw)
// and splitB (second draw) to 1 on success.
// On failure the corresponding output is left untouched, so callers must initialise
// splitA and splitB beforehand. Uses rand(), hence the sequence depends on srand().
void assignSplits01FromAlleleFrequency(const double p, double& splitA, double& splitB) {
    // Map rand() onto [0,1) and compare strictly, so that p == 0 can never succeed and
    // p == 1 always does; with a closed [0,1] range and "<=", a draw of exactly 0 would
    // succeed even for p == 0.
    const double r = (double) rand() / ((double) RAND_MAX + 1.0);
    if (r < p) { splitA = 1; }
    const double r2 = (double) rand() / ((double) RAND_MAX + 1.0);
    if (r2 < p) { splitB = 1; }
}


================================================
FILE: Dsuite_utils.h
================================================
//
//  Dsuite_utils.h
//  Dsuite
//
//  Created by Milan Malinsky on 02/04/2019.
//

#ifndef Dsuite_utils_h
#define Dsuite_utils_h
#include <getopt.h>
#include <stdio.h>
#include <map>
#include <vector>
#include <sstream>
#include <fstream>
#include <cmath>
#include <algorithm>
#include <assert.h>
#include <time.h>
#include <regex>
#include <iterator>
#include <algorithm>
#include <limits>
#include <random>
#include <list>
#include <cstdint>
#include <iterator>
#include "gzstream.h"
#include "kstest.h"

#define PROGRAM_BIN "Dsuite"
#define PACKAGE_BUGREPORT "milan.malinsky@iee.unibe.ch"
#define GZIP_EXT ".gz"
#define F4HEADER "f4-ratio"
#define ploidy 2

#define CHECK_TREE_ERROR_MSG "It seems that this species is in the SETS.txt file but can't be found in the tree. Please check the spelling and completeness of your tree file."

#define P3isTrios2_Dpositive 1      // 1 - trios[i][0] and trios[i][1] are P1 and P2; D >= 0
#define P3isTrios2_Dnegative 2      // 2 - trios[i][0] and trios[i][1] are P1 and P2; D < 0
#define P3isTrios1_Dpositive 3      // 3 - trios[i][0] and trios[i][2] are P1 and P2; D >= 0
#define P3isTrios1_Dnegative 4      // 4 - trios[i][0] and trios[i][2] are P1 and P2; D < 0
#define P3isTrios0_Dpositive 5      // 5 - trios[i][2] and trios[i][1] are P1 and P2; D >= 0
#define P3isTrios0_Dnegative 6      // 6 - trios[i][2] and trios[i][1] are P1 and P2; D < 0

#define P3isTrios2 7    // 7 - trios[i][0] and trios[i][1] are P1 and P2;
#define P3isTrios1 8    // 8 - trios[i][0] and trios[i][2] are P1 and P2;
#define P3isTrios0 9    // 9 - trios[i][1] and trios[i][2] are P1 and P2;

#define ABBAvector 0
#define BABAvector 1
#define BBAAvector 2


#define OutgroupNotRequired 0
#define OutgroupRequired 1

#define LikelihoodsProbabilitiesAbsent 0
#define LikelihoodsProbabilitiesGP 1
#define LikelihoodsProbabilitiesGL 2
#define LikelihoodsProbabilitiesPL 3

#define AncestralAlleleMissing -1
#define AncestralAlleleRef 0
#define AncestralAlleleAlt 1

using std::string;
// VCF format constant
static const int NUM_NON_GENOTYPE_COLUMNS=9;  // 8 mandatory columns + 1 column with definition of the genotype columns

void assertFileOpen(std::ifstream& fh, const std::string& fn);
void assertFileOpen(std::ofstream& fh, const std::string& fn);
void checkGenotypesExist(const std::vector<std::string>& fields, const int variantNum);
double calculateOneDs(double ABBAtotal, double BABAtotal);
double* calculateThreeDs(double ABBAtotal, double BABAtotal, double BBAAtotal);
double f4_perVariant(double p1, double p2, double p3, double p4);
double Fd_Denom_perVariant(double p1, double p2, double p3, double pO);
double fG_Denom_perVariant(double p1, double p3a, double p3b, double pO);
double FdM_Denom_perVariant(double p1, double p2, double p3, double pO);
long double normalCDF(double x);
double stringToDouble(std::string s);
std::string stripExtension(const std::string& filename);
std::vector<std::string> split2(std::string s, string delim);
std::vector<std::string> split(const std::string &s, char delim);
std::vector<double> splitToDouble(const std::string &s, char delim);
std::vector<size_t> locateSet(const std::vector<std::string>& sample_names, const std::vector<std::string>& set);
std::istream* createReader(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in);
std::ostream* createWriter(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out);
bool file_exists(const std::string& name);
void assignSplits01FromAlleleFrequency(const double p, double& splitA, double& splitB);

// Converting numbers (int, double, size_t, and char) to string
// by streaming the value with operator<< under the default stream formatting.
template <typename T> std::string numToString(T i) {
    std::stringstream formatter;
    formatter << i;
    return formatter.str();
}

///Represents the exception for taking the median of an empty list
class median_of_empty_list_exception:public std::exception{
public:
  ///Returns a fixed explanatory message.
  ///Declared public so that it can also be called on the concrete type, and with
  ///noexcept instead of the deprecated dynamic exception specification throw().
  const char* what() const noexcept override {
    return "Attempt to take the median of an empty list of numbers.  "
      "The median of an empty list is undefined.";
  }
};

///Return the median of the numbers in [begin, end), given as random access iterators.
///Throws median_of_empty_list_exception for an empty range (the median is undefined then).
///
///The numbers must be convertible to double. The range is reordered in place by
///std::nth_element. For an even number of elements the mean of the two middle
///elements is returned.
template<class RandAccessIter> double median(RandAccessIter begin, RandAccessIter end) {
  if(begin == end){ throw median_of_empty_list_exception(); }

  const std::size_t count = end - begin;
  RandAccessIter upperMiddle = begin + count/2;
  std::nth_element(begin, upperMiddle, end);

  const bool oddCount = (count % 2 != 0);
  if(oddCount){
    return *upperMiddle;
  }

  //Even number of elements: remember the upper middle value before reordering again
  const double upperValue = *upperMiddle;
  RandAccessIter lowerMiddle = upperMiddle-1;
  std::nth_element(begin, lowerMiddle, end);
  return (upperValue+*lowerMiddle)/2.0;
}



// Print an arbitrary vector to a file
//
// Writes the elements of `vector` to `outFile`, separated by `delim`.
// After the last element a line ending (std::endl) is written when `endLine`
// is true. An empty container produces no output at all, not even a line ending.
//
// The container is taken by const reference so that it is not copied on every
// call; it only needs size() and operator[]. The index is unsigned to match
// the container's size type.
template <class T> void print_vector(const T& vector, std::ostream& outFile, char delim = '\t', bool endLine = true) {
    const std::size_t numElements = vector.size();
    for (std::size_t i = 0; i < numElements; i++) {
        outFile << vector[i];
        if (i + 1 < numElements) {
            outFile << delim;
        } else if (endLine) {
            outFile << std::endl;
        }
    }
}

// Arithmetic mean of the elements of `vector`, accumulated in a double in
// index order.
//
// The container is taken by const reference so that it is not copied on every
// call; it only needs size() and operator[]. For an empty container the
// result is 0.0/0.0, i.e. NaN under IEEE floating point.
template <class T> double vector_average(const T& vector) {
    const std::size_t numElements = vector.size();
    double sum = 0;
    for (std::size_t i = 0; i < numElements; i++) {
        sum += vector[i];
    }
    return sum / static_cast<double>(numElements);
}

// Sum of the elements of `vector`, accumulated in a double in index order.
//
// The container is taken by const reference so that it is not copied on every
// call; it only needs size() and operator[]. An empty container sums to 0.
template <class T> double vector_sum(const T& vector) {
    const std::size_t numElements = vector.size();
    double sum = 0;
    for (std::size_t i = 0; i < numElements; i++) {
        sum += vector[i];
    }
    return sum;
}

// Copy every element of inVec except the one at index i into outVec,
// preserving order and starting at outVec.begin().
// No bounds are checked here: outVec must already hold at least
// inVec.size()-1 elements, and i must be a valid index into inVec.
inline void copy_except(int i, std::vector<double>& inVec, std::vector<double>& outVec) {
    const std::vector<double>::iterator skipped = inVec.begin() + i;
    // Elements in front of the skipped one keep their positions ...
    std::vector<double>::iterator writePos = std::copy(inVec.begin(), skipped, outVec.begin());
    // ... and everything behind it moves up by one slot.
    std::copy(skipped + 1, inVec.end(), writePos);
}

// Binomial coefficient "n choose k".
//
// Returns 0 when k > n. The product is built incrementally: after step i the
// accumulator holds C(n, i), so every division is exact.
//
// The accumulator is a 64-bit unsigned integer. With a signed int, an
// intermediate product above INT_MAX turned negative and the following
// division gave a wrong result, even when the final value fits in `unsigned`
// (e.g. C(30,15)). Results that do not fit in `unsigned` are still truncated
// by the return type.
inline unsigned nChoosek( unsigned n, unsigned k )
{
    if (k > n) return 0;
    if (k > n - k) k = n - k;   // symmetry C(n,k) == C(n,n-k); avoids computing k*2
    if (k == 0) return 1;

    unsigned long long result = n;
    for (unsigned i = 2; i <= k; ++i) {
        result *= (n - i + 1);
        result /= i;
    }
    return static_cast<unsigned>(result);
}

// Jackknife (leave-one-out) standard error of the values in `vector`.
//
// For each block in turn, the mean of all the other blocks is computed; the
// returned value is sqrt(((n-1)/n) * sum of squared deviations of these
// leave-one-out means from their own mean), where n is the number of blocks.
//
// Throws a C-string (const char*) when fewer than five blocks are supplied.
template <class T> double jackknive_std_err(T& vector) {
    const std::size_t numBlocks = vector.size();
    if (numBlocks < 5) {
        throw "WARNING: Fewer than five blocks to calculate jackknife!!";
    }

    // One leave-one-out mean per block
    std::vector<double> leaveOneOutMeans;
    for (std::vector<double>::size_type omitted = 0; omitted != numBlocks; omitted++) {
        // Fresh zero-filled buffer; copy_except overwrites all numBlocks-1 slots
        std::vector<double> remaining(numBlocks - 1);
        copy_except(omitted, vector, remaining);
        leaveOneOutMeans.push_back(vector_average(remaining));
    }

    const double meanOfMeans = vector_average(leaveOneOutMeans);
    double squaredDeviations = 0;
    for (std::vector<double>::const_iterator it = leaveOneOutMeans.begin(); it != leaveOneOutMeans.end(); ++it) {
        squaredDeviations += std::pow((*it - meanOfMeans), 2.0);
    }

    const std::size_t numMeans = leaveOneOutMeans.size();
    const double scale = static_cast<double>(numMeans - 1) / static_cast<double>(numMeans);
    return std::sqrt(scale * squaredDeviations);
}

// Per-variant allele counts and allele frequencies for every set (population /
// species) named in setsToPosMap, plus per-individual information.
//
// All frequency-like members start at -1.0 and all counters at 0; the
// get*() member functions (defined outside this header) fill them in.
class GeneralSetCounts {
public:
    // setsToPosMap: set name -> positions of the samples belonging to that set.
    // nSamples:     total number of samples; sizes the per-individual vectors.
    //
    // The scalar members are initialised in declaration order, and the two
    // likelihood-based averages are given the same -1.0 starting value as
    // averageAAF/averageDAF so that no member is left indeterminate.
    GeneralSetCounts(const std::map<string, std::vector<size_t>>& setsToPosMap, const int nSamples)
        : overall(0), AAint(AncestralAlleleMissing),
          averageAAF(-1.0), averageDAF(-1.0),
          averageAAFFromLikelihoods(-1.0), averageDAFFromLikelihoods(-1.0),
          likelihoodsProbabilitiesType(LikelihoodsProbabilitiesAbsent) {
        for (const auto& setEntry : setsToPosMap) {
            const string& setName = setEntry.first;

            // Counters start at zero
            setAltCounts[setName] = 0;
            setAlleleCounts[setName] = 0;
            setAlleleProbCounts[setName] = 0;

            // Frequencies and correction factors start at -1.0
            setAAFs[setName] = -1.0;
            setDAFs[setName] = -1.0;
            setAAFsFromLikelihoods[setName] = -1.0;
            setDAFsFromLikelihoods[setName] = -1.0;
            setPoolAAFs[setName] = -1.0;
            setPoolDAFs[setName] = -1.0;
            setCorrectionFactors[setName] = -1.0;

            // One size per set, in the map's iteration order
            setSizes.push_back(setEntry.second.size());

            // Three prior values per set, each starting at -1.0
            setHWEpriorsFromAAFfromGT[setName].assign(3, -1.0);
            setHWEpriorsFromDAFfromGT[setName].assign(3, -1.0);

            // Empty per-individual vectors for this set
            setGenotypes[setName] = std::vector<int>();
            setIndividualAFs[setName] = std::vector<double>();
            setIndividualExpectedGenotypes[setName] = std::vector<double>();
        }
        individualsWithVariant.assign(nSamples, 0);
        individualPoolAAFs.assign(nSamples, -1.0);
    }
    
    void getSetVariantCountsSimple(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);
    void getSetVariantCounts(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);
    
    int checkForGenotypeLikelihoodsOrProbabilities(const std::vector<std::string>& vcfLineFields);
    int findADtagPosition(const std::vector<std::string>& vcfLineFields);
    
    void getAFsFromGenotypeLikelihoodsOrProbabilities(const std::vector<std::string>& genotypeFields, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition);
    void getAFsFromADtag(const std::vector<std::string>& genotypeFields, const std::map<string, std::vector<size_t>>& setsToPosMap, const int ADTagPosition, const int minDepth);
    
    int overall; int AAint;
    std::map<string,std::vector<int>> setGenotypes;
    std::map<string,std::vector<double>> setIndividualAFs;
    std::map<string,std::vector<double>> setIndividualExpectedGenotypes;
    std::map<string,int> setAltCounts;
    std::map<string,int> setAlleleCounts; // The number of non-missing alleles for this set
    std::map<string,int> setAlleleProbCounts; // The number of non-missing alleles for this set in terms of likelihoods/probabilities
    std::vector<double> individualPoolAAFs;  // Allele frequency for each individual pool estimated from Allelic Depth (AD tag in VCF) - for pool-seq data
    std::map<string,double> setPoolAAFs; // The above individual pool values are then averaged if multiple pools form a set (i.e., a population or species)
    std::map<string,double> setPoolDAFs;
    
    std::vector<size_t> setSizes; std::map<string,double> setCorrectionFactors;
    std::map<string,double> setAAFs; double averageAAF;     // Allele frequencies - alternative allele
    std::map<string,double> setDAFs; double averageDAF;     // Allele frequencies - derived allele
    std::map<string,double> setAAFsFromLikelihoods; double averageAAFFromLikelihoods;   // Allele frequencies - alternative allele
    std::map<string,double> setDAFsFromLikelihoods; double averageDAFFromLikelihoods;   // Allele frequencies - derived allele
    std::vector<int> individualsWithVariant; // 0 homRef, 1 het, 2 homAlt
    int likelihoodsProbabilitiesType;
    // std::vector<int> set1individualsWithVariant; std::vector<int> set2individualsWithVariant;
    // std::vector<int> set3individualsWithVariant; std::vector<int> set4individualsWithVariant;
    

    int returnFormatTagPosition(std::vector<std::string>& format, const std::string& tag);
    void setHWEpriorsFromAFfromGT();
    std::vector<double> probabilitiesFromLikelihoods(const std::vector<double>& thisLikelihoods, const string& species);
    std::map<string,std::vector<double> > setHWEpriorsFromAAFfromGT;
    std::map<string,std::vector<double> > setHWEpriorsFromDAFfromGT;
    
private:
    void getBasicCounts(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);
    
    
};

// Split sets for the f_G statistic
//
// Extends GeneralSetCounts with a second family of per-set maps, each in a
// "split1" and a "split2" flavour (genotype-based, likelihood-based and
// pool-based). Frequencies start at -1.0 and counters at 0.
class GeneralSetCountsWithSplits : public GeneralSetCounts {
public:
    // Forwards both arguments to the base class, then creates one entry per
    // set name in every split map.
    GeneralSetCountsWithSplits(const std::map<string, std::vector<size_t>>& setsToPosMap, const int nSamples) : GeneralSetCounts(setsToPosMap,nSamples) {
        for (const auto& setEntry : setsToPosMap) {
            const string& setName = setEntry.first;

            // Genotype-based frequencies and counters
            setAAFsplit1[setName] = -1.0;
            setAAFsplit2[setName] = -1.0;
            setDAFsplit1[setName] = -1.0;
            setDAFsplit2[setName] = -1.0;
            setAlleleCountsSplit1[setName] = 0;
            setAlleleCountsSplit2[setName] = 0;
            setAltCountsSplit1[setName] = 0;
            setAltCountsSplit2[setName] = 0;

            // Likelihood/probability-based frequencies and counters
            setAAFsplit1fromLikelihoods[setName] = -1.0;
            setAAFsplit2fromLikelihoods[setName] = -1.0;
            setDAFsplit1fromLikelihoods[setName] = -1.0;
            setDAFsplit2fromLikelihoods[setName] = -1.0;
            setAlleleCountsSplit1fromLikelihoods[setName] = 0;
            setAlleleCountsSplit2fromLikelihoods[setName] = 0;

            // Pool-based frequencies
            setPoolAAFsplit1[setName] = -1.0;
            setPoolAAFsplit2[setName] = -1.0;
            setPoolDAFsplit1[setName] = -1.0;
            setPoolDAFsplit2[setName] = -1.0;
        }
    }

    // Genotype-based split counts and frequencies
    std::map<string,int> setAltCountsSplit1;
    std::map<string,int> setAltCountsSplit2;
    std::map<string,double> setAAFsplit1; // Alternative allele frequency
    std::map<string,double> setAAFsplit2;
    std::map<string,double> setDAFsplit1; // Derived allele frequency, in the complement of the set
    std::map<string,double> setDAFsplit2;
    std::map<string,int> setAlleleCountsSplit1; // Number of non-missing alleles for the complement of this set
    std::map<string,int> setAlleleCountsSplit2;

    // The same quantities derived from genotype likelihoods/probabilities
    std::map<string,double> setAAFsplit1fromLikelihoods; // Alternative allele frequency
    std::map<string,double> setAAFsplit2fromLikelihoods;
    std::map<string,double> setDAFsplit1fromLikelihoods; // Derived allele frequency, in the complement of the set
    std::map<string,double> setDAFsplit2fromLikelihoods;
    std::map<string,int> setAlleleCountsSplit1fromLikelihoods; // Number of non-missing alleles for the complement of this set
    std::map<string,int> setAlleleCountsSplit2fromLikelihoods;

    // Pool-based split frequencies
    std::map<string,double> setPoolAAFsplit1;
    std::map<string,double> setPoolDAFsplit1;
    std::map<string,double> setPoolAAFsplit2;
    std::map<string,double> setPoolDAFsplit2;

    void getSplitCountsNew(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);

    void getAFsFromADtagWithSplits(const std::vector<std::string>& genotypeFields, const std::map<string, std::vector<size_t>>& setsToPosMap, const int ADTagPosition, const int minDepth);
    void getAFsFromGenotypeLikelihoodsOrProbabilitiesWithSplits(const std::vector<std::string>& genotypeFields, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition, const int pos);

private:
    void getBasicCountsWithSplitsNew(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap);
    void getBasicCountsFromLikelihoodsOrProbabilities(const std::vector<std::string>& genotypes, const std::map<size_t, string>& posToSpeciesMap, const int likelihoodsOrProbabilitiesTagPosition);
};

class TrioDinfo {
public:
    TrioDinfo() {
        ABBAtotal = 0;
        BABAtotal = 0;
        BBAAtotal = 0;
        treeArrangement = 0; BBAAarrangement = 0; DminArrangement = 0;
        regionDs.resize(3); usedVars.resize(3); totalUsedVars.resize(3); numStrongVars.resize(3);
        linearStrongABBApos.resize(3);  linearStrongBABApos.resize(3);
        linearStrongABBAposStrongSitesOnly.resize(3); linearStrongBABAposStrongSitesOnly.resize(3);
        usedVars[0] = 0; usedVars[1] = 0; usedVars[2] = 0;
        totalUsedVars[0] = 0; totalUsedVars[1] = 0; totalUsedVars[2] = 0;
        numStrongVars[0] = 0; numStrongVars[1] = 0; numStrongVars[2] = 0;
        localD1num = 0; localD2num = 0; localD3num = 0;
        localD1denom = 0; localD2denom = 0; localD3denom = 0;
        F_d_denom1 = 0; F_d_denom1_reversed = 0; F_dM_denom1 = 0; F_dM_denom1_reversed = 0; F_G_denom1 = 0; F_G_denom1_reversed = 0;
        F_d_denom2 = 0; F_d_denom2_reversed = 0; F_dM_denom2 = 0; F_dM_denom2_reversed = 0; F_G_denom2 = 0; F_G_denom2_reversed = 0;
        F_d_denom3 = 0; F_d_denom3_reversed = 0; F_dM_denom3 = 0; F_dM_denom3_reversed = 0; F_G_denom3 = 0; F_G_denom3_reversed = 0;
    };
    
    // string P1; string P2; string P3;
    double ABBAtotal; double BABAtotal; double BBAAtotal;
    double D1; double D2; double D3; long double D1_p; long double D2_p; long double D3_p; double D1_Z; double D2_Z; double D3_Z;
    double F_d_denom1; double F_d_denom1_reversed; double F_dM_denom1; double F_dM_denom1_reversed; double F_G_denom1; double F_G_denom1_reversed;
    double F_d_denom2; double F_d_denom2_reversed; double F_dM_denom2; double F_dM_denom2_reversed; double F_G_denom2; double F_G_denom2_reversed;
    double F_d_denom3; double F_d_denom3_reversed; double F_dM_denom3; double F_dM_denom3_reversed; double F_G_denom3; double F_G_denom3_reversed;
    
    std::vector<std::vector<int>> linearStrongABBApos; // positions of strong (> 0.5) ABBA for the three tree orientations
    std::vector<std::vector<int>> linearStrongBABApos; // positions of strong (> 0.5) BABA for the three tree orientations
    
    std::vector<std::vector<int>> linearStrongABBAposStrongSitesOnly; // positions of strong (> 0.5) ABBA in a vector of only ABBA and BABA sites
    std::vector<std::vector<int>> linearStrongBABAposStrongSitesOnly; // positions of strong (> 0.5) BABA in a vector of only ABBA and BABA sites
    
    double localD1num; double localD2num; double localD3num;
    double localD1denom; double localD2denom; double localD3denom;
    std::vector<std::vector<double>> regionDs; // vector with three empty (double) vectors
    std::vector<int> usedVars; // 
Download .txt
gitextract_vdae3bmp/

├── Build/
│   └── README.md
├── D.cpp
├── D.h
├── Dmin.cpp
├── Dmin.h
├── Dmin_combine.cpp
├── Dmin_combine.h
├── Dquartets.cpp
├── Dquartets.h
├── Dsuite.cpp
├── Dsuite_common.cpp
├── Dsuite_common.h
├── Dsuite_fBranch.cpp
├── Dsuite_fBranch.h
├── Dsuite_utils.cpp
├── Dsuite_utils.h
├── KolmogorovSmirnovDist.cpp
├── KolmogorovSmirnovDist.hpp
├── Makefile
├── README.md
├── gzstream.cpp
├── gzstream.h
├── kstest.cpp
├── kstest.h
└── utils/
    ├── DtriosParallel
    ├── dtools.py
    └── setup.py
Download .txt
SYMBOL INDEX (221 symbols across 17 files)

FILE: D.cpp
  type option (line 41) | struct option
  type opt (line 49) | namespace opt
  function doAbbaBaba (line 63) | void doAbbaBaba() {
  function abbaBabaMain (line 250) | int abbaBabaMain(int argc, char** argv) {
  function parseAbbaBabaOptions (line 317) | void parseAbbaBabaOptions(int argc, char** argv) {

FILE: D.h
  function class (line 13) | class TestTrioInfo {

FILE: Dmin.cpp
  type option (line 51) | struct option
  type opt (line 68) | namespace opt
  function DminMain (line 89) | int DminMain(int argc, char** argv) {
  function parseDminOptions (line 586) | void parseDminOptions(int argc, char** argv) {

FILE: Dmin_combine.cpp
  type option (line 34) | struct option
  type opt (line 43) | namespace opt
  function DminCombineMain (line 54) | int DminCombineMain(int argc, char** argv) {
  function parseDminCombineOptions (line 231) | void parseDminCombineOptions(int argc, char** argv) {

FILE: Dquartets.cpp
  type option (line 40) | struct option
  type opt (line 53) | namespace opt
  function DquartetsMain (line 70) | int DquartetsMain(int argc, char** argv) {
  function parseDquartetsOptions (line 442) | void parseDquartetsOptions(int argc, char** argv) {

FILE: Dsuite.cpp
  function main (line 42) | int main(int argc, char **argv) {

FILE: Dsuite_common.cpp
  function string (line 46) | string makeHeader(bool quartet, bool includeFstats, bool includeKSstats) {
  function string (line 55) | string prepareOutFileRootString(const string& providedPrefix, const stri...
  function printMissingLikelihoodsWarning (line 62) | void printMissingLikelihoodsWarning(const string& chr, const string& pos) {
  function duplicateTreeValueError (line 67) | void duplicateTreeValueError(const string& duplicate) {
  function printInitialMessageTriosQuartets (line 73) | void printInitialMessageTriosQuartets(const int regionLengthOpt, const i...
  function assignTreeLevelsAndLinkToTaxa (line 79) | void assignTreeLevelsAndLinkToTaxa(string& treeLine, std::map<string,std...
  function assignNumLinesToAnalyse (line 124) | int assignNumLinesToAnalyse(const int providedNumLinesOpt, const int reg...

FILE: Dsuite_common.h
  function notEnoughPopulationsError (line 30) | inline void notEnoughPopulationsError(const int minPopulations) {
  function outgroupNeededError (line 35) | inline void outgroupNeededError(const string& setsFileName) {
  function outgroupNotUsedInQuartetsWarning (line 40) | inline void outgroupNotUsedInQuartetsWarning(const string& setsFileName) {
  function wrongNumberOfColumnsError (line 44) | inline void wrongNumberOfColumnsError(const string& setsFileName, int li...
  function lineEmptyError (line 50) | inline void lineEmptyError(const string& setsFileName, int lineNum) {
  function class (line 56) | class SetInformation {
  function reportProgessVCF (line 114) | inline void reportProgessVCF(const int variantsProcessed, const std::clo...
  function reportProgessVCF (line 119) | inline void reportProgessVCF(const int variantsProcessed, const int VCFl...

FILE: Dsuite_fBranch.cpp
  type option (line 35) | struct option
  type opt (line 43) | namespace opt
  function fBranchMain (line 53) | int fBranchMain(int argc, char** argv) {
  function parseFbranchOptions (line 182) | void parseFbranchOptions(int argc, char** argv) {

FILE: Dsuite_fBranch.h
  function class (line 20) | class Branch {
  function class (line 58) | class Tree {

FILE: Dsuite_utils.cpp
  function normalCDF (line 10) | long double normalCDF(double x) // Phi(-∞, x) aka N(x)
  function Fd_Denom_perVariant (line 15) | double Fd_Denom_perVariant(double p1, double p2, double p3, double pO) {
  function fG_Denom_perVariant (line 22) | double fG_Denom_perVariant(double p1, double p3a, double p3b, double pO) {
  function f4_perVariant (line 28) | double f4_perVariant(double p1, double p2, double p3, double p4) {
  function FdM_Denom_perVariant (line 33) | double FdM_Denom_perVariant(double p1, double p2, double p3, double pO) {
  function getExpectedGenotype (line 109) | double getExpectedGenotype(const std::vector<double>& thisProbabilities) {
  function transformFromPhred (line 114) | void transformFromPhred(std::vector<double>& thisLikelihoods) {
  function transformFromGL (line 121) | void transformFromGL(std::vector<double>& thisLikelihoods) {
  function calculateOneDs (line 626) | double calculateOneDs(double ABBAtotal, double BABAtotal) {
  function stringToDouble (line 651) | double stringToDouble(std::string s) {
  function stripExtension (line 660) | std::string stripExtension(const std::string& filename)
  function split (line 670) | void split(const std::string &s, char delim, std::vector<std::string> &e...
  function split (line 678) | std::vector<std::string> split(const std::string &s, char delim) {
  function splitToDouble (line 685) | void splitToDouble(const std::string &s, char delim, std::vector<double>...
  function splitToDouble (line 693) | std::vector<double> splitToDouble(const std::string &s, char delim) {
  function split2 (line 699) | std::vector<std::string> split2(std::string s, string delim) {
  function locateSet (line 713) | std::vector<size_t> locateSet(const std::vector<std::string>& sample_nam...
  function suffix (line 730) | std::string suffix(const std::string& seq, size_t len)
  function isGzip (line 737) | bool isGzip(const std::string& filename)
  function assertFileOpen (line 750) | void assertFileOpen(std::ifstream& fh, const std::string& fn)
  function assertFileOpen (line 759) | void assertFileOpen(std::ofstream& fh, const std::string& fn)
  function assertGZOpen (line 769) | void assertGZOpen(gzstreambase& gh, const std::string& fn)
  function checkGenotypesExist (line 778) | void checkGenotypesExist(const std::vector<std::string>& fields, const i...
  function file_exists (line 823) | bool file_exists(const std::string& name) {
  function assignSplits01FromAlleleFrequency (line 829) | void assignSplits01FromAlleleFrequency(const double p, double& splitA, d...

FILE: Dsuite_utils.h
  function string (line 93) | string numToString(T i) {
  function class (line 102) | class median_of_empty_list_exception:public std::exception{
  function copy_except (line 162) | inline void copy_except(int i, std::vector<double>& inVec, std::vector<d...
  function nChoosek (line 169) | inline unsigned nChoosek( unsigned n, unsigned k )
  function class (line 206) | class GeneralSetCounts {
  function class (line 269) | class GeneralSetCountsWithSplits : public GeneralSetCounts {
  function class (line 321) | class TrioDinfo {
  function addRegionDs (line 622) | void addRegionDs(const int arrangement) {
  function class (line 644) | class QuartetDinfo: public TrioDinfo {

FILE: KolmogorovSmirnovDist.cpp
  function log1p (line 63) | double log1p(double x)
  function getLogFactorial (line 155) | static double getLogFactorial (int n)
  function DeleteMatrixD (line 187) | static void DeleteMatrixD (double **T)
  function KSPlusbarAsymp (line 196) | static double KSPlusbarAsymp (int n, double x)
  function KSPlusbarUpper (line 214) | static double KSPlusbarUpper (int n, double x)
  function Pelz (line 280) | static double Pelz (int n, double x)
  function CalcFloorCeil (line 383) | static void CalcFloorCeil (
  function Pomeranz (line 442) | static double Pomeranz (int n, double x)
  function cdfSpecial (line 560) | static double cdfSpecial (int n, double x)
  function KScdf (line 595) | double KScdf (int n, double x)
  function fbarSpecial (line 620) | static double fbarSpecial (int n, double x)
  function KSfbar (line 651) | double KSfbar (int n, double x)
  function DurbinMatrix (line 708) | static double DurbinMatrix (int n, double d)
  function mMultiply (line 757) | static void mMultiply (double *A, double *B, double *C, int m)
  function renormalize (line 771) | static void renormalize (double *V, int m, int *p)
  function mPower (line 780) | static void mPower (double *A, int eA, double *V, int *eV, int m, int n)
  function main (line 816) | int main(void)

FILE: gzstream.cpp
  type GZSTREAM_NAMESPACE (line 41) | namespace GZSTREAM_NAMESPACE {
    function gzstreambuf (line 52) | gzstreambuf* gzstreambuf::open( const char* name, int open_mode) {
    function gzstreambuf (line 75) | gzstreambuf * gzstreambuf::close() {

FILE: gzstream.h
  function class (line 50) | class gzstreambuf : public std::streambuf {
  function class (line 79) | class gzstreambase : virtual public std::ios {

FILE: kstest.cpp
  function mMultiply (line 246) | void mMultiply(double *A,double *B,double *C,int m)
  function mPower (line 253) | void mPower(double *A,int eA,double *V,int *eV,int m,int n)
  function K (line 266) | double K(int n,double d)
  function ks_test (line 338) | double ks_test (std::list<int64_t> sample1, std::list<int64_t> sample2,
  function ks_test_of_uniformity (line 475) | double ks_test_of_uniformity(std::vector<double> sampleVect0to1, std::os...

FILE: utils/dtools.py
  function set_float_format (line 72) | def set_float_format(formatter):
  function format_node (line 140) | def format_node(node, node_type, format, dist_formatter=None,
  function print_supported_formats (line 204) | def print_supported_formats():
  class NewickError (line 213) | class NewickError(Exception):
    method __init__ (line 216) | def __init__(self, value):
  function read_newick (line 223) | def read_newick(newick, root_node=None, format=0, quoted_names=False):
  function _read_newick_from_string (line 264) | def _read_newick_from_string(nw, root_node, matcher, formatcode, quoted_...
  function _parse_extra_features (line 346) | def _parse_extra_features(node, NHX_string):
  function compile_matchers (line 359) | def compile_matchers(formatcode):
  function _read_node_data (line 403) | def _read_node_data(subnw, current_node, node_type, matcher, formatcode):
  function write_newick (line 444) | def write_newick(rootnode, features=None, format=1, format_root_node=True,
  function _get_features_string (line 479) | def _get_features_string(self, features=None):
  class TreeError (line 524) | class TreeError(Exception):
    method __init__ (line 529) | def __init__(self, value=''):
    method __str__ (line 532) | def __str__(self):
  class TreeNode (line 536) | class TreeNode(object):
    method _get_dist (line 577) | def _get_dist(self):
    method _set_dist (line 580) | def _set_dist(self, value):
    method _get_support (line 586) | def _get_support(self):
    method _set_support (line 589) | def _set_support(self, value):
    method _get_up (line 595) | def _get_up(self):
    method _set_up (line 598) | def _set_up(self, value):
    method _get_children (line 604) | def _get_children(self):
    method _set_children (line 607) | def _set_children(self, value):
    method _get_style (line 614) | def _get_style(self):
    method _set_style (line 620) | def _set_style(self, value):
    method _set_face_areas (line 635) | def _set_face_areas(self, value):
    method _get_face_areas (line 641) | def _get_face_areas(self):
    method __init__ (line 649) | def __init__(self, newick=None, format=0, dist=None, support=None,
    method __nonzero__ (line 672) | def __nonzero__(self):
    method __bool__ (line 675) | def __bool__(self):
    method __repr__ (line 683) | def __repr__(self):
    method __and__ (line 686) | def __and__(self, value):
    method __add__ (line 696) | def __add__(self, value):
    method __str__ (line 707) | def __str__(self):
    method __contains__ (line 712) | def __contains__(self, item):
    method __len__ (line 720) | def __len__(self):
    method __iter__ (line 724) | def __iter__(self):
    method add_feature (line 728) | def add_feature(self, pr_name, pr_value):
    method add_features (line 735) | def add_features(self, **features):
    method del_feature (line 742) | def del_feature(self, pr_name):
    method add_child (line 751) | def add_child(self, child=None, name=None, dist=None, support=None):
    method remove_child (line 778) | def remove_child(self, child):
    method add_sister (line 791) | def add_sister(self, sister=None, name=None, dist=None):
    method remove_sister (line 802) | def remove_sister(self, sister=None):
    method delete (line 820) | def delete(self, prevent_nondicotomic=True, preserve_branch_length=Fal...
    method detach (line 874) | def detach(self):
    method prune (line 889) | def prune(self, nodes, preserve_branch_length=False):
    method swap_children (line 1026) | def swap_children(self):
    method get_children (line 1037) | def get_children(self):
    method get_sisters (line 1043) | def get_sisters(self):
    method iter_leaves (line 1052) | def iter_leaves(self, is_leaf_fn=None):
    method get_leaves (line 1067) | def get_leaves(self, is_leaf_fn=None):
    method iter_leaf_names (line 1076) | def iter_leaf_names(self, is_leaf_fn=None):
    method get_leaf_names (line 1086) | def get_leaf_names(self, is_leaf_fn=None):
    method iter_descendants (line 1096) | def iter_descendants(self, strategy="levelorder", is_leaf_fn=None):
    method get_descendants (line 1107) | def get_descendants(self, strategy="levelorder", is_leaf_fn=None):
    method traverse (line 1117) | def traverse(self, strategy="levelorder", is_leaf_fn=None):
    method iter_prepostorder (line 1143) | def iter_prepostorder(self, is_leaf_fn=None):
    method _iter_descendants_postorder (line 1170) | def _iter_descendants_postorder(self, is_leaf_fn=None):
    method _iter_descendants_levelorder (line 1192) | def _iter_descendants_levelorder(self, is_leaf_fn=None):
    method _iter_descendants_preorder (line 1203) | def _iter_descendants_preorder(self, is_leaf_fn=None):
    method iter_ancestors (line 1218) | def iter_ancestors(self):
    method get_ancestors (line 1230) | def get_ancestors(self):
    method describe (line 1239) | def describe(self):
    method write (line 1258) | def write(self, features=None, outfile=None, format=0, is_leaf_fn=None,
    method get_tree_root (line 1306) | def get_tree_root(self):
    method get_common_ancestor (line 1315) | def get_common_ancestor(self, *target_nodes, **kargs):
    method iter_search_nodes (line 1377) | def iter_search_nodes(self, **conditions):
    method search_nodes (line 1393) | def search_nodes(self, **conditions):
    method get_leaves_by_name (line 1409) | def get_leaves_by_name(self, name):
    method is_leaf (line 1415) | def is_leaf(self):
    method is_root (line 1421) | def is_root(self):
    method get_distance (line 1433) | def get_distance(self, target, target2=None, topology_only=False):
    method get_farthest_node (line 1475) | def get_farthest_node(self, topology_only=False):
    method _get_farthest_and_closest_leaves (line 1518) | def _get_farthest_and_closest_leaves(self, topology_only=False, is_lea...
    method get_farthest_leaf (line 1546) | def get_farthest_leaf(self, topology_only=False, is_leaf_fn=None):
    method get_closest_leaf (line 1563) | def get_closest_leaf(self, topology_only=False, is_leaf_fn=None):
    method get_midpoint_outgroup (line 1581) | def get_midpoint_outgroup(self):
    method populate (line 1603) | def populate(self, size, names_library=None, reuse_names=False,
    method set_outgroup (line 1677) | def set_outgroup(self, outgroup):
    method unroot (line 1762) | def unroot(self):
    method _asciiArt (line 1830) | def _asciiArt(self, char1='-', show_internal=True, compact=False, attr...
    method get_ascii (line 1874) | def get_ascii(self, show_internal=True, compact=False, attributes=None):
    method ladderize (line 1889) | def ladderize(self, direction=0):
    method sort_descendants (line 1947) | def sort_descendants(self, attr="name"):
    method get_cached_content (line 1968) | def get_cached_content(self, store_attr=None, container_type=set, leav...
    method robinson_foulds (line 2029) | def robinson_foulds(self, t2, attr_t1="name", attr_t2="name",
    method compare (line 2200) | def compare(self, ref_tree, use_collateral=False, min_support_source=0...
    method _diff (line 2365) | def _diff(self, t2, output='topology', attr_t1='name', attr_t2='name',...
    method iter_edges (line 2388) | def iter_edges(self, cached_content=None):
    method get_edges (line 2403) | def get_edges(self, cached_content=None):
    method standardize (line 2414) | def standardize(self, delete_orphan=True, preserve_branch_length=True):
    method convert_to_ultrametric (line 2483) | def convert_to_ultrametric(self, tree_length=None, strategy='balanced'):
    method check_monophyly (line 2527) | def check_monophyly(self, values, target_attr, ignore_missing=False,
    method get_monophyletic (line 2622) | def get_monophyletic(self, values, target_attr):
    method expand_polytomies (line 2650) | def expand_polytomies(self, map_attr="name", polytomy_size_limit=5,
    method resolve_polytomy (line 2714) | def resolve_polytomy(self, default_dist=0.0, default_support=0.0,
    method add_face (line 2757) | def add_face(self, face, column, position="branch-right"):
    method from_parent_child_table (line 2785) | def from_parent_child_table(parent_child_table):
    method from_skbio (line 2826) | def from_skbio(skbio_tree, map_attributes=None):
    method phonehome (line 2866) | def phonehome(self):
  function _translate_nodes (line 2871) | def _translate_nodes(root, *nodes):
  class HsTree (line 2899) | class HsTree(TreeNode):
    method __init__ (line 2902) | def __init__(tree, *args, **kwa):
    method get_time (line 2937) | def get_time(node):
    method get_name (line 2942) | def get_name(tree):
    method write (line 2951) | def write(tree, **kwa):
    method add_mass_migration (line 2977) | def add_mass_migration(tree, source, destination, fraction, time):
    method add_property_to_nodes (line 2991) | def add_property_to_nodes(tree, property_name, property_node_dict):
    method add_properties_to_nodes (line 3031) | def add_properties_to_nodes(tree, properties, properties_node_dict):
    method get_nodes_at_time (line 3066) | def get_nodes_at_time(tree, time):
    method plot (line 3078) | def plot(tree, ax=None, style='orthogonal',
    method search_node_by_newick (line 3395) | def search_node_by_newick(tree, newick):
    method set_leaf_order (line 3401) | def set_leaf_order(tree, order, check_consistent=True):
    method set_outgroup (line 3430) | def set_outgroup(tree, outgroup, end_at_present=True):
    method reverse (line 3448) | def reverse(tree):
  function align_fbranch_with_tree (line 3454) | def align_fbranch_with_tree(fbranch, tree, outgroup, ladderize=False):
  function plot_fbranch (line 3487) | def plot_fbranch(fbranch, tree_no_outgroup, leaves_to_present=True,
  function main (line 3589) | def main():
Condensed preview — 27 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (478K chars).
[
  {
    "path": "Build/README.md",
    "chars": 71,
    "preview": "### The Build folder.\nHere will be the executable after compilation.\n\n\n"
  },
  {
    "path": "D.cpp",
    "chars": 19278,
    "preview": "//\n//  D.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 11/04/2019.\n//\n\n#include \"D.h\"\n#include \"Dsuite_common.h\"\n#i"
  },
  {
    "path": "D.h",
    "chars": 1740,
    "preview": "//\n//  D.h\n//  Dsuite\n//\n//  Created by Milan Malinsky on 11/04/2019.\n//\n\n#ifndef D_h\n#define D_h\n\n#include \"Dsuite_util"
  },
  {
    "path": "Dmin.cpp",
    "chars": 39858,
    "preview": "//\n//  Dmin.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#include \"Dmin.h\"\n#include \"Dsuite_common"
  },
  {
    "path": "Dmin.h",
    "chars": 241,
    "preview": "//\n//  Dmin.h\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#ifndef Dmin_h\n#define Dmin_h\n#include \"Dsu"
  },
  {
    "path": "Dmin_combine.cpp",
    "chars": 12513,
    "preview": "//\n//  Dmin_combine.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 11/04/2019.\n//\n\n#include \"Dmin_combine.h\"\n#includ"
  },
  {
    "path": "Dmin_combine.h",
    "chars": 285,
    "preview": "//\n//  Dmin_combine.h\n//  Dsuite\n//\n//  Created by Milan Malinsky on 11/04/2019.\n//\n\n#ifndef Dmin_combine_h\n#define Dmin"
  },
  {
    "path": "Dquartets.cpp",
    "chars": 29252,
    "preview": "//\n//  Dquartets.cpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 14/07/2020.\n//\n\n#include \"Dquartets.h\"\n#include"
  },
  {
    "path": "Dquartets.h",
    "chars": 273,
    "preview": "//\n//  Dquartets.h\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 14/07/2020.\n//\n\n#ifndef Dquartets_h\n#define Dquar"
  },
  {
    "path": "Dsuite.cpp",
    "chars": 2549,
    "preview": "//\n//  Dsuite.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#include <iostream>\n#include \"Dsuite_ut"
  },
  {
    "path": "Dsuite_common.cpp",
    "chars": 6819,
    "preview": "//\n//  Dsuite_common.cpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 21/07/2020.\n//\n\n#include \"Dsuite_common.h\"\n"
  },
  {
    "path": "Dsuite_common.h",
    "chars": 7223,
    "preview": "//\n//  Dsuite_common.h\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 21/07/2020.\n//\n\n#ifndef Dsuite_common_h\n#defi"
  },
  {
    "path": "Dsuite_fBranch.cpp",
    "chars": 14170,
    "preview": "//\n//  Dsuite_fBranch.cpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 11/11/2019.\n//\n\n#include \"Dsuite_fBranch.h"
  },
  {
    "path": "Dsuite_fBranch.h",
    "chars": 8850,
    "preview": "//\n//  Dsuite_fBranch.h\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 11/11/2019.\n//\n\n#ifndef Dsuite_fBranch_h\n#de"
  },
  {
    "path": "Dsuite_utils.cpp",
    "chars": 37066,
    "preview": "//\n//  Dsuite_utils.cpp\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#include \"Dsuite_utils.h\"\n\nlong d"
  },
  {
    "path": "Dsuite_utils.h",
    "chars": 42226,
    "preview": "//\n//  Dsuite_utils.h\n//  Dsuite\n//\n//  Created by Milan Malinsky on 02/04/2019.\n//\n\n#ifndef Dsuite_utils_h\n#define Dsui"
  },
  {
    "path": "KolmogorovSmirnovDist.cpp",
    "chars": 20487,
    "preview": "//\n//  KolmogorovSmirnovDist.cpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 30.10.22.\n//\n\n/********************"
  },
  {
    "path": "KolmogorovSmirnovDist.hpp",
    "chars": 2776,
    "preview": "//\n//  KolmogorovSmirnovDist.hpp\n//  DsuiteXcode\n//\n//  Created by Milan Malinsky on 30.10.22.\n//  Copyright © 2022 Mila"
  },
  {
    "path": "Makefile",
    "chars": 712,
    "preview": "\nCXXFLAGS=-std=c++11\nCXX=g++\nBIN := Build\nLDFLAGS=-lz\n\nall: $(BIN)/Dsuite\n\n$(BIN)/Dsuite: $(BIN)/Dsuite.o $(BIN)/Dsuite_"
  },
  {
    "path": "README.md",
    "chars": 29779,
    "preview": "#  Dsuite\nPublication:  \nMalinsky, M., Matschiner, M. and Svardal, H. (2021) Dsuite ‐ fast D‐statistics and related admi"
  },
  {
    "path": "gzstream.cpp",
    "chars": 5864,
    "preview": "// ============================================================================\n// gzstream, C++ iostream classes wrappi"
  },
  {
    "path": "gzstream.h",
    "chars": 4726,
    "preview": "// ============================================================================\n// gzstream, C++ iostream classes wrappi"
  },
  {
    "path": "kstest.cpp",
    "chars": 18662,
    "preview": "/**************************************************************************/\n/*    Copyright (C) 2006 Romain Michalec   "
  },
  {
    "path": "kstest.h",
    "chars": 1983,
    "preview": "/**************************************************************************/\n/*    Copyright (C) 2006 Romain Michalec   "
  },
  {
    "path": "utils/DtriosParallel",
    "chars": 18912,
    "preview": "#!/usr/bin/env python\n\n\"\"\"\nThis script automates parallelisation of Dsuite Dtrios/ Dsuite DtriosCombine.\nThis script was"
  },
  {
    "path": "utils/dtools.py",
    "chars": 135864,
    "preview": "#!/usr/bin/env python3\n\n# #START_LICENSE###########################################################\n#\n# Parts of the cod"
  },
  {
    "path": "utils/setup.py",
    "chars": 608,
    "preview": "from setuptools import setup                                                                                            "
  }
]

About this extraction

This page contains the full source code of the millanek/Dsuite GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 27 files (451.9 KB), approximately 120.8k tokens, and a symbol index with 221 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!