[
  {
    "path": ".gitmodules",
    "content": "[submodule \"src\"]\n\tpath = src\n\turl = https://github.com/tanghaibao/allhic\n"
  },
  {
    "path": "README.md",
    "content": "# ALLHiC\nALLHiC: phasing and scaffolding polyploid genomes based on Hi-C data  \nSee wiki for details (https://github.com/tangerzhang/ALLHiC/wiki).\n# Note\nPlease be aware that ALLHiC is no longer maintained. We recommend using two recently released algorithm packages developed by our team, which are reference-free and much faster: \n- **C-Phasing**: [C-Phasing GitHub Repository](https://github.com/wangyibin/CPhasing)  \n- **HapHiC**: [HapHiC GitHub Repository](https://github.com/zengxiaofei/HapHiC)\n"
  },
  {
    "path": "bin/ALLHiC_build",
    "content": "#!/usr/bin/perl -w\n\n# Build groups.agp and groups.asm.fasta from the *.tour files found in the\n# current directory. The contigs on the last line of each tour are joined in\n# order with 100-bp gaps; contigs used by no tour are written out unchanged.\n\ndie \"Usage: perl $0 refSeq.fasta\\n\" if(!(defined $ARGV[0]));\n\nprint \"1. tour format to agp ...\\n\";\n\nmy $refSeq = $ARGV[0];\nmy $gapLen = 100;\nmy $Nseq   = \"N\" x $gapLen;\nmy %anchordb;\nmy %seqdb;\nopen(IN, \"<\", $refSeq) or die \"Error: cannot open $refSeq: $!\\n\";\n{\n\t# Read one FASTA record per iteration. The record separator is localized\n\t# so that the tour files below are still read line by line.\n\tlocal $/ = '>';\n\t<IN>;\n\twhile(<IN>){\n\t\tchomp;\n\t\tmy ($ctg,$seq) = split(/\\n/,$_,2);\n\t\tnext if(!defined $ctg);\n\t\t$seq = \"\" if(!defined $seq);\n\t\t$ctg =~ s/\\s+.*//g;\n\t\t$seq =~ s/\\s+//g;\n\t\t$seqdb{$ctg} = $seq;\n\t\t}\n\t}\nclose IN;\n\nopen(OUT, \"> groups.agp\") or die\"Error: $!\";\nopen(SEQ, \"> groups.asm.fasta\") or die\"Error: $!\";\nwhile(my $tour = glob \"*.tour\"){\n\tprint \"Processing $tour ...\\n\";\n\tmy $gid = $tour;\n\t$gid    =~ s/\\.tour$//;   # strip only the literal \".tour\" suffix\n\n\t# The last non-blank line of a tour file holds the final contig order.\n\t# It is read in Perl rather than through `tail` so that file names with\n\t# spaces or shell metacharacters are safe.\n\topen(my $tfh, \"<\", $tour) or die \"Error: cannot open $tour: $!\\n\";\n\tmy $last_line = \"\";\n\twhile(my $line = <$tfh>){\n\t\t$last_line = $line if($line =~ /\\S/);\n\t\t}\n\tclose $tfh;\n\n\t# Keep only well-formed <contig><+|-> entries that name a known contig.\n\tmy @ctgdb;\n\tforeach my $item (split(' ', $last_line)){\n\t\tif($item =~ /^(.+)([+-])$/ and exists($seqdb{$1})){\n\t\t\tpush @ctgdb, [$1, $2];\n\t\t}else{\n\t\t\tprint STDERR \"WARNING: skipping unrecognized entry '$item' in $tour\\n\";\n\t\t\t}\n\t\t}\n\n\tmy $start   = 0;\n\tmy $end     = 0;\n\tmy $count   = 0;\n\tmy $fullSeq = \"\";\n\tforeach my $i (0..$#ctgdb){\n\t\tmy ($ctg, $dir) = @{$ctgdb[$i]};\n\t\tmy $len = length $seqdb{$ctg};\n\t\t$anchordb{$ctg}++;\n\t\t$start = $end + 1;\n\t\t$end   = $start + $len - 1;\n\t\t$count++;\n\t\tprint OUT \"$gid\t$start\t$end\t$count\tW\t$ctg\t1\t$len\t$dir\\n\";\n\t\tmy $seq = uc $seqdb{$ctg};\n\t\tif($dir eq \"-\"){\n\t\t\t# reverse-complement contigs placed on the minus strand\n\t\t\t$seq = reverse $seq;\n\t\t\t$seq =~ tr/ATGC/TACG/;\n\t\t\t}\n\t\t$fullSeq .= $seq;\n\t\tnext if($i == $#ctgdb);   # no gap after the last contig\n\t\t$start = $end + 1;\n\t\t$end   = $start + $gapLen - 1;\n\t\t$count++;\n\t\tprint OUT \"$gid\t$start\t$end\t$count\tU\t$gapLen\tcontig\tyes\tmap\\n\";\n\t\t$fullSeq .= $Nseq;\n\t\t}\n\tprint SEQ \">$gid\\n$fullSeq\\n\";\n\t}\n\n# Contigs that were not placed in any group become single-component objects.\nforeach my $ctg (sort keys %seqdb){\n\tnext if(exists($anchordb{$ctg}));\n\tmy $len = length $seqdb{$ctg};\n\tprint OUT \"$ctg\t1\t$len\t1\tW\t$ctg\t1\t$len\t+\\n\";\n\tprint SEQ \">$ctg\\n$seqdb{$ctg}\\n\";\n\t}\n\nclose OUT;\nclose SEQ;\n\n"
  },
  {
    "path": "bin/ALLHiC_corrector",
    "content": "#!/usr/bin/env python\n# ALLHiC_corrector: look for regions of a contig that are depleted of Hi-C\n# links (candidate mis-joins) and split the contigs at those regions.\nimport sys\nimport multiprocessing\nimport math\nimport numpy as np\nimport pysam\nimport time\nimport argparse\n\n\n# Print a message prefixed with the current time (HH:MM:SS, in green).\ndef time_print(str):\n\tprint(\"\\033[32m%s\\033[0m %s\"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())), str))\n\n\n# Parse the command-line options.\ndef get_opt():\n\tgroup = argparse.ArgumentParser()\n\tgroup.add_argument(\"-m\", \"--mapping\", help=\"Input mapping file\", required=True)\n\tgroup.add_argument(\"-r\", \"--reference\", help=\"Contig fasta file\", required=True)\n\tgroup.add_argument(\"-o\", \"--output\", help=\"Corrected fasta file\", required=True)\n\tgroup.add_argument(\"-p\", \"--percent\", type=float, help=\"Percent of the map to saturate, default is 0.95\", default=0.95)\n\tgroup.add_argument(\"-s\", \"--sensitive\", type=float, help=\"sensitivity to depletion score, default is 0.5\", default=0.5)\n\tgroup.add_argument(\"-q\", \"--mapq\", type=int, help=\"MAPQ of mapping lower bound, default is 1\", default=1)\n\tgroup.add_argument(\"-w\", \"--wide\", type=int, help=\"Resolution for first pass search of mismatches, default is 25000 bp\", default=25000)\n\tgroup.add_argument(\"-n\", \"--narrow\", type=int, help=\"Resolution for the precise mismatch localizaton, n<w default is 1000 bp\", default=1000)\n\tgroup.add_argument(\"-d\", \"--depletion\", type=int, help=\"The size of the region to aggregate the depletion score in the wide path, d >= 2*w, default is 100000 bp\", default=100000)\n\tgroup.add_argument(\"-t\", \"--threads\", type=int, help=\"Threads, default is 1\", default=1)\n\n\treturn group.parse_args()\n\n\n# Return {contig name: length} taken from the SQ records of the BAM header.\ndef get_ctg_len(bam):\n\tctg_len = {}\n\tfor item in bam.header[\"SQ\"]:\n\t\titem = dict(item)\n\t\tctg_len[item['SN']] = item['LN']\n\n\treturn ctg_len\n\n\n# Collect [pos1, pos2] for pairs whose two ends lie on the same contig and\n# whose MAPQ is at least min_mapq.\ndef get_pos_list(bam_fetch, min_mapq):\n\tpos_list = []\n\tfor line in bam_fetch:\n\t\tctg1 = line.reference_name\n\t\tctg2 = line.next_reference_name\n\t\tpos1 = line.reference_start\n\t\tpos2 = line.next_reference_start\n\t\tif pos1 == -1 or pos2 == -1 or ctg1 != ctg2 or line.mapq < min_mapq:\n\t\t\tcontinue\n\t\tpos_list.append([pos1, pos2])\n\t\n\treturn pos_list\n\n\n# Round positions down to bin_size and count the pairs per (bin1, bin2);\n# returns a list of [bin1, bin2, count].\ndef get_hic_list(pos_list, bin_size):\n\thic_db = {}\n\thic_list = []\n\n\tpos_mat = np.matrix(pos_list)\n\tpos_mat = pos_mat//bin_size*bin_size\n\t\n\tfor i in range(0, len(pos_mat)):\n\t\tkey = (pos_mat[i, 0], pos_mat[i, 1])\n\t\tif key not in hic_db:\n\t\t\thic_db[key] =0\n\t\thic_db[key] += 1\n\tfor key in hic_db:\n\t\thic_list.append([key[0], key[1], hic_db[key]])\n\n\treturn hic_list\n\n\n# Return the pct quantile of the off-diagonal counts, or -1 when there are none.\ndef calc_sat_level(hic_list, pct):\n\ttmp_list = []\n\tnan_cnt = 0\n\tfor i in range(0, len(hic_list)):\n\t\tif hic_list[i][0] != hic_list[i][1]:\n\t\t\tif math.isnan(hic_list[i][2]):\n\t\t\t\tnan_cnt += 1\n\t\t\t\ttmp_list.append(0)\n\t\t\telse:\n\t\t\t\ttmp_list.append(hic_list[i][2])\n\tif tmp_list == []:\n\t\treturn -1\n\ttmp_list = sorted(tmp_list)\n\tfor i in range(0, nan_cnt):\n\t\ttmp_list[i] = float('nan')\n\tif len(tmp_list) == 1:\n\t\treturn tmp_list[0]\n\telse:\n\t\tpos = pct*(len(tmp_list)+1)\n\t\tif pos<1:\n\t\t\treturn tmp_list[0]\n\t\telse:\n\t\t\tif pos >= len(tmp_list):\n\t\t\t\treturn tmp_list[-1]\n\t\t\telse:\n\t\t\t\td = pos-int(pos)\n\t\t\t\treturn tmp_list[int(pos)-1]+d*(tmp_list[int(pos)]-tmp_list[int(pos)-1])\n\n\n# For every link no longer than dep_size, add its count (capped at sat_level)\n# to each bin strictly between its two ends.\ndef precompute_dep_score(hic_list, bin_size, dep_size, sat_level):\n\tscore_db = {}\n\tfor s, e, val in hic_list:\n\t\tif math.isnan(val):\n\t\t\tcontinue\n\t\tif e-s>dep_size:\n\t\t\tcontinue\n\t\tif val >= sat_level:\n\t\t\tval = sat_level\n\t\tfor i in range(s+bin_size, e, bin_size):\n\t\t\tif i not in score_db:\n\t\t\t\tscore_db[i] = 0\n\t\t\tscore_db[i] += val\n\tpos = score_db.keys()\n\tif len(pos) != 0:\n\t\treturn score_db, min(pos), max(pos)\n\telse:\n\t\treturn score_db, 0, 0\n\n\n# Restrict the scores to the interior bins, filling missing bins with 0.\ndef get_sub_score_db(score_db, min_pos, max_pos, bin_size, dep_size):\n\tsub_score_db = {}\n\tfor i in range(min_pos+dep_size-2*bin_size, max_pos-dep_size+3*bin_size, bin_size):\n\t\tif i in score_db:\n\t\t\tsub_score_db[i] = score_db[i]\n\t\telse:\n\t\t\tsub_score_db[i] = 0\n\treturn sub_score_db\n\n\n# Return [start, end] runs of consecutive bins whose score is below thr.\ndef get_wide_mismatch(score_db, thr, bin_size):\n\ttmp_list = [[]]\n\tfor i in sorted(score_db):\n\t\tif score_db[i] < thr:\n\t\t\tif tmp_list[-1] == []:\n\t\t\t\ttmp_list[-1].append(i)\n\t\telse:\n\t\t\tif tmp_list[-1] != []:\n\t\t\t\ttmp_list[-1].append(i)\n\t\t\t\ttmp_list.append([])\n\tif len(tmp_list[-1]) == 1:\n\t\ttmp_list[-1].append(i+bin_size)\n\telif len(tmp_list[-1]) == 0:\n\t\tdel tmp_list[-1]\n\treturn tmp_list\n\t\n\n# Compute the depletion scores; return the low-score regions when is_wide is\n# true, otherwise the per-bin score dictionary.\ndef get_mismatch(hic_list, bin_size, dep_size, pct, sens, is_wide):\n\tsat_level = round(calc_sat_level(hic_list, pct), 5)\n\tif sat_level == -1:\n\t\treturn []\n\tthr = sens*sat_level*0.5*dep_size/bin_size*(dep_size/bin_size-1)\n\tscore_db, min_pos, max_pos = precompute_dep_score(hic_list, bin_size, dep_size, sat_level)\n\tif len(score_db) != 0:\n\t\tscore_db = get_sub_score_db(score_db, min_pos, max_pos, bin_size, dep_size)\n\tif is_wide:\n\t\tif len(score_db) != 0:\n\t\t\twide_mismatch = get_wide_mismatch(score_db, thr, bin_size)\n\t\telse:\n\t\t\twide_mismatch = []\n\t\treturn wide_mismatch\n\telse:\n\t\treturn score_db\n\n\n# Narrow each wide region down to the narrow bins holding the minimum score;\n# returns wide_list unchanged when no narrow bin qualifies.\ndef merge_region(wide_list, narrow_score, bin_size):\n\tidx_wide = 0\n\tmin_val = 0\n\ttmp_list = []\n\tif narrow_score == {}:\n\t\treturn wide_list\n\tfor pos in sorted(narrow_score):\n\t\tif idx_wide >= len(wide_list):\n\t\t\tbreak\n\t\tif pos <= wide_list[idx_wide][0]:\n\t\t\tmin_val = narrow_score[pos]\n\t\telse:\n\t\t\tif narrow_score[pos] < min_val:\n\t\t\t\tmin_val = narrow_score[pos]\n\t\tif pos+bin_size <= wide_list[idx_wide][0]:\n\t\t\tcontinue\n\t\tif pos >= wide_list[idx_wide][1]:\n\t\t\tfor i in range(wide_list[idx_wide][0], wide_list[idx_wide][1], bin_size):\n\t\t\t\tif i in narrow_score and narrow_score[i] == min_val:\n\t\t\t\t\ttmp_list.append([i, i+bin_size])\n\t\t\tidx_wide += 1\n\tif idx_wide < len(wide_list):\n\t\tfor i in range(wide_list[idx_wide][0], wide_list[idx_wide][1], bin_size):\n\t\t\t\tif i in narrow_score and narrow_score[i] == min_val:\n\t\t\t\t\ttmp_list.append([i, i+bin_size])\n\tif tmp_list == []:\n\t\treturn wide_list\n\tnarrow_mismatch = []\n\tlast_e = 0\n\tfor s, e in tmp_list:\n\t\tif last_e == 0:\n\t\t\tnarrow_mismatch.append([s])\n\t\t\tlast_e = e\n\t\telse:\n\t\t\tif s != last_e:\n\t\t\t\tnarrow_mismatch[-1].append(last_e)\n\t\t\t\tnarrow_mismatch.append([s])\n\t\tlast_e = e\n\t\n\tnarrow_mismatch[-1].append(last_e)\n\n\treturn narrow_mismatch\n\t\t\n\n# Run the wide and then the narrow search for one contig; returns the list of\n# [start, end] regions to cut out, or [] when nothing was found.\ndef pipeline(in_bam, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, ctg):\n\ttime_print(\"\\tContig: %s Getting mapping list\"%ctg)\n\twith pysam.AlignmentFile(in_bam, 'rb') as bam:\n\t\tmapping_list = get_pos_list(bam.fetch(contig=ctg), mapq)\n\t\n\tif mapping_list == []:\n\t\ttime_print(\"\\tContig: %s Could not found mapping list\"%ctg)\n\t\treturn []\n\t\n\ttime_print(\"\\tContig: %s Getting hic list with bin size: %d\"%(ctg, bin_size))\n\thic_list = get_hic_list(mapping_list, bin_size)\n\n\ttime_print(\"\\tContig: %s Getting wide mismatch\"%ctg)\n\twide_mismatch = get_mismatch(hic_list, bin_size, dep_size, percent, sensitive, True)\n\tif wide_mismatch == []:\n\t\ttime_print(\"\\tContig: %s Could not found mismatch\"%ctg)\n\t\treturn []\n\t\n\tdep_size = bin_size\n\tbin_size = narrow_bin_size\n\n\ttime_print(\"\\tContig: %s Getting narrow score with bin size: %d\"%(ctg, bin_size))\n\thic_list = get_hic_list(mapping_list, bin_size)\n\n\tnarrow_score = get_mismatch(hic_list, bin_size, dep_size, percent, sensitive, False)\n\t\n\ttime_print(\"\\tContig: %s Getting narrow mismatch\"%ctg)\n\tnarrow_mismatch = merge_region(wide_mismatch, narrow_score, bin_size)\n\tif narrow_mismatch == wide_mismatch:\n\t\ttime_print(\"\\tContig: %s Wide mismatch without update\"%ctg)\n\treturn narrow_mismatch\n\n\n# Find the mismatch regions of every contig in parallel, then write the\n# reference to out_fa with each affected contig split into fragments named\n# <contig>_<start>_<end> (1-based, inclusive).\ndef ALLHiC_correct(in_bam, in_fa, out_fa, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, thread):\n\ttime_print(\"Reading mapping\")\n\t# The with-block closes the BAM handle; the workers reopen the file.\n\twith pysam.AlignmentFile(in_bam, 'rb') as bam:\n\t\tctg_len = get_ctg_len(bam)\n\n\ttime_print(\"Running pipeline\")\n\tpool = multiprocessing.Pool(processes=thread)\n\tres = []\n\tfor ctg in ctg_len:\n\t\tr = pool.apply_async(pipeline, (in_bam, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, ctg,))\n\t\tres.append([ctg, r])\n\tpool.close()\n\tpool.join()\n\t\n\tnarrow_mismatch = {}\n\tfor ctg, r in res:\n\t\tsub_mismatch = r.get()\n\t\tif sub_mismatch != []:\n\t\t\tnarrow_mismatch[ctg] = sub_mismatch\n\t\n\ttime_print(\"Found all mismatches\")\n\t\n\ttime_print(\"Reading origin fasta\")\n\tfa_db = {}\n\tid = None\n\twith open(in_fa, 'r') as fin:\n\t\tfor line in fin:\n\t\t\tif line[0] == '>':\n\t\t\t\tid = line.strip().split()[0][1:]\n\t\t\t\tfa_db[id] = []\n\t\t\telif id is not None:\n\t\t\t\t# sequence lines before the first header are ignored\n\t\t\t\tfa_db[id].append(line.strip())\n\t\n\tfor id in fa_db:\n\t\tfa_db[id] = ''.join(fa_db[id])\n\t\n\ttime_print(\"Writing result\")\n\twith open(out_fa, 'w') as fout:\n\t\tfor ctg in sorted(fa_db):\n\t\t\tif ctg in narrow_mismatch:\n\t\t\t\tbase = 0\n\t\t\t\tfor s, e in narrow_mismatch[ctg]:\n\t\t\t\t\ts = s-1\n\t\t\t\t\te = e-1\n\t\t\t\t\tfout.write(\">%s_%d_%d\\n%s\\n\"%(ctg, base+1, s, fa_db[ctg][base: s]))\n\t\t\t\t\tfout.write(\">%s_%d_%d\\n%s\\n\"%(ctg, s+1, e, fa_db[ctg][s: e]))\n\t\t\t\t\tbase = e\n\t\t\t\tif base < len(fa_db[ctg]):\n\t\t\t\t\t# base is a 0-based offset, so the 1-based start is base+1,\n\t\t\t\t\t# matching the fragment names written above\n\t\t\t\t\tfout.write(\">%s_%d_%d\\n%s\\n\"%(ctg, base+1, len(fa_db[ctg]), fa_db[ctg][base:]))\n\t\t\telse:\n\t\t\t\tfout.write(\">%s\\n%s\\n\"%(ctg, fa_db[ctg]))\n\t\n\ttime_print(\"Finished\")\n\n\nif __name__ == \"__main__\":\n\topts = get_opt()\n\tin_bam = opts.mapping\n\tin_fa = opts.reference\n\tout_fa = opts.output\n\tmapq = opts.mapq\n\tpercent = opts.percent\n\tsensitive = opts.sensitive\n\tdep_size = opts.depletion\n\tbin_size = opts.wide\n\tnarrow_bin_size = opts.narrow\n\tthread = opts.threads\n\tALLHiC_correct(in_bam, in_fa, out_fa, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, thread)\n\n\n"
  },
  {
    "path": "bin/ALLHiC_partition",
    "content": "#!/usr/bin/perl -w\n\n# ALLHiC_partition: run \"allhic extract\" on a pruned BAM file and then\n# \"allhic partition\" to split the contigs into K groups.\n\nuse Getopt::Std;\ngetopts \"b:r:e:k:m:\";\n\n\nif ( (!defined $opt_r)|| (!defined $opt_e)|| (!defined $opt_k)) {\n    die \"************************************************************************\n    Usage: ALLHiC_partition -r draft.asm.fasta -e enzyme_sites -k Num of groups\n      -h : help and usage.\n      -b : pruned bam (optional, default prunning.bam)\n      -r : draft.asm.fasta\n      -e : enzyme_sites (HindIII: AAGCTT; MboI: GATC, Arima)\n      -k : number of groups (user defined K value)\n      -m : minimum number of restriction sites (default, 25)\n************************************************************************\\n\";\n}\n\nmy $bam     = (defined $opt_b)?$opt_b:\"prunning.bam\";\nmy $refSeq  = $opt_r;\nmy $esites  = uc $opt_e;\n$esites     = \"AAGCTT\" if($esites eq \"HINDIII\");\n$esites     = \"GATC\" if($esites eq \"MBOI\");\n\n\nmy $K       = $opt_k;\nmy $minRes  = (defined $opt_m)?$opt_m:25;\nmy $runcmd  = \"\";\nprint \"Extract function: calculate an empirical distribution of Hi-C link size based on intra-contig links\\n\";\nif ($esites eq \"ARIMA\") {\n  $runcmd     = \"allhic extract \".$bam.\" \".$refSeq.\" --RE='GATCGATC,GANTGATC,GANTANTC,GATCANTC'\";\n  $esites = \"GATCGATC_GANTGATC_GANTANTC_GATCANTC\";\n}\nelse {\n  $runcmd     = \"allhic extract \".$bam.\" \".$refSeq.\" --RE \".$esites;\n}\nprint \"CMD: $runcmd\\n\";\n# Stop here if extract failed: partition needs the files it produces.\nsystem($runcmd) == 0 or die \"Error: '$runcmd' failed (status $?)\\n\";\n\n\nprint \"Partition contigs based on prunning bam file\\n\";\n# The counts and pairs files are named after the BAM file without its \".bam\"\n# extension. Strip only that literal suffix: the former s/.bam//g removed any\n# character followed by \"bam\" anywhere in the path.\n(my $prefix = $bam) =~ s/\\.bam$//;\nmy $counts_file = $prefix.\".counts_\".$esites.\".txt\";\nmy $pairs_file  = $prefix.\".pairs.txt\";\n$runcmd         = \"allhic partition $counts_file $pairs_file \".$K.\" --minREs \".$minRes;\nprint \"CMD: $runcmd\\n\";\nsystem($runcmd) == 0 or die \"Error: '$runcmd' failed (status $?)\\n\";\n\n\n"
  },
  {
    "path": "bin/ALLHiC_pip.sh",
    "content": "#!/bin/bash\n\n# ALLHiC_pip.sh: run the whole ALLHiC pipeline - map the Hi-C reads, correct\n# the contigs, map again, filter, partition, optimize, build and plot.\n\n# Print the help text and stop with a non-zero status: usage is only reached\n# for an invalid option or a missing required argument.\nusage()\n{\n\techo \"    Usage: `basename $0` -r reference -1 R1.fq -2 R2.fq -k group_count [-e enzyme] [-t threads] [-b bin_size]\"\n\techo \"          -r: reference genome\"\n\techo \"          -1: Lib_R1.fq.gz\"\n\techo \"          -2: Lib_R2.fq.gz\"\n\techo \"          -k: group_count\"\n\techo \"          -e: enzyme_sites (HindIII: AAGCTT; MboI: GATC), default: HindIII\"\n\techo \"          -t: threads, default: 10\"\n\techo \"          -b: bin_size for hic heatmap, can be divided with comma, default: 500k\"\n\texit 1\n}\n\n### get options\nwhile getopts ':r:1:2:k:e:t:b:' OPT; do\n\tcase $OPT in\n\t\tr)\n\t\t\tref=\"$OPTARG\";;\n\t\t1)\n\t\t\tR1=\"$OPTARG\";;\n\t\t2)\n\t\t\tR2=\"$OPTARG\";;\n\t\te)\n\t\t\tenzyme=\"$OPTARG\";;\n\t\tk)\n\t\t\tgroup_count=\"$OPTARG\";;\n\t\tt)\n\t\t\tthreads=\"$OPTARG\";;\n\t\tb)\n\t\t\tbin_size=\"$OPTARG\";;\n\t\t?)\n\t\t\tusage;;\n\tesac\ndone\nbwa=\"bwa\"\n\n### check required variants (quoted so paths containing spaces are tested correctly)\nif [ -z \"$ref\" ] || [ -z \"$R1\" ] || [ -z \"$R2\" ] || [ -z \"$group_count\" ]; then\n\tusage\nfi\n\n### set default values while optional variants were not set\nif [ -z \"$threads\" ]; then\n\tthreads=10\nfi\n\nif [ -z \"$bin_size\" ]; then\n\tbin_size=500k\nfi\n\nif [ -z \"$enzyme\" ]; then\n\tenzyme=AAGCTT\nfi\n\nenzyme=`echo \"$enzyme\" | tr '[a-z]' '[A-Z]'`\n\nif [ \"$enzyme\" = HINDIII ]; then\n\tenzyme=AAGCTT\nfi\n\nif [ \"$enzyme\" = MBOI ]; then\n\tenzyme=GATC\nfi\n\n### link required files\nln -s \"${ref}\" ./seq.fasta\nln -s \"${R1}\" ./Lib_R1.fastq.gz\nln -s \"${R2}\" ./Lib_R2.fastq.gz\n\n### index reference genome\nbwa index seq.fasta\nsamtools faidx seq.fasta\n\n\n### 1st round of mapping\nbwa mem -SP5M -t $threads seq.fasta Lib_R1.fastq.gz Lib_R2.fastq.gz \\\n     | samtools view -hF 256 - \\\n     | samtools sort -@ $threads -o sorted.bam -T tmp.ali\nsamtools index sorted.bam\n\n### correct contig\nALLHiC_corrector -m sorted.bam -r seq.fasta -o seq.HiCcorrected.fasta -t $threads\n\n### 2nd round of mapping\nbwa index seq.HiCcorrected.fasta\nsamtools faidx seq.HiCcorrected.fasta\nbwa mem -SP5M -t $threads seq.HiCcorrected.fasta Lib_R1.fastq.gz Lib_R2.fastq.gz \\\n     | samtools view -hF 256 - \\\n     | samtools sort -@ $threads -o sample.bwa_mem.bam -T tmp.ali\n\n\n### filter bam\nsamtools view -bq 40 sample.bwa_mem.bam  |samtools view -bt seq.HiCcorrected.fasta.fai > sample.unique.bam\nPreprocessSAMs.pl sample.unique.bam seq.HiCcorrected.fasta $enzyme\n\n### partition\nALLHiC_partition -r seq.HiCcorrected.fasta -e $enzyme -k $group_count -b sample.unique.REduced.paired_only.bam\n\n### optimize\n# -f: cmd.list does not exist yet on a first run\nrm -f cmd.list\nfor((K=1;K<=$group_count;K++));do echo \"allhic optimize sample.unique.REduced.paired_only.counts_${enzyme}.${group_count}g${K}.txt sample.unique.REduced.paired_only.clm\" >> cmd.list;done\nParaFly -c cmd.list -CPU $threads\n\n### build\nALLHiC_build seq.HiCcorrected.fasta\n\n### plot\nsamtools faidx groups.asm.fasta\ncut -f1,2 groups.asm.fasta.fai|grep sample > chrn.list\n# ALLHiC_plot takes named options (-b, -a and -l are required) and writes the\n# PDF heatmaps into its default output directory, ./workdir.\nALLHiC_plot -b sample.bwa_mem.bam -a groups.agp -l chrn.list -s \"$bin_size\"\n\n\n"
  },
  {
    "path": "bin/ALLHiC_plot",
    "content": "#!/usr/bin/env python\nimport argparse\nimport numpy as np\nimport matplotlib as mpl\nmpl.use(\"Agg\")\nimport matplotlib.pyplot as plt\nimport pysam\nimport time\nimport os\n\n\ndef time_print(info):\n    print(\"\\033[32m%s\\033[0m %s\"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())), info))\n\n\ndef get_opts():\n    groups = argparse.ArgumentParser()\n    groups.add_argument('-b', '--bam', help='Input bam file', required=True)\n    groups.add_argument('-a', '--agp', help='Input AGP file', required=True)\n    groups.add_argument('-l', '--list', help='Chromosome list, contain: ID\\tLength', required=True)\n    groups.add_argument('-n', '--npz', help=\"npz file of hic signal, optional, if not exist, it will be generate after reading hic signals, or it will be loaded for drawing other resolution of heatmap\", default=\"\")\n    groups.add_argument('-m', '--min_size', help=\"Minium bin size of heatmap, default=50k\", default=\"50k\")\n    groups.add_argument('-s', '--size', help=\"Bin size of heatmap, can be a list separated by comma, default=500k, notice: it must be n times of min_size (n is integer) or we will ajust it to nearest one\", default=\"500k\")\n    groups.add_argument('-o', '--outdir', help='Output directory, default=workdir', default='workdir')\n\n    return groups.parse_args()\n\n\n# Get chromosome length\ndef get_chr_len(chr_list):\n    chr_len_db = {}\n    chr_order = []\n    with open(chr_list, 'r') as f_in:\n        for line in f_in:\n            if line.strip() == '':\n                continue\n            data = line.strip().split()\n            chr_order.append(data[0])\n            chr_len_db[data[0]] = int(data[1])\n    return chr_len_db, chr_order\n\n\n# Calc read counts on each bin\ndef calc_read_count_per_min_size(chr_len_db, chr_order, bam, agp, min_size):\n    long_bin_size=min_size\n    read_count_whole_genome = {}\n    \n    bin_offset = [0 for i in range(0, len(chr_order)+1)]\n    bin_count = [0 for i in 
range(0, len(chr_order)+1)]\n    total_bin_count = 0\n    \n    for chrn in chr_len_db:\n        bin_count_of_chr = int(round((chr_len_db[chrn]*1.0/long_bin_size+0.51)))\n        total_bin_count += bin_count_of_chr\n        bin_count[chr_order.index(chrn)+1] = bin_count_of_chr\n    \n    for i in range(1, len(bin_count)):\n        bin_offset[i] = bin_count[i]+bin_offset[i-1]\n    read_count_whole_genome = [[0 for i in range(0, total_bin_count)] for j in range(0, total_bin_count)]\n    \n    ctg_on_chr = {}\n    with open(agp, 'r') as f_in:\n        for line in f_in:\n            if line.strip() == '' or line.strip().startswith('#'):\n                continue\n            data = line.strip().split()\n            if data[4] == 'U':\n                continue\n            chrn = data[0]\n            start_pos = int(data[1])\n            end_pos = int(data[2])\n            ctg = data[5].replace('_pilon', '')\n            direct = data[-1]\n            ctg_on_chr[ctg] = [chrn, start_pos, end_pos, direct]\n\n    with pysam.AlignmentFile(bam, 'rb') as fin:\n        for line in fin:\n            if line.is_unmapped or line.mate_is_unmapped:\n                continue\n            ctg1 = line.reference_name\n            ctg2 = line.next_reference_name\n            read_pos1 = line.reference_start+1\n            read_pos2 = line.next_reference_start+1\n\n            if ctg1 not in ctg_on_chr or ctg2 not in ctg_on_chr:\n                continue\n            chrn1, ctg_start_pos1, ctg_end_pos1, ctg_direct1 = ctg_on_chr[ctg1]\n            chrn2, ctg_start_pos2, ctg_end_pos2, ctg_direct2 = ctg_on_chr[ctg2]\n            if ctg_direct1 == '+':\n                converted_pos1 = ctg_start_pos1 + read_pos1 - 1\n            else:\n                converted_pos1 = ctg_end_pos1 - read_pos1 + 1\n            if ctg_direct2 == '+':\n                converted_pos2 = ctg_start_pos2 + read_pos2 - 1\n            else:\n                converted_pos2 = ctg_end_pos2 - read_pos2 + 1\n            if 
chrn1 not in chr_len_db or chrn2 not in chr_len_db:\n                continue\n            pos1_index = int(converted_pos1/long_bin_size)\n            pos2_index = int(converted_pos2/long_bin_size)\n            \n            chr1_index = chr_order.index(chrn1)\n            chr2_index = chr_order.index(chrn2)\n            \n            whole_pos1 = bin_offset[chr1_index] + pos1_index\n            whole_pos2 = bin_offset[chr2_index] + pos2_index\n            try:\n                read_count_whole_genome[whole_pos1][whole_pos2] += 1\n                read_count_whole_genome[whole_pos2][whole_pos1] += 1\n            except Exception:\n                time_print(\"Index error on whole genome: index1: %d, index2: %d, bin counts: %d\"%(whole_pos1, whole_pos2, total_bin_count))\n    \n    return np.array(bin_offset), np.array(read_count_whole_genome)\n\n\ndef draw_heatmap(read_count_whole_genome_min_size, bin_offset_min_size, ratio, chr_order, min_size):\n    bin_size = str(int(ratio*min_size))\n    if bin_size[-9:] == '000000000':\n        short_bin_size = bin_size[:-9]+'G'\n    elif bin_size[-6:] == '000000':\n        short_bin_size = bin_size[:-6]+'M'\n    elif bin_size[-3:] == '000':\n        short_bin_size = bin_size[:-3]+'K'\n\n    total_cnt = len(read_count_whole_genome_min_size)\n    ratio_cnt = int(round(total_cnt*1.0/ratio+0.51, 0))\n    plt_cnt = int(total_cnt*1.0/ratio)\n\n    data = read_count_whole_genome_min_size\n    \n    data = np.pad(data, ((0, ratio_cnt*ratio-total_cnt), (0, ratio_cnt*ratio-total_cnt)), 'constant', constant_values=0)\n    data = data.reshape(-1, ratio_cnt, ratio).sum(axis=2)\n    data = data.reshape(ratio_cnt, -1, ratio_cnt).sum(axis=1)\n\n    fn = \"%s_Whole_genome.pdf\"%short_bin_size\n    cmap = plt.get_cmap(\"YlOrRd\")\n    cmap.set_over('black')\n    ax = plt.gca()\n    with np.errstate(divide='ignore'):\n        hmap = ax.imshow(np.log2(data[: plt_cnt, : plt_cnt]), interpolation='nearest', origin='lower',cmap=cmap, 
aspect='equal')\n    plt.colorbar(mappable=hmap, cax=None, ax=None, shrink=0.5)\n    plt.tick_params(labelsize=6)\n    for ticks in ax.get_xticklabels():\n        ticks.set_rotation(90)\n    for ticks in ax.get_yticklabels():\n        ticks.set_rotation(0)\n    title = 'Whole_genome_'+short_bin_size\n    plt.xlabel(\"Bins (\"+short_bin_size.lower()+\"b per bin)\", fontsize=8)\n    plt.xticks([])\n    plt.yticks([])\n    plt.title(title, y=1.01, fontsize=12)\n    plt.savefig(fn, bbox_inches='tight', dpi=200)\n    plt.close('all')\n\n    chr_cnt = len(chr_order)\n    row_cnt = int(round(np.sqrt(chr_cnt)+0.51))\n    col_cnt = int(round(chr_cnt*1.0/row_cnt+0.51))\n    all_fn = '%s_all_chrs.pdf'%short_bin_size\n    plt.figure(figsize=(col_cnt*2, row_cnt*2))\n    idx = 1\n    for chrn in chr_order:\n        sr = bin_offset_min_size[idx-1]\n        er = bin_offset_min_size[idx]\n        sub_data = read_count_whole_genome_min_size[sr: er, sr: er]\n        total_cnt = len(sub_data)\n        ratio_cnt = int(round(total_cnt*1.0/ratio+0.51, 0))\n        plt_cnt = int(total_cnt*1.0/ratio)\n\n        sub_data = np.pad(sub_data, ((0, ratio_cnt*ratio-total_cnt), (0, ratio_cnt*ratio-total_cnt)), 'constant', constant_values=0)\n        sub_data = sub_data.reshape(-1, ratio_cnt, ratio).sum(axis=2)\n        sub_data = sub_data.reshape(ratio_cnt, -1, ratio_cnt).sum(axis=1)\n\n        plt.subplot(row_cnt, col_cnt, idx)\n        ax = plt.gca()\n        cmap = plt.get_cmap('YlOrRd')\n        cmap.set_over('black')\n        with np.errstate(divide='ignore'):\n            hmap = ax.imshow(np.log2(sub_data[: plt_cnt, : plt_cnt]), interpolation='nearest', origin='lower', cmap=cmap, aspect='equal')\n        plt.colorbar(mappable=hmap, cax=None, ax=None, shrink=0.5)\n        plt.tick_params(labelsize=5)\n        plt.title(chrn)\n        idx += 1\n    \n    plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)\n    plt.savefig(all_fn, bbox_inches='tight', 
dpi=200)\n    plt.close('all')\n\n\ndef ALLHiC_plot(bam, agp, chrlist, npzfile, minsize, binsize, outdir):\n    bam = os.path.abspath(bam)\n    agp = os.path.abspath(agp)\n    chrlist = os.path.abspath(chrlist)\n    if npzfile != \"\":\n        npzfile = os.path.abspath(npzfile)\n\n    if not os.path.exists(outdir):\n        os.mkdir(outdir)\n    os.chdir(outdir)\n\n    min_size = minsize.upper()\n    min_size = min_size.replace('K', '000')\n    min_size = min_size.replace('M', '000000')\n    min_size = min_size.replace('G', '000000000')\n    min_size = int(min_size)\n\n    bin_list = binsize.split(',')\n    bin_ratio = []\n    for bin_size in bin_list:\n        long_bin_size = bin_size.upper()\n        long_bin_size = long_bin_size.replace('K', '000')\n        long_bin_size = long_bin_size.replace('M', '000000')\n        long_bin_size = long_bin_size.replace('G', '000000000')\n        long_bin_size = int(long_bin_size)\n        bin_ratio.append(int(round(long_bin_size/min_size+0.01, 0)))\n        \n    \n    time_print(\"Step1: Get chromosome length\")\n    chr_len_db, chr_order = get_chr_len(chrlist)\n\n    time_print(\"Step2: Get signal matrix\")\n    if npzfile != \"\" and os.path.exists(npzfile):\n        npzdata = np.load(npzfile)\n        bin_offset_min_size = npzdata['bin_offset_min_size']\n        read_count_whole_genome_min_size = npzdata['read_count_whole_genome_min_size']\n    else:\n        bin_offset_min_size, read_count_whole_genome_min_size = calc_read_count_per_min_size(chr_len_db, chr_order, bam, agp, min_size)\n        if npzfile != \"\":\n            np.savez(npzfile.replace('.npz', ''), bin_offset_min_size=bin_offset_min_size, read_count_whole_genome_min_size=read_count_whole_genome_min_size)\n    \n    time_print(\"Step3: Draw heatmap\")\n    \n    for i in range(0, len(bin_ratio)):\n        ratio = bin_ratio[i]\n        time_print(\"Drawing with bin size %s\"%bin_list[i])\n        draw_heatmap(read_count_whole_genome_min_size, 
bin_offset_min_size, ratio, chr_order, min_size)\n    os.chdir('..')\n    time_print(\"Success\")\n\n\nif __name__ == \"__main__\":\n    opts = get_opts()\n    bam = opts.bam\n    agp = opts.agp\n    chrlist = opts.list\n    npzfile = opts.npz\n    minsize = opts.min_size\n    binsize = opts.size\n    outdir = opts.outdir\n\n    ALLHiC_plot(bam, agp, chrlist, npzfile, minsize, binsize, outdir)\n"
  },
  {
    "path": "bin/ALLHiC_rescue",
    "content": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:r:c:i:m:\";\n\n\n\nif ( (!defined $opt_b)|| (!defined $opt_r)|| (!defined $opt_c)|| (!defined $opt_i)) {\n    die \"**************************************************************************************\n    Usage: ALLHiC_rescue -r draft.asm.fasta -b sample.clean.bam -c clusters.txt -i counts.file\n      -h : help and usage.\n      -b : sample.clean.bam (unpruned bam)\n      -r : draft.sam.fasta\n      -c : prunning.clusters.txt\n      -i : prunning.counts_AAGCTT.txt\n      -m : minimum single density for rescuing contigs (optional, default 0.01)\n**************************************************************************************\\n\";\n}\n\nmy $bam         = $opt_b;\nmy $refSeq      = $opt_r;\nmy $clusters    = $opt_c;\nmy $counts_file = $opt_i;\nmy $minSig      = (defined $opt_m)?$opt_m:0.01;\n\nprint \"Starting rescue ungrouped contigs\\n\";\nprint \"Reading contig length\\n\";\nmy %ctgdb;\nopen(IN, $counts_file) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tnext if(/#/);\n\tmy ($ctg, $RECounts, $len) = split(/\\s+/,$_);\n\t$ctgdb{$ctg}->{'RECounts'} = $RECounts;\n\t$ctgdb{$ctg}->{'length'}   = $len;\n\t}\nclose IN;\n\nprint \"Reading link signals ...\\n\";      \nmy %signaldb;\nmy @bamList = split(/,/,$opt_b);\nforeach my $bam (@bamList){\n\tprint \"Reading $bam\\n\";\n\topen(my $in, \"samtools view $bam|\") or die\"\";\n\twhile(<$in>){\n\t\tchomp;\n\t\tmy ($reads,$ctga,$ctgb) = (split/\\s+/,$_)[0,2,6];\n\t\tnext if($ctgb eq \"=\");\n\t\tnext if($ctgb eq \"*\");\n\t\tmy ($a,$b) = sort ($ctga,$ctgb);\n\t\tmy $key    = $a.\",\".$b;\n\t\t$signaldb{$key}++;\n\t\t}\n\tclose $in;\n\t}\n\nprint \"find ungrouped contigs ...\\n\";\nmy %GROUPDB;\nmy %anchordb;\nmy $gid = 0;\nopen(IN, $clusters) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tnext if(/#/);\n\t$gid++;\n\tmy $g = \"group\".$gid;\n\tmy @data = split(/\\s+/,$_);\n\tforeach my $i(2..$#data){\n\t\t$anchordb{$data[$i]} = $gid;\n\t\t$GROUPDB{$g}->{'origin'} 
.= $data[$i].\" \";\n\t\t}\n\t}\nclose IN;\n\nprint \"output HiC link signals ...\\n\";\nopen(OUT, \"> signals.txt\") or die\"\";\nprint OUT \"#GID\tunclustered_ctg\tLinked_reads\tAnchored_ctgs\\n\";\nforeach my $key (keys %signaldb){\n\tmy ($a,$b) = split(/,/,$key);\n\tnext if(exists($anchordb{$a}) and exists($anchordb{$b}));\n\tnext if(!exists($anchordb{$a}) and !exists($anchordb{$b}));\n#\t$a         = \"group\".$anchordb{$a} if(exists($anchordb{$a}));\n#\t$b         = \"group\".$anchordb{$b} if(exists($anchordb{$b}));\n#\tnext if($a=~/group/ and $b=~/group/);\n#\tnext if(!($a=~/group/) and !($b=~/group/));\n\tmy $ga  = (exists($anchordb{$a}))?$a:$b; ### anchored contig should be placed in the first row\n\tmy $ub  = (!exists($anchordb{$b}))?$b:$a;### followed by unanchored contig\n\tif(!exists($ctgdb{$a}->{'length'})){\n\t\tprint \"WARNING: $a not found in $counts_file, PASS\\n\";\n\t\tnext;\n\t\t}\n\tif(!exists($ctgdb{$b}->{'length'})){\n\t\tprint \"WARNING: $b not found in $counts_file, PASS\\n\";\n\t\tnext;\n\t\t}\t\n\tmy $lenA     = $ctgdb{$ga}->{'length'};\n\tmy $lenB     = $ctgdb{$ub}->{'length'};\n\tmy $sigD     = ($signaldb{$key}*1000)/($lenA+$lenB);\n\t#   $sigD     = sprintf(\"%.2f\",$sigD);\n\tprint OUT \"group$anchordb{$ga}\t$ub\t$sigD\t$ga\\n\"\n\t}\nclose OUT;\n\nmy %infordb;\nmy %groupdb;\nopen(IN, \"signals.txt\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tnext if(/#/);\n\tmy ($gid,$ctg,$value) = (split/\\s+/,$_)[0,1,2];\n\t$infordb{$ctg}->{$gid} += $value;  \n\t$groupdb{$gid}++;\n\t}\nclose IN;\n\nmy $num_of_groups = keys %groupdb;\nopen(OUT, \"> unanchor.signal.txt\") or die \"\";\nprint OUT \"unanchored_contig\t\";\nforeach my $gid(sort keys %groupdb){\n\tprint OUT \"$gid\t\";\n\t}\nprint OUT \"best_group\tbest_ctg1\tsigD\tbest_ctg2\\n\";\n\nforeach my $ctg (sort keys %infordb){\n\tmy $v    = 0;\n\tmy $maxv = 0;\n\tprint OUT \"$ctg\t\";\n\tforeach my $g(sort keys %groupdb){\n\t\t$v  = $infordb{$ctg}->{$g} if(exists($infordb{$ctg}->{$g}));\n\t\t$v  
= 0 if(!exists($infordb{$ctg}->{$g}));\n#\t\t$v  = sprintf (\"%.2f\",$v);\n\t\t$maxv = $v if($v>$maxv);\n\t  print OUT \"$v\t\";\n\t\t}\n#\tnext if($maxv<=$minSig); ### minimum singal density should be larger than 0.01\n\tmy $count = 0;\n\tmy $best_g;\n\tforeach $g (sort {$infordb{$ctg}->{$b}<=>$infordb{$ctg}->{$a}} keys %{$infordb{$ctg}}){\n\t\t$count++;\n\t\tlast if($count>1);\n\t\t$best_g = $g;\n\t\t}\n\tprint OUT \"$best_g\t$maxv #\\n\" if($maxv<=$minSig);\n\tprint OUT \"$best_g      $maxv \\n\" if($maxv>$minSig);\n\t$GROUPDB{$best_g}->{'rescued'} .= $ctg.\" \" if($maxv>$minSig);\n#\tmy $line = `grep \\'$ctg\\' signals.txt |grep \\'$best_g\\' |sort -k 3 -n -r |head -n 2|cut -f4`;\n#\tmy ($best_ctg1,$best_ctg2) = split(/\\n/,$line);\n#\tprint OUT \"$best_ctg1\t$maxv\t$best_ctg2\t\\n\";\n\t}\nclose OUT;\n\nprint \"Output refined clusters \\n\";\nforeach my $gid (keys %GROUPDB){\n\tmy @odb = split(/\\s+/,$GROUPDB{$gid}->{'origin'});\n\t$GROUPDB{$gid}->{'rescued'} = \"\" if(!exists($GROUPDB{$gid}->{'rescued'}));\n\tmy @rdb = split(/\\s+/,$GROUPDB{$gid}->{'rescued'});\n\tmy $no  = @odb;\n\tmy $nr  = @rdb;\n\tprint \"Number of original contigs in $gid: $no\\n\";\n\tprint \"Number of rescued contigs in $gid: $nr\\n\";\n\tmy $outfile = $gid.\".txt\";\n\topen(my $out, \"> $outfile\") or die\"\";\n\tprint $out \"#Contig\tRECounts\tLength\\n\";\n\tmap {print $out \"$_\t$ctgdb{$_}->{'RECounts'}\t$ctgdb{$_}->{'length'}\\n\"} @odb;\n\tmap {print $out \"$_\t$ctgdb{$_}->{'RECounts'}\t$ctgdb{$_}->{'length'}\\n\"} @rdb;\n\tclose $out;\t\n\t}\n\n\n\n"
  },
  {
    "path": "scripts/ALLHiC2ALLMAPS.pl",
    "content": "#!/usr/bin/perl -w\n### Convert ALLHiC output AGP file to ALLMAPS input csv file\nprint \"Convert ALLHiC output AGP file to ALLMAPS input csv file\\n\";\ndie \"Usage: perl $0 groups.agp\\n\" if(!defined $ARGV[0]);\n\nmy $agp = $ARGV[0];\nopen(OUT, \"> hic.csv\") or die\"\";\nprint OUT \"Scaffold ID,scaffold position,LG,genetic position\\n\";\nopen(IN, \"grep -v 'contig' $agp|\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\tif($data[8] eq \"+\"){\n\t\t$a = $data[6]; $b = $data[7];\n\t}elsif($data[8] eq \"-\"){\n\t\t$a = $data[7]; $b = $data[6];\n\t\t}\n\tprint OUT \"$data[5],$a,$data[0],$data[1]\\n\";\n\tprint OUT \"$data[5],$b,$data[0],$data[2]\\n\";\n\t}\nclose IN;\nclose OUT;\n\n"
  },
  {
    "path": "scripts/PreprocessSAMs.pl",
    "content": "#!/usr/bin/perl -w\nuse strict;\n\n\n\n# PreprocessSAMs.pl\n#\n# Syntax: PreprocessSAMs.pl <sam or bam filename> <draft assembly fasta>\n#\n# This Perl script prepares a SAM/BAM file for use with Lachesis.\n# Specifically, it pre-processes the file with bedtools, samtools, picard to remove redundant, chimeric, and/or uninformative read pairs.\n# This creates a dataset of Hi-C links with as strong a signal as possible, and it's also as small as possible, so as to reduce I/O runtime in Lachesis.\n# (NOTE: As of August 24, 2013, I'm no longer removing PCR duplicates.  Picard's MarkDuplicates is extremely slow and resource-intensive - far more so than\n# the runtime benefit in Lachesis of having fewer reads.  I don't think it's removing PCR duplicates properly, nor do I think PCR duplicate removal is even\n# necessary - http://seqanswers.com/forums/showthread.php?t=6854).\n#\n# This script will determine whether the file is a SAM or a BAM file, and then run the following commands:\n#\n# COMMAND                                OUTPUT FILENAME                               WHAT THE COMMAND DOES\n# make_bed_around_RE_site.pl             <fasta>.near_<RE>.<range>.bed                 Prepare the bed file for bedtools intersect (next command)\n# bedtools intersect                     <head>.REduced.bam                            Remove all reads that aren't within 500 bp of a restriction site\n### picard SortSam.jar                     <head>.REduced.sort_coord.bam                 Sort the file in coordinate order so PCR duplicates can be removed\n### picard MarkDuplicates.jar              <head>.REduced.sort_coord.nodups.bam          Remove PCR duplicates\n### picard SortSam.jar                     <head>.REduced.nodups.bam                     Sort the file in query-name order so Lachesis can read it\n# samtools view -F12                     <head>.REduced.nodups.paired_only.bam         Filter out all pairs in which both reads are not aligned\n# samtools 
flagstat                      <head>.REduced.nodups.paired_only.flagstat    Make a flagstat file that describes the contents of the BAM file\n#\n#\n# The final output file will be <head>.REduced.paired_only.bam.  This is what should be entered into the Lachesis INI file under the key \"SAM_FILES\".\n#\n# To pre-process several SAM/BAM files in parallel, use the script PreprocessSAMs.sh, which can be submitted to a cluster via qsub.\n#\n# Josh Burton\n# July 2013\n\n\n\n################################\n#                              #\n#   USER-DEFINED PARAMETERS    #\n#                              #\n################################\n\n\nmy $dry_run = 0; # if true, just print the commands to be run - don't actually run them\n#my $RE_site = 'AAGCTT'; # the restriction enzyme site at which the DNA was cut for the Hi-C experiment\n\n# Paths to the necessary scripts and software packages.\nmy $make_bed_around_RE_site_pl = 'make_bed_around_RE_site.pl';\nmy $bedtools = 'bedtools';\nmy $samtools = 'samtools';\n#my $mem = \"16G\";\n#my $picard_head = \"java -d64 -Xmx$mem -jar /net/shendure/vol10/jnburton/extern/picard-tools-1.50/\";\n\n\n\n################################\n#                              #\n#         SUBROUTINES          #\n#                              #\n################################\n\n# Print and then run a command in bash (unless $dry_run, in which case just print it.)\n# First argument: the command to run.\n# Second argument (optional): the file to redirect stdout to.\nsub run_cmd(@) {\n    \n    my ($cmd,$redirect) = @_;\n    \n    print localtime() . 
\": PreprocessSAMs.pl: $cmd\\n\";\n    \n    return if $dry_run;\n    \n    if ($redirect) { system ( \"$cmd > $redirect\" ) }\n    else           { system ( $cmd ); }\n}\n\n\n\n\n################################\n#                              #\n#     CONTROL STARTS HERE      #\n#                              #\n################################\n\n\n# Get the command-line arguments, or check syntax.\nif ( @ARGV != 3 ) {\n    print STDERR \"\\nPreprocessSAMs.pl: A script to prepare SAM or BAM files for use with Lachesis.\\n\\nSyntax: $0 <sam-or-bam-filename> <draft-assembly-fasta> enzyme(HINDIII/MBOI/Arima)\\n\\n\";\n    exit;\n}\n\n\n# Get the input filenames, and check that they actually exist.\nmy ( $SAM, $fasta) = @ARGV;\nunless ( -e $SAM ) {\n    print STDERR \"$0: Can't find input SAM/BAM file `$SAM`\\n\";\n    exit;\n}\nunless ( -e $fasta) {\n    print STDERR \"$0: Can't find draft assembly file `$fasta`\\n\";\n    exit;\n}\n\n$ARGV[2] = uc $ARGV[2];\nmy $RE_site;\nif($ARGV[2] eq \"HINDIII\" or $ARGV[2] eq \"AAGCTT\"){\n  $RE_site = 'AAGCTT';\n  }elsif($ARGV[2] eq \"MBOI\" or $ARGV[2] eq \"GATC\"){\n  $RE_site = 'GATC';\n  }elsif($ARGV[2] eq \"ARIMA\"){\n  $RE_site = 'arima';\n  }\n# Find the input file's \"head\" and extension.\nmy ($head,$extension) = $SAM =~ /^(.*)\\.(.*)$/;\n\n\n# Examine the extension to determine whether this is a SAM or a BAM file.  If it's a SAM, convert it to BAM.  
If it doesn't seem to be either, throw an error.\nif    ( uc($extension) eq 'SAM' ) { run_cmd( \"$samtools view -bS $SAM -o $head.bam\" ); }\nelsif ( uc($extension) eq 'BAM' ) {}\nelse {\n    print STDERR \"$0: Can't determine file type for input file `$SAM`.\\nFilename should end in '.SAM' or '.BAM' (not case-sensitive.)\\n\";\n    exit;\n}\n\n\nprint \"$0 @ARGV\\n\\n\";\n\n\n\n# COMMAND                                OUTPUT FILENAME                               WHAT THE COMMAND DOES\n# make_bed_around_RE_site.pl             <fasta>.near_<RE>.<range>.bed                 Prepare the bed file for bedtools intersect (next command)\n#\n# Make the BED file for the restriction sites on the draft assembly.  This only needs to be done once.\nmy $BED_RE_file;\nif ($RE_site eq \"arima\") {\n\t$BED_RE_file = \"$fasta.near_arima.500.bed\";\n\n\tmy $BED_re_file_gatc = \"$fasta.near_GATC.500.bed\";\n\tmy $BED_re_file_gaat = \"$fasta.near_GAAT.500.bed\";\n\tmy $BED_re_file_gact = \"$fasta.near_GACT.500.bed\";\n\tmy $BED_re_file_gagt = \"$fasta.near_GAGT.500.bed\";\n\tmy $BED_re_file_gatt = \"$fasta.near_GATT.500.bed\";\n\n\trun_cmd( \"$make_bed_around_RE_site_pl $fasta GATC 500\" ) unless -e $BED_re_file_gatc;\n\trun_cmd( \"$make_bed_around_RE_site_pl $fasta GAAT 500\" ) unless -e $BED_re_file_gaat;\n\trun_cmd( \"$make_bed_around_RE_site_pl $fasta GACT 500\" ) unless -e $BED_re_file_gact;\n\trun_cmd( \"$make_bed_around_RE_site_pl $fasta GAGT 500\" ) unless -e $BED_re_file_gagt;\n\trun_cmd( \"$make_bed_around_RE_site_pl $fasta GATT 500\" ) unless -e $BED_re_file_gatt;\n\n\trun_cmd( \"cat $BED_re_file_gatc $BED_re_file_gaat $BED_re_file_gact $BED_re_file_gagt $BED_re_file_gatt | sort -k1,1 -k2,2b -u > $BED_RE_file\" );\n}\nelse {\n    $BED_RE_file = \"$fasta.near_$RE_site.500.bed\";\n\trun_cmd( \"$make_bed_around_RE_site_pl $fasta $RE_site 500\" ) unless -e $BED_RE_file;\n}\n\n\n# Do the pre-processing on this file.\n#\n# COMMAND                                OUTPUT FILENAME     
                          WHAT THE COMMAND DOES\n# bedtools intersect                     <head>.REduced.bam                            Remove all reads that aren't within 500 bp of a restriction site\n### picard SortSam.jar                     <head>.REduced.sort_coord.bam                 Sort the file in coordinate order so PCR duplicates can be removed\n### picard MarkDuplicates.jar              <head>.REduced.sort_coord.nodups.bam          Remove PCR duplicates\n### picard SortSam.jar                     <head>.REduced.nodups.bam                     Sort the file in query-name order so Lachesis can read it\n# samtools view -F12                     <head>.REduced.paired_only.bam         Filter out all pairs in which both reads are not aligned\n# samtools flagstat                      <head>.REduced.paired_only.flagstat    Make a flagstat file that describes the contents of the BAM file\n\nmy $opts = \"VALIDATION_STRINGENCY=SILENT\";\nmy $nodups = \"\"; # or \".nodups\", if removing PCR duplicates\n\nrun_cmd( \"$bedtools intersect -abam $head.bam -b $BED_RE_file > $head.REduced.bam\" );\n#run_cmd( \"${picard_head}SortSam.jar $opts I=$head.REduced.bam O=$head.REduced.sort_coord.bam SO=coordinate\" );\n#run_cmd( \"${picard_head}MarkDuplicates.jar $opts I=$head.REduced.sort_coord.bam O=$head.REduced.sort_coord.nodups.bam M=$head.REduced.sort_coord.dup_metrics AS=true REMOVE_DUPLICATES=true\" );\n#run_cmd( \"${picard_head}SortSam.jar $opts I=$head.REduced.sort_coord.nodups.bam O=$head.REduced.nodups.bam SO=queryname\" );\nrun_cmd( \"$samtools view -F12 $head.REduced$nodups.bam -b -o $head.REduced$nodups.paired_only.bam\" );\nrun_cmd( \"$samtools flagstat $head.REduced$nodups.paired_only.bam > $head.REduced$nodups.paired_only.flagstat\" );\n"
  },
  {
    "path": "scripts/agp2tour.pl",
    "content": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 chr.agp\\n\" if(!defined $ARGV[0]);\nmy %infordb;\nmy $cnt = 0;\nopen(IN, \"grep -v contig $ARGV[0]|\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\tmy $chrn = $data[0];\n\tnext if(!($chrn=~/Chr/) and !($chrn=~/group/));\n\tif(!exists($infordb{$chrn})){\n\t\t$cnt = 1;\n\t\t$infordb{$chrn}->{$cnt} .= $data[5].\"\".$data[8];\n\t\t}else{\n\t\t\t$cnt++;\n\t\t\t$infordb{$chrn}->{$cnt} .= $data[5].\"\".$data[8];\n\t\t\t}\n\t}\nclose IN;\n\n\nforeach my $c (sort keys %infordb){\n\tmy $outfile = $c.\".tour\";\n\topen(my $out, \">$outfile\") or die\"\";\n\tforeach my $i (sort {$a<=>$b} keys %{$infordb{$c}}){\n\t\tprint $out \"$infordb{$c}->{$i} \";\n\t\t}\n\tclose $out;\n\t}\n"
  },
  {
    "path": "scripts/bam2CLM.pl",
    "content": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:r:d:\";\n\n\nif ((!defined $opt_b)|| (!defined $opt_r) || (!defined $opt_d) ) {\n    die \"************************************************************************\n    Usage: perl $0 -b mapping.bam -r refSeq.fasta -d main_results/\n      -h : help and usage.\n      -b : mapping.bam\n      -r : reference genome, fasta format\n      -d : LACHESIS main_results/\n************************************************************************\\n\";\n\n}\n\n\n\nmy %seqdb;\nmy $ctg;\nopen(IN, $opt_r) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tif(/>/){\n\t\t$ctg = $_;\n\t\t$ctg =~ s/>//g;\n\t\t$ctg =~ s/\\s+.*//g;\n                $ctg =~ s/_pilon//g;\n\t}else{\n\t\t$seqdb{$ctg} .= $_;\n\t\t}\n\t}\nclose IN;\n\nforeach $ctg(keys %seqdb){\n\t$seqdb{$ctg} =~ s/\\s+//g;\n\t}\n\n\n######GET GROUP IDS######\nprint \"a. Getting group ids ...\\n\";\nprint \"Reading anchored contigs ...\\n\";\nmy %anchordb;\nmy %gidb;\nwhile(my $file = glob \"$opt_d/group*ordering\"){\n\tmy $gid = $1 if($file=~/group(\\d+).ordering/);\n        open(my $in, $file) or die\"\";\n\twhile(<$in>){\n\t\tchomp;\n\t\tnext if(/#/);\n\t\tmy $ctg = (split/\\s+/,$_)[1];\n                $ctg    =~ s/_pilon//g;\n\t\t$anchordb{$ctg}->{'gid'}  = $gid;\n\t\t$anchordb{$ctg}->{'stat'} = \"An\";\n\t\t$gidb{$gid}->{$ctg}       = \"An\";\n\t\t}\n\tclose $in;\n\t}\n\nmy $ufile = \"unanchor.signal.txt\";\nif(!(-e $ufile)){\n  system(\"touch $ufile\");\n  }\n\nmy $num_of_group  = keys %gidb;\nprint \"Number of groups: $num_of_group\\n\";\nprint \"Reading unanchored contigs ...\\n\";\nopen(IN, \"unanchor.signal.txt\") or die\"\";\n<IN>;\nwhile(<IN>){\n\tchomp;\n\tmy $i          = $num_of_group + 1;\n        my ($ctg,$gid) = (split/\\s+/,$_)[0,$i];\n            $ctg       =~ s/_pilon//g;\n\t$gid =~ s/group//g;\n\t$anchordb{$ctg}->{'gid'}  = $gid;\n\t$anchordb{$ctg}->{'stat'} = \"Un\";\n\t$gidb{$gid}->{$ctg}       = \"Un\";\n\t}\nclose IN;\n\nprint \"Output group ids 
...\\n\";\nforeach my $gid(sort keys %gidb){\n\tmy $outid = \"group\".$gid.\".ids\";\n\topen(my $out, \">$outid\") or die\"\";\n\tforeach my $ctg (keys %{$gidb{$gid}}){\n\t  my $len = length $seqdb{$ctg};\n\t\tprint $out \"$ctg\t$len\\n\" if($gidb{$gid}->{$ctg} eq \"An\");\n\t\tprint $out \"$ctg\t$len\trecover\\n\" if($gidb{$gid}->{$ctg} eq \"Un\");\n\t\t}\n\tclose $out;\n\t}\n\nprint \"b. Getting CLM files ...\\n\";\n\nprint \"Reading and filtering $opt_b file ...\\n\";\nmy %tmprdb = (); ###store reads name\nmy %infordb;     ###store contig pairs with directions: e.g. A+B+,A+B-,A-B+,A-B-\nmy $count = 0;   ###used for sorting\nopen(IN, \"samtools view $opt_b |awk \\'\\$7!=\\\"*\\\" && \\$7!=\\\"=\\\"\\' |\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\t$_                =~ s/_pilon//g;\n\tmy @data          = split(/\\s+/,$_);\n\tnext if(exists($tmprdb{$data[0]}));\n\t$tmprdb{$data[0]}++;\n\tmy ($ctgA,$ctgB)  = sort ($data[2], $data[6]);\n###determine gid for the contig pairs\n  next if(!exists($anchordb{$ctgA}->{'gid'}));\n  next if(!exists($anchordb{$ctgB}->{'gid'}));\n  my $ctgAgid       = $anchordb{$ctgA}->{'gid'};\n\tmy $ctgBgid       = $anchordb{$ctgB}->{'gid'};\n\tnext if($ctgAgid ne $ctgBgid);\n  my $ctgAL         = length $seqdb{$ctgA};\n  my $ctgBL         = length $seqdb{$ctgB};\n  my $RAP           = ($data[2] le $data[6])?$data[3]:$data[7];\n  my $RBP           = ($data[2] le $data[6])?$data[7]:$data[3]; \n  my $A1            = $RAP;\n  my $A2            = $ctgAL - $RAP;\n  my $B1            = $RBP;\n  my $B2            = $ctgBL - $RBP;\n###calculate distance for contig pairs                    \n  my $ApBp          = $A2 + $B1;\n  my $ApBm          = $A2 + $B2;\n  my $AmBp          = $A1 + $B1;\n  my $AmBm          = $A1 + $B2;\n#  print \">$_\\n\";\n#  print \"$ctgA length=$ctgAL\tand $ctgB length=$ctgBL\\n\";\n#  print \"$ctgA+ $ctgB+: $ApBp\\n\";\n#  print \"$ctgA+ $ctgB-: $ApBm\\n\";\n#  print \"$ctgA- $ctgB+: $AmBp\\n\";\n#  print \"$ctgA- $ctgB-: 
$AmBm\\n\";\n#  print \"\\n\";\n  my $PApBp         = $ctgA.\"+ \".$ctgB.\"+\"; #P means pair\n  my $PApBm         = $ctgA.\"+ \".$ctgB.\"-\"; \n  my $PAmBp         = $ctgA.\"- \".$ctgB.\"+\";\n  my $PAmBm         = $ctgA.\"- \".$ctgB.\"-\";\n  $infordb{$PApBp}->{'d'} .= $ApBp.\" \";    #d means distance\n  $infordb{$PApBm}->{'d'} .= $ApBm.\" \";\n  $infordb{$PAmBp}->{'d'} .= $AmBp.\" \";\n  $infordb{$PAmBm}->{'d'} .= $AmBm.\" \";\n  $infordb{$PApBp}->{'g'}  = $ctgAgid ;    #g means group id\n  $infordb{$PApBm}->{'g'}  = $ctgAgid ; \n  $infordb{$PAmBp}->{'g'}  = $ctgAgid ; \n  $infordb{$PAmBm}->{'g'}  = $ctgAgid ;   \n  $infordb{$PApBp}->{'c'}  = $count++ ;    #c means count\n  $infordb{$PApBm}->{'c'}  = $count++ ; \n  $infordb{$PAmBp}->{'c'}  = $count++ ; \n  $infordb{$PAmBm}->{'c'}  = $count++ ;     \n\t}\nclose IN;\n\n###Get CLM FILES####\nprint \"Output CLM files ...\\n\";\nopen(ALLCLM, \"> all.clm\") or die\"\";\nforeach my $key (sort {$infordb{$a}->{'c'}<=>$infordb{$b}->{'c'}} keys %infordb){\n\tmy @t           = split(/\\s+/,$infordb{$key}->{'d'});\n\tmy $num_of_link = @t;\n\tprint ALLCLM \"$key\t$num_of_link\t$infordb{$key}->{'d'}\\n\";\n\t}\nclose ALLCLM;\n\nforeach my $gid (keys %gidb){\n \tmy $outfile        = \"group\".$gid.\".clm\";\n\topen(my $out, \">$outfile\") or die\"\";\n\tforeach my $key (sort {$infordb{$a}->{'c'}<=>$infordb{$b}->{c}}  keys %infordb){\n\t\tmy @t            = split(/\\s+/,$infordb{$key}->{'d'});\n\t\tmy $num_of_link  = @t;\n\t\tprint $out \"$key\t$num_of_link\t$infordb{$key}->{'d'}\\n\" if($infordb{$key}->{'g'} eq $gid);\n\t\t}\n\t\n\tclose $out;\n\t}\n\n\n"
  },
  {
    "path": "scripts/bam2CLM_simple.pl",
    "content": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 mapping.bam refSeq.fasta\\n\" if((!defined $ARGV[0]) or (!defined $ARGV[1]));\n\nmy %seqdb;\nmy $ctg;\nopen(IN, $ARGV[1]) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tif(/>/){\n\t\t$ctg = $_;\n\t\t$ctg =~ s/>//g;\n\t\t$ctg =~ s/\\s+.*//g;\n    $ctg =~ s/_pilon//g;\n\t}else{\n\t\t$seqdb{$ctg} .= $_;\n\t\t}\n\t}\nclose IN;\n\nforeach $ctg(keys %seqdb){\n\t$seqdb{$ctg} =~ s/\\s+//g;\n\t}\n\t\nprint \"Reading and filtering $ARGV[0] file ...\\n\";\nmy %tmprdb = (); ###store reads name\nmy %infordb;     ###store contig pairs with directions: e.g. A+B+,A+B-,A-B+,A-B-\nmy $count = 0;   ###used for sorting\nopen(IN, \"samtools view $ARGV[0] |awk \\'\\$7!=\\\"*\\\" && \\$7!=\\\"=\\\"\\' |\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\t$_                =~ s/_pilon//g;\n\tmy @data          = split(/\\s+/,$_);\n\tnext if(exists($tmprdb{$data[0]}));\n\t$tmprdb{$data[0]}++;\n\tmy ($ctgA,$ctgB)  = sort ($data[2], $data[6]);\n  my $ctgAL         = length $seqdb{$ctgA};\n  my $ctgBL         = length $seqdb{$ctgB};\n  my $RAP           = ($data[2] le $data[6])?$data[3]:$data[7];\n  my $RBP           = ($data[2] le $data[6])?$data[7]:$data[3]; \n  my $A1            = $RAP;\n  my $A2            = $ctgAL - $RAP;\n  my $B1            = $RBP;\n  my $B2            = $ctgBL - $RBP;\n###calculate distance for contig pairs                    \n  my $ApBp          = $A2 + $B1;\n  my $ApBm          = $A2 + $B2;\n  my $AmBp          = $A1 + $B1;\n  my $AmBm          = $A1 + $B2;\n\n  my $PApBp         = $ctgA.\"+ \".$ctgB.\"+\"; #P means pair\n  my $PApBm         = $ctgA.\"+ \".$ctgB.\"-\"; \n  my $PAmBp         = $ctgA.\"- \".$ctgB.\"+\";\n  my $PAmBm         = $ctgA.\"- \".$ctgB.\"-\";\n  $infordb{$PApBp}->{'d'} .= $ApBp.\" \";    #d means distance\n  $infordb{$PApBm}->{'d'} .= $ApBm.\" \";\n  $infordb{$PAmBp}->{'d'} .= $AmBp.\" \";\n  $infordb{$PAmBm}->{'d'} .= $AmBm.\" \";\n  $infordb{$PApBp}->{'g'}  = $ctgAgid ;    #g means group id\n  
$infordb{$PApBm}->{'g'}  = $ctgAgid ; \n  $infordb{$PAmBp}->{'g'}  = $ctgAgid ; \n  $infordb{$PAmBm}->{'g'}  = $ctgAgid ;   \n  $infordb{$PApBp}->{'c'}  = $count++ ;    #c means count\n  $infordb{$PApBm}->{'c'}  = $count++ ; \n  $infordb{$PAmBp}->{'c'}  = $count++ ; \n  $infordb{$PAmBm}->{'c'}  = $count++ ;     \n\t}\nclose IN;\n\n###Get CLM FILES####\nprint \"Output CLM files ...\\n\";\nopen(ALLCLM, \"> all.clm\") or die\"\";\nprint ALLCLM \"groupA\tgroupB\tnum_of_link\tAverage_distance\tsignalDensity\tdistance_list\\n\";\nforeach my $key (sort {$infordb{$a}->{'c'}<=>$infordb{$b}->{'c'}} keys %infordb){\n\tmy @t           = split(/\\s+/,$infordb{$key}->{'d'});\n\tmy $num_of_link = @t;\n        my $sum = 0; my $ave = 0;\n        map {$sum+=$_} @t;\n        $ave   = $sum/$num_of_link;\n        $ave   = sprintf(\"%.2f\",$ave);\n        my ($g1,$g2) = split(/\\s+/,$key);\n           $g1      =~ s/[+|-]//g;\n           $g2      =~ s/[+|-]//g;\n        my $l1      = length $seqdb{$g1};\n        my $l2      = length $seqdb{$g2};\n        my $len     = $l1 + $l2;\n        my $signalD = $num_of_link/$len * 1000;\n\tprint ALLCLM \"$key\t$num_of_link\t$ave\t$signalD\t$infordb{$key}->{'d'}\\n\";\n\t}\nclose ALLCLM;\n"
  },
  {
    "path": "scripts/bam2net.pl",
    "content": "#!/usr/bin/perl -w\nuse Getopt::Std;\ngetopts \"c:b:o:\";\n\n\nif ((!defined $opt_c)|| (!defined $opt_b)||(!defined $opt_o) ) {\n    die \"************************************************************************\n    Usage: bam2net.pl -c draft.asm.fasta -b file.bam -o out.net\n      -h : help and usage.\n      -c : draft.asm.fasta\n      -b : mapping.bam\n      -o : output\n************************************************************************\\n\";\n}\nmy $bam    = $opt_b;\nmy $refSeq = $opt_c;\n\nopen(IN, $refSeq) or die\"\";\nmy $name;\nwhile(<IN>){\n\tchomp;\n\tif(/>/){\n\t\t$name = $_;\n\t\t$name =~ s/>//g;\n\t}else{\n\t\t$refdb{$name} .= $_;\n\t\t}\n\t}\nclose IN;\n\nforeach $name (keys %refdb){\n  $refdb{$name} =~ s/\\s+//g;\n\t}\n\nmy %infordb;\nopen(IN, \"samtools view $bam |\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\tnext if($data[6] eq \"=\");\n\tnext if($data[6] eq \"*\");\n        my ($ctg1,$ctg2) = sort ($data[2],$data[6]);\n\t$infordb{$ctg1}->{$ctg2}++;\n\t}\nclose IN;\n\nopen(OUT, \"> $opt_o\") or die\"\";\nprint OUT \"ctg1\tctg1_size\tctg2\tctg2_size\tsignalDensity\\n\";\nforeach my $ctg1(keys %infordb){\n\tmy $len1 = length $refdb{$ctg1};\n\tforeach my $ctg2(keys %{$infordb{$ctg1}}){\n\t\tmy $len2  = length $refdb{$ctg2};\n                my $normL = ($len1 + $len2)/100000;\n                my $sigD  = $infordb{$ctg1}->{$ctg2}/$normL;\n                   $sigD  = sprintf(\"%.2f\",$sigD);\n\t\tprint OUT \"$ctg1\t$len1\t$ctg2\t$len2\t$sigD\\n\";\n\t\t}\n\t}\nclose OUT;\n"
  },
  {
    "path": "scripts/bam_HiCplotter.py",
    "content": "#!/usr/bin/env python\nimport os\nimport sys\nimport gc\nfrom math import log\nimport time\n\n\n# Get position of read based on contig with sam or bam file\ndef get_read_pos_with_sam_bam_file(sam_bam_file):\n\tread_on_chr = {}\n\tif sam_bam_file[-3:] == \"bam\":\n\t\tf_in = os.popen(\"samtools view \"+sam_bam_file, 'r')\n\telse:\n\t\tf_in = open(sam_bam_file, 'r')\n\n\tfor line in f_in:\n\t\tif line.strip() == '' or line[0] == '@':\n\t\t\tcontinue\n\t\tdata = line.strip().split()\n\t\tread_id = data[0]\n\t\tif data[2] == '*' or data[6] == '*':\n\t\t\tcontinue\n\t\tctg1 = data[2].replace('_pilon', '')\n\t\tread_pos1 = int(data[3])\n\t\tif data[6] != '=':\n\t\t\tctg2 = data[6].replace('_pilon', '')\n\t\telse:\n\t\t\tctg2 = ctg1\n\t\tread_pos2 = int(data[7])\n\t\tread_on_chr[read_id] = [ctg1, read_pos1, ctg2, read_pos2]\n\tf_in.close()\n\treturn read_on_chr\n\n\n# Get chromosome length\ndef get_chr_len(chr_list):\n\tchr_len_db = {}\n\tchr_order = []\n\twith open(chr_list, 'r') as f_in:\n\t\tfor line in f_in:\n\t\t\tif line.strip() == '':\n\t\t\t\tcontinue\n\t\t\tdata = line.strip().split()\n\t\t\tchr_order.append(data[0])\n\t\t\tchr_len_db[data[0]] = int(data[1])\n\treturn chr_len_db, chr_order\n\n\n# Calc read counts on each bin\ndef calc_read_count_per_bin(chr_len_db, chr_order, read_on_chr, bin_size):\n\tlong_bin_size = bin_size.upper()\n\tlong_bin_size = long_bin_size.replace('K', '000')\n\tlong_bin_size = long_bin_size.replace('M', '000000')\n\tlong_bin_size = long_bin_size.replace('G', '000000000')\n\tlong_bin_size = int(long_bin_size)\n\t\n\tread_count_per_chr = {}\n\tread_count_whole_genome = {}\n\t\n\tbin_offset = [0 for i in range(0, len(chr_order)+1)]\n\tbin_count = [0 for i in range(0, len(chr_order)+1)]\n\ttotal_bin_count = 0\n\t\n\tfor chrn in chr_len_db:\n\t\tbin_count_of_chr = int(round((chr_len_db[chrn]*1.0/long_bin_size+0.5)))\n\t\ttotal_bin_count += bin_count_of_chr\n\t\tbin_count[chr_order.index(chrn)+1] = 
bin_count_of_chr\n\t\tread_count_per_chr[chrn] = [[0 for i in range(0, bin_count_of_chr)] for j in range(0, bin_count_of_chr)]\n\t\n\tfor i in range(0, len(bin_count)):\n\t\tfor j in range(0, i+1):\n\t\t\tbin_offset[i] += bin_count[j]\n\t\n\tread_count_whole_genome = [[0 for i in range(0, total_bin_count)] for j in range(0, total_bin_count)]\n\t\n\tfor read in read_on_chr:\n\t\tchr1, pos1, chr2, pos2 = read_on_chr[read]\n\t\tif chr1 not in chr_len_db or chr2 not in chr_len_db:\n\t\t\tcontinue\n\t\tpos1_index = int(pos1/long_bin_size)\n\t\tpos2_index = int(pos2/long_bin_size)\n\t\tif chr1 == chr2 and chr1 in read_count_per_chr:\n\t\t\tread_count_per_chr[chr1][pos1_index][pos2_index] += 1\n\t\t\tread_count_per_chr[chr1][pos2_index][pos1_index] += 1\n\n\t\tchr1_index = chr_order.index(chr1)\n\t\tchr2_index = chr_order.index(chr2)\n\n\t\twhole_pos1 = bin_offset[chr1_index] + pos1_index\n\t\twhole_pos2 = bin_offset[chr2_index] + pos2_index\n\t\tread_count_whole_genome[whole_pos1][whole_pos2] += 1\n\t\tread_count_whole_genome[whole_pos2][whole_pos1] += 1\n\t\n\tfor chrn in read_count_per_chr:\n\t\tfor i in range(0, len(read_count_per_chr[chrn])):\n\t\t\tfor j in range(0, len(read_count_per_chr[chrn][i])):\n\t\t\t\tif read_count_per_chr[chrn][i][j] != 0:\n\t\t\t\t\tread_count_per_chr[chrn][i][j] = log(read_count_per_chr[chrn][i][j], 2)\n\t\t\t\telse:\n\t\t\t\t\tread_count_per_chr[chrn][i][j] = -float('inf')\n\t\n\tfor i in range(0, len(read_count_whole_genome)):\n\t\tfor j in range(0, len(read_count_whole_genome[i])):\n\t\t\tif read_count_whole_genome[i][j] != 0:\n\t\t\t\tread_count_whole_genome[i][j] = log(read_count_whole_genome[i][j], 2)\n\t\t\telse:\n\t\t\t\tread_count_whole_genome[i][j] = -float('inf')\n\n\n\treturn read_count_per_chr, read_count_whole_genome\n\n\n# Draw heatmap of allhic result with matplotlib\ndef draw_heatmap(data, chrn, bin_size, ext):\n\t\n\timport matplotlib as mpl\n\tmpl.use('Agg')\n\timport matplotlib.pyplot as plt\n\n\tshort_bin_size = 
bin_size.upper()\n\tshort_bin_size = short_bin_size.replace('000000000', 'G')\n\tshort_bin_size = short_bin_size.replace('000000', 'M')\n\tshort_bin_size = short_bin_size.replace('000', 'K')\n\n\tax = plt.gca()\n\t\n\tif chrn != 'all':\n\t\tfile_prefix = short_bin_size + \"_\" + chrn\n\telse:\n\t\tfile_prefix = short_bin_size + '_Whole_genome'\n\t\n\tprint(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+' Draw '+file_prefix)\n\t\n\t# mpl.cm.YlOrRd\n\tcmap = plt.get_cmap('YlOrRd')\n\tcmap.set_over('black')\n\tif chrn != 'all':\n\t\thmap = ax.imshow(data, interpolation='nearest', origin='lower', cmap=cmap, aspect='auto')\n\telse:\n\t\thmap = ax.imshow(data, interpolation='nearest', cmap=cmap, aspect='auto')\n\t\n\tplt.colorbar(mappable=hmap,cax=None, ax=None, shrink=0.5)\n\tplt.tick_params(labelsize=6)\n\tfor ticks in ax.get_xticklabels():\n\t\tticks.set_rotation(90)\n\tfor ticks in ax.get_yticklabels():\n\t\tticks.set_rotation(0)\n\t\n\tif chrn != 'all':\n\t\ttitle = chrn+'_'+short_bin_size\n\telse:\n\t\ttitle = 'Whole_genome_'+short_bin_size\n\t\n\tplt.xlabel(\"Bins (\"+short_bin_size.lower()+\"b per bin)\", fontsize=8)\n\tif chrn == 'all':\n\t\tplt.xticks([])\n\t\tplt.yticks([])\n\t\tplt.title(title, y=1.01, fontsize=12)\n\telse:\n\t\tplt.title(title, y=1.1, fontsize=12)\n\n\tplt.savefig(file_prefix+'.'+ext, filetype=ext, bbox_inches='tight', dpi=200)\n\tplt.close('all')\n\n\nif __name__ == \"__main__\":\n\tif len(sys.argv) < 5:\n\t\tprint(\"Notice: This script is used for drawing heatmaps of the ALLHiC result\")\n\t\tprint(\"Usage: python \"+sys.argv[0]+\" <sam/bam_file> <chr_list> <bin_size> <ext>\")\n\t\tprint(\"\\t<sam/bam_file> is the sam or bam file filtered by allhic\")\n\t\tprint(\"\\t<chr_list> is a text file listing the chromosomes to plot, one per line: name and length separated by whitespace\")\n\t\tprint(\"\\t<bin_size> is the bin size of the heatmap; it can be a comma-separated list (e.g. 500k,1m)\")\n\t\tprint(\"\\t<ext> is the output image file type (e.g. png or pdf)\")\n\n\telse:\n\t\tsam_bam_file = 
sys.argv[1]\n\t\tchr_list = sys.argv[2]\n\t\tbin_list = sys.argv[3]\n\t\text = sys.argv[4]\n\t\t\n\t\tprint(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+\" Step 1: Get read position based on chromosome\")\n\t\tread_on_chr = get_read_pos_with_sam_bam_file(sam_bam_file)\n\n\t\tprint(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+\" Step 2: Get chromosome length\")\n\t\tchr_len_db, chr_order = get_chr_len(chr_list)\n\t\t\n\t\tprint(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+\" Step 3: Calculating and Drawing heatmap\")\n\n\t\tbin_size_list = bin_list.split(',')\n\t\tfor bin_size in bin_size_list:\n\n\t\t\tprint(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+\" Calculating\")\n\t\t\tread_count_per_chr, read_count_whole_genome = calc_read_count_per_bin(chr_len_db, chr_order, read_on_chr, bin_size)\n\t\t\t\n\t\t\tprint(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+\" Drawing heatmap\")\n\t\t\n\t\t\tprint(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+\" Drawing with bin size \"+str(bin_size))\n\t\t\tfor chrn in read_count_per_chr:\n\t\t\t\tdraw_heatmap(read_count_per_chr[chrn], chrn, bin_size, ext)\n\t\t\t\n\t\t\tdraw_heatmap(read_count_whole_genome, 'all', bin_size, ext)\n\t\t\tdel read_count_per_chr, read_count_whole_genome\n\t\t\tgc.collect()\n\t\t\n\t\tdel read_on_chr\n\t\tgc.collect()\n\t\tprint(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+\" Success\")\n"
  },
  {
    "path": "scripts/blastn_parse.pl",
    "content": "#!/usr/bin/perl -w\n\n###This script was used to parse blast+ result (outfmt 6)\n###you can get best hit with parameter -b 1\n###or -b 0 to get more results \n###The default coverage and identity are 60%, respectively\n\nuse Getopt::Std;\ngetopts \"i:o:b:c:d:q:\";\n\n\nif ((!defined $opt_i)|| (!defined $opt_o)  || (!defined $opt_q)) {\n    die \"************************************************************************\n    Usage: perl $0 -i input -o output -q query.fasta -b 0||1  \n      -h : help and usage.\n      -q : query file, fasta format\n      -i : input file is the result of blast+\n      -b : (optional, default 1) 1 means only output best hit; 0 means get more results\n      -d : identity (optional, default is 0.6)\n      -c : coverage (optional, default is 0.6)\n      -o : output\n************************************************************************\\n\";\n}\n\n$input         = $opt_i;\n$output        = $opt_o;\n$BestHit_model = (defined $opt_b) ? $opt_b : 1;\n$coverage \t   = (defined $opt_c) ? $opt_c : 0.6;\n$identity      = (defined $opt_d) ? $opt_d : 0.6;\n\nopen(IN, $opt_q) or die\"No query file: $opt_q\\n\";\nwhile(<IN>){\n\tif(/>/){\n\t\t$gene = $_;\n\t\t$gene =~ s/>//g;\n\t\t$gene =~ s/\\s+.*//g;\n\t}else{\n\t\t$infordb{$gene} .= $_;\n\t\t}\n\t}\nclose IN;\n\nopen(OUT, \"> $output\") or die\"No output file: $output\\n\";\nopen(IN, $input) or die\"No input file: $input\\n\";\nwhile(<IN>){\n\tchomp;\n\t@data    = split(/\\s+/,$_);\n\t$query   = $data[0];\n\t$countdb{$query} += 1;\n\tnext if($countdb{$query}>1 and $BestHit_model==1);\n\t$q_len   = length $infordb{$query};\n#\t$subject = $data[1];\n\t$blst_i  = $data[2]/100;\n\t$blst_c  = ($data[7]-$data[6])/$q_len;\n  if($blst_i>=$identity and $blst_c>=$coverage){\n  \tprint OUT \"$_\\n\";\n  \t}\n\t}\nclose IN;\nclose OUT;\n\n\n\n\n\n"
  },
  {
    "path": "scripts/classify.pl",
    "content": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"i:p:r:g:\";\n\n\nif ((!defined $opt_i)|| (!defined $opt_p) || (!defined $opt_r)|| (!defined $opt_g)) {\n    die \"************************************************************************\n    Usage: perl $0 -i blast.out -p polyploid -r ref.gff3 -g target.gff3 \n      -h : help and usage.\n      -i : blast.out\n      -p : number of alleles\n      -r : reference.gff3, annotation from close relative species\n      -g : target.gff3, annotation from target species\n\n************************************************************************\\n\";\n}\n\n### Parameter reading\nmy $blast     = $opt_i;\nmy $polyn     = $opt_p;\nmy $rGFF      = $opt_r;\nmy $tGFF      = $opt_g;\nmy $geneTable = \"Allele.gene.table\";\nmy $ctgTable  = \"Allele.ctg.table\";\n\nmy %infordb;\nmy $count = 0;\nopen(IN, \"sort -k2,2 -k12,12nr $blast|\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data  = split(/\\s+/,$_);\n\tmy $tgene = $data[0];\n\tmy $rgene = $data[1];\n\tmy $bits  = $data[11];\n\tif(!exists($infordb{$rgene})){\n\t\t$count = 1;\n\t\t$infordb{$rgene}->{$count} = $tgene;\n\t}else{\n\t\t$count++;\n\t\tnext if($count>$polyn);\n\t\t$infordb{$rgene}->{$count} = $tgene;\n\t\t}\n\t\n\t}\nclose IN;\n\nmy %tdb;  ### store target genome gff information, e.g het rice\nopen(IN, \"awk '\\$3==\\\"gene\\\"' $tGFF | \") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data     = split(/\\s+/,$_);\n\tmy $tgene    = $1 if(/Name=(\\S+)/);\n\t   $tgene    =~ s/;.*//g;\n\t$tdb{$tgene} = $data[0];\n\t}\nclose IN;\n\n\nopen(OUT, \"> $geneTable\") or die\"\";\nopen(IN, \"awk '\\$3==\\\"gene\\\"' $rGFF |sort -k1,1 -k4,4n |\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data  = split(/\\s+/,$_);\n\tmy $rgene = $1 if(/Name=(\\S+)/);\n\t$rgene    =~ s/;.*//g;\n\tnext if(!exists($infordb{$rgene}));\n\tprint OUT \"$rgene\t$data[0]\t$data[3]\t\";\n\tforeach my $i(sort {$a<=>$b} keys %{$infordb{$rgene}}){\n\t\tmy $tgene = $infordb{$rgene}->{$i};\n\t\t   
$tctg  = $tdb{$tgene}; \n\t\tprint OUT \"$tgene,$tctg\t\";     ###print out target gene order and contig name\n\t\t}\n\tprint OUT \"\\n\";\n\t}\nclose IN;\n\nclose OUT;\n\n\nmy %alleledb;\nmy $ln = 0; ###store line number\nopen(IN, \"Allele.gene.table\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\t$ln++;\n\tmy @data = split(/\\s+/,$_);\n\tmy %tmpdb = ();\n\tforeach my $i(3..$#data){\n\t\tmy $ctg = (split/,/,$data[$i])[1];\n\t\t$tmpdb{$ctg}++;\n\t\t}\n\tmap {$alleledb{$ln}->{'ctg'} .= $_.\" \"} keys %tmpdb;\n\t$alleledb{$ln}->{'chrn'}      = $data[1];\n\t$alleledb{$ln}->{'posi'}      = $data[2];\n\t}\nclose IN;\n\nopen(OUT, \"> remove.log\") or die\"\";\nmy %removedb = ();\nfor(my $i=2;$i<=$ln;$i++){\n\tmy $chrI = $alleledb{$i}->{'chrn'};\n\tmy $ctgI = $alleledb{$i}->{'ctg'};\n\tmy $chrR; my $ctgR; my $R;\n\tfor(my $j=1;$j<$i;$j++){\n\t\tnext if(exists($removedb{$j}));\n\t\tmy $chrJ = $alleledb{$j}->{'chrn'};\n\t\tnext if($chrI ne $chrJ);\n\t\tmy $ctgJ = $alleledb{$j}->{'ctg'};\n\t\tmy $flag = & compare($ctgI,$ctgJ);\n\t\tprint OUT \"$i\t$chrI\t$ctgI\t$j\t$chrJ\t$ctgJ\t$flag\\n\" if($flag==1);\n### flag=1, remove\n\t\t$removedb{$i}++ if($flag==1);\n\t\t}\n\t\n\t}\nclose OUT;\n\n\nopen(OUT, \">$ctgTable\") or die\"\";\n$ln = 0;\nopen(IN, $geneTable) or die\"\";\nwhile(<IN>){\n\tchomp;\n\t$ln++;\n\tnext if(exists($removedb{$ln}));\n\tmy @data = split(/\\s+/,$_);\n\tprint OUT \"$data[1]\t$data[2]\t\";\n\tforeach my $i(3..$#data){\n\t\tmy $ctg = (split/,/,$data[$i])[1];\n\t\tprint OUT \"$ctg\t\";\n\t\t}\t\n\tprint OUT \"\\n\";\n\t}\nclose IN;\nclose OUT;\n\n\n\nsub compare{\n\tmy $ctgT   = shift;\n\tmy $ctgR   = shift;\n\tmy @ctgTdb = split(/\\s+/,$ctgT);\n\tmy @ctgRdb = split(/\\s+/,$ctgR);\n\tmy %tdb = ();\n\tmy $num_T  = @ctgTdb;\n  map {$tdb{$_}++} @ctgTdb;\n  my $num_S  = 0;   ###Number of Same contigs\n\tmy $num_D  = 0;   ###Number of Different contigs\t  \n  foreach my $ctg(@ctgRdb){\n  \tif(exists($tdb{$ctg})){\n  \t\t$num_S++;\n  \t}else{\n  \t\t$num_D++;\n  \t\t}\n 
 \t} \n \tif($num_S == $num_T){\n \t\treturn 1;\n \t}else{\n \t\treturn 0;\n \t\t}\n\t}\n\n\n\n\n"
  },
  {
    "path": "scripts/filterBAM_forHiC.pl",
    "content": "#!/usr/bin/perl -w\nuse strict;\n\n# Keep only confidently and uniquely mapped BWA alignments from a BAM file.\n#\n# Alignments are streamed from `samtools view file.bam` (no header is written)\n# and a record is copied to out.sam only when all of the following hold:\n#   MAPQ >= 30, XT:A:U present, NM <= 5, XM <= 3, XO <= 2, XG <= 2, no XA tag.\n# Records that lack any of NM/XM/XO/XG are dropped.\n\ndie \"Usage: perl $0 file.bam out.sam\\n\" if(!defined($ARGV[0]) or !defined($ARGV[1]));\nmy ($bam, $sam) = @ARGV;\n\n# Largest value accepted for each numeric tag.\nmy %limit = (NM => 5, XM => 3, XO => 2, XG => 2);\n\nopen(my $out, '>', $sam) or die \"Cannot write $sam: $!\\n\";\n# List-form pipe open: the BAM path is never interpreted by a shell.\nopen(my $in, '-|', 'samtools', 'view', $bam) or die \"Cannot run samtools view $bam: $!\\n\";\nwhile(my $line = <$in>){\n\tchomp $line;\n\tmy @field = split(/\\t/, $line);\n\tnext if(@field < 11);       # not a complete SAM record\n\tnext if($field[4] < 30);    # MAPQ\n\n\t# Optional fields start at column 12. Reading them tag by tag keeps the tests\n\t# from matching inside the read name or the quality string, and captures the\n\t# whole number (NM:i:12 is 12, not 1).\n\tmy %tag;\n\tforeach my $opt (@field[11..$#field]){\n\t\t$tag{$1} = $2 if($opt =~ /^([A-Za-z][A-Za-z0-9]):[A-Za-z]:(.*)$/);\n\t\t}\n\tnext if(!defined($tag{XT}) or $tag{XT} ne \"U\");\n\tnext if(exists($tag{XA}));\n\tmy $pass = 1;\n\tforeach my $name (keys %limit){\n\t\tmy $value = $tag{$name};\n\t\t$pass = 0 if(!defined($value) or $value !~ /^\\d+$/ or $value > $limit{$name});\n\t\t}\n\tnext if(!$pass);\n\tprint $out \"$line\\n\";\n\t}\nclose($in) or die \"samtools view $bam failed\\n\";\nclose($out) or die \"Cannot close $sam: $!\\n\";\n\n\n#Tag\tMeaning\n#NM\tEdit distance\n#MD\tMismatching positions/bases\n#AS\tAlignment score\n#BC\tBarcode sequence\n#X0\tNumber of best hits\n#X1\tNumber of suboptimal hits found by BWA\n#XN\tNumber of ambiguous bases in the reference\n#XM\tNumber of mismatches in the alignment\n#XO\tNumber of gap opens\n#XG\tNumber of gap extensions\n#XT\tType: Unique/Repeat/N/Mate-sw\n#XA\tAlternative hits; format: (chr,pos,CIGAR,NM;)*\n#XS\tSuboptimal alignment score\n#XF\tSupport from forward/reverse alignment\n#XE\tNumber of supporting seeds\n"
  },
  {
    "path": "scripts/gmap2AlleleTable.pl",
    "content": "#!/usr/bin/perl -w\nuse strict;\n\n# Build Allele.ctg.table from a GMAP alignment (gmap.gff3 in the current\n# directory) and a reference annotation.\n#\n# For every gene Name found in gmap.gff3 the contigs (column 1) it aligned to are\n# collected. Each \"gene\" feature of ref.gff3 whose Name was seen then yields one\n# output line: reference sequence, gene start, and the distinct contigs.\n\ndie \"Usage: perl $0 ref.gff3\\n\" if(!defined ($ARGV[0]));\nmy $refGFF = $ARGV[0];\n\nmy %infordb;   # gene name -> distinct contigs, in first-seen order\nmy %seen;      # gene name -> contig -> already recorded\nopen(my $gmap, '<', 'gmap.gff3') or die \"Cannot open gmap.gff3: $!\\n\";\nwhile(my $line = <$gmap>){\n\tchomp $line;\n\tnext if($line !~ /gene/);\n\t# A list-context match leaves $gene undefined when there is no Name attribute;\n\t# a \"my\" declaration with a statement modifier has undefined behaviour (perlsyn).\n\tmy ($gene) = $line =~ /Name=([^;\\n]*)/;\n\tnext if(!defined($gene));\n\tmy $ctg = (split(/\\s+/, $line))[0];\n\tnext if(!defined($ctg) or $ctg eq \"\");\n\tpush @{$infordb{$gene}}, $ctg if(!$seen{$gene}{$ctg}++);\n\t}\nclose $gmap;\n\nopen(my $out, '>', 'Allele.ctg.table') or die \"Cannot write Allele.ctg.table: $!\\n\";\nopen(my $ref, '<', $refGFF) or die \"Cannot open $refGFF: $!\\n\";\nwhile(my $line = <$ref>){\n\tchomp $line;\n\tmy @data = split(' ', $line);\n\tnext if(@data < 4 or $data[2] ne \"gene\");\n\tmy ($gene) = $line =~ /Name=([^;\\s]+)/;\n\tnext if(!defined($gene) or !exists($infordb{$gene}));\n\t# Tab-separated; every field, including the last, is followed by a tab.\n\tprint $out join(\"\\t\", $data[0], $data[3], @{$infordb{$gene}}), \"\\t\\n\";\n\t}\nclose $ref;\nclose($out) or die \"Cannot close Allele.ctg.table: $!\\n\";\n"
  },
  {
    "path": "scripts/gmap2AlleleTableBED.pl",
    "content": "#!/usr/bin/perl -w\nuse strict;\n\n# Build Allele.ctg.table from a GMAP alignment (gmap.gff3 in the current\n# directory) and a reference annotation in BED format.\n#\n# For every gene Name found in gmap.gff3 the contigs (column 1) it aligned to are\n# collected. Each line of ref.bed whose 4th column (text before any ';') was seen\n# then yields one output line: BED column 1, BED column 4, and the distinct contigs.\n\ndie \"Usage: perl $0 ref.bed\\n\" if(!defined ($ARGV[0]));\nmy $refGFF = $ARGV[0];\n\nmy %infordb;   # gene name -> distinct contigs, in first-seen order\nmy %seen;      # gene name -> contig -> already recorded\nopen(my $gmap, '<', 'gmap.gff3') or die \"Cannot open gmap.gff3: $!\\n\";\nwhile(my $line = <$gmap>){\n\tchomp $line;\n\tnext if($line !~ /gene/);\n\t# A list-context match leaves $gene undefined when there is no Name attribute;\n\t# a \"my\" declaration with a statement modifier has undefined behaviour (perlsyn).\n\tmy ($gene) = $line =~ /Name=([^;\\s]+)/;\n\tnext if(!defined($gene));\n\tmy $ctg = (split(/\\s+/, $line))[0];\n\tnext if(!defined($ctg) or $ctg eq \"\");\n\tpush @{$infordb{$gene}}, $ctg if(!$seen{$gene}{$ctg}++);\n\t}\nclose $gmap;\n\nopen(my $out, '>', 'Allele.ctg.table') or die \"Cannot write Allele.ctg.table: $!\\n\";\nopen(my $bed, '<', $refGFF) or die \"Cannot open $refGFF: $!\\n\";\nwhile(my $line = <$bed>){\n\tchomp $line;\n\tmy @data = split(/\\s+/, $line);\n\tnext if(@data < 4);          # no name column on this line\n\t(my $gene = $data[3]) =~ s/;.*//;\n\tnext if(!exists($infordb{$gene}));\n\t# Tab-separated; every field, including the last, is followed by a tab.\n\tprint $out join(\"\\t\", $data[0], $data[3], @{$infordb{$gene}}), \"\\t\\n\";\n\t}\nclose $bed;\nclose($out) or die \"Cannot close Allele.ctg.table: $!\\n\";\n"
  },
  {
    "path": "scripts/link_superscaffold.pl",
    "content": "#!/usr/bin/perl -w\n\nmy %namedb;\nmy %removedb;\nwhile(<DATA>){\n\tchomp;\n\tmy ($id,$name) = (split/\\s+/,$_)[0,1];\n\t$namedb{$name} = $id;\n\tmy @data = split(/\\s+/,$_);\n\tmy $key  = \"\";\n\tforeach my $i (2..$#data){\n\t\tmy ($sa,$sb) = sort ($data[1],$data[$i]);\n\t\t$key   = $sa.\"\t\".$sb;\n\t\t$removedb{$key}++;\n\t\t}\n\t}\n\t\nmy %infordb;\nopen(IN, \"grep -v 'tig' all.clm|\") or die\"\";\n<IN>;\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\tmy $scf1 = $data[0];\n\tmy $scf2 = $data[1];\n\t   $scf1 =~ s/[+|-]//g;\n\t   $scf2 =~ s/[+|-]//g;\n#\tmy ($s1,$s2) = sort ($scf1,$scf2);\n\tmy $key1  = $scf1.\"\t\".$scf2;\n\tmy $key2  = $scf2.\"\t\".$scf1;\n\tnext if(exists($removedb{$key1}));\n\tif(!exists($infordb{$key1})){\n\t\t$infordb{$key1} = $data[4];\n\t}elsif(exists($infordb{$key1}) and $data[4]>$infordb{$key1}){\n\t\t$infordb{$key1} = $data[4];\n\t\t}\n\tif(!exists($infordb{$key2})){\n\t\t$infordb{$key2} = $data[4];\n\t}elsif(exists($infordb{$key2}) and $data[4]>$infordb{$key2}){\n\t\t$infordb{$key2} = $data[4];\n\t\t}\t\n\t}\nclose IN;\n\nmy %bestdb;\nopen(OUT, \"> tmp.txt\") or die\"\";\nforeach my $key (keys %infordb){\n\tmy ($sa,$sb) = split(/\\s+/,$key);\n\tmy $ida      = $namedb{$sa};\n\tmy $idb      = $namedb{$sb};\n\tprint OUT \"$ida\t$idb\t$sa\t$sb\t$infordb{$key}\\n\";\n\tif(!exists($bestdb{$ida}->{$idb})){\n\t\t$bestdb{$ida}->{$idb} = $infordb{$key};\n\t}elsif($infordb{$key}>$bestdb{$ida}->{$idb}){\n\t\t$bestdb{$ida}->{$idb} = $infordb{$key};\n\t\t}\n\tif(!exists($bestdb{$idb}->{$ida})){\n\t\t$bestdb{$idb}->{$ida} = $infordb{$key};\n\t}elsif($infordb{$key}>$bestdb{$idb}->{$ida}){\n\t\t$bestdb{$idb}->{$ida} = $infordb{$key};\n\t\t}\t\n\t}\nclose OUT;\n\nopen(OUT, \"> best_link.txt\") or die\"\";\nmy $ln = 0;\nmy %linkdb;\nopen(IN, \"sort -k5,5nr -k1,1n tmp.txt|\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\t$ln++;\n  my @data = split(/\\s+/,$_);\n  my $ida  = $data[0];\n  my $idb  = $data[1];\n  my $key  = 
$ida.\"\t\".$idb;\n  if($ln==1){\n  \t$linkdb{$key} = $_;\n  \t$tmpdb{$ida}++;\n  \t$tmpdb{$idb}++;\n  }else{\n  \tnext if(exists($tmpdb{$ida}) or exists($tmpdb{$idb}));\n   \t$linkdb{$key} = $_;\n  \t$tmpdb{$ida}++;\n  \t$tmpdb{$idb}++; \t\n  \t}\n\t}\nclose IN;\n\nforeach my $key (keys %linkdb){\n\tprint OUT \"$linkdb{$key}\\n\";\n\t}\n\nclose OUT;\n\n\n### Below are the information that listed allelic super-scaffolds for each target.\n#Format:\n#ID\ttarget\tallelic_superscaffold1\tallelic_superscaffold2 ...\n__DATA__\n1\tgroup1\tgroup2\tgroup4\tgroup6\tgroup8\tgroup9\tgroup11\tgroup14\tgroup15\tgroup16\n2\tgroup2\tgroup1\tgroup3\tgroup4\tgroup5\tgroup6\tgroup7\tgroup8\tgroup9\tgroup10\tgroup11\tgroup12\tgroup13\tgroup14\tgroup15\tgroup16\n3\tgroup3\tgroup2\tgroup5\tgroup7\tgroup9\tgroup10\tgroup11\tgroup12\tgroup13\tgroup14\n4\tgroup4\tgroup1\tgroup3\tgroup6\tgroup8\tgroup9\tgroup11\tgroup14\tgroup15\tgroup16\n5\tgroup5\tgroup2\tgroup3\tgroup7\tgroup9\tgroup10\tgroup11\tgroup12\tgroup13\tgroup14\n6\tgroup6\tgroup1\tgroup2\tgroup4\tgroup8\tgroup9\tgroup11\tgroup14\tgroup15\tgroup16\n7\tgroup7\tgroup2\tgroup3\tgroup5\tgroup9\tgroup10\tgroup11\tgroup12\tgroup13\tgroup14\n8\tgroup8\tgroup1\tgroup3\tgroup4\tgroup6\tgroup9\tgroup11\tgroup14\tgroup15\tgroup16\n9\tgroup9\tgroup1\tgroup3\tgroup4\tgroup5\tgroup6\tgroup7\tgroup8\tgroup2\tgroup10\tgroup11\tgroup12\tgroup13\tgroup14\tgroup15\tgroup16\n10\tgroup10\tgroup2\tgroup3\tgroup5\tgroup7\tgroup9\tgroup11\tgroup12\tgroup13\tgroup14\n11\tgroup11\tgroup1\tgroup3\tgroup4\tgroup5\tgroup6\tgroup7\tgroup8\tgroup9\tgroup10\tgroup2\tgroup12\tgroup13\tgroup14\tgroup15\tgroup16\n12\tgroup12\tgroup2\tgroup3\tgroup5\tgroup7\tgroup9\tgroup10\tgroup11\tgroup13\tgroup14\n13\tgroup13\tgroup12\tgroup2\tgroup3\tgroup5\tgroup7\tgroup9\tgroup10\tgroup11\n14\tgroup14\tgroup12\tgroup2\tgroup3\tgroup5\tgroup7\tgroup9\tgroup10\tgroup11\tgroup16\n15\tgroup15\tgroup1\tgroup2\tgroup4\tgroup6\tgroup8\tgroup9\tgroup14\n16\tgroup16\tgroup1\tgroup2\t
group4\tgroup6\tgroup8\tgroup9\tgroup11\tgroup14\tgroup15\n"
  },
  {
    "path": "scripts/make_bed_around_RE_site.pl",
    "content": "#!/usr/bin/perl -w\nuse strict;\n\n\n# make_bed_around_restriction_site.pl: Make a BED file representing the regions around all occurrences of a restriction site.\n#\n# For syntax, run with no arguments.\n#\n# The output BED file is designed for use with bedtools intersect, as follows:\n# bedtools intersect -abam [SRR.bam] -b [$BED_out] > [SRR.REduced.bam]\n# samtools view -h [SRR.REduced.bam] > [SRR.REduced.sam]\n# This restricts a SAM/BAM file to only include reads close to a restriction site, which is a good way to filter Hi-C data, according to Fig. 1b of this paper:\n# http://www.nature.com/ng/journal/v43/n11/full/ng.947.html\n# Also see PreprocessSAM.pl, which uses the output file.\n#\n# Josh Burton\n# April 2013\n#\n# The last fasta record is processed after the read loop, based on the pull request from @FlyPythons: https://github.com/shendurelab/LACHESIS/pull/45\n\n\n\n\nif ( scalar @ARGV != 3 ) {\n    \n    # Report syntax.\n    print \"\\nmake_bed_around_RE_site.pl\\n\\n\";\n    print \"Find all occurrences of a motif in a genome.  Make a 'POS' file listing these occurrences, and also a BED file representing the regions around these occurrences.\\n\\n\";\n    print \"SYNTAX:\\tmake_bed_around_RE_site.pl <fasta> <motif> <range>\\n\";\n    print \"fasta:\\tA fasta file representing a genome (reference or draft assembly.)\\n\";\n    print \"motif:\\tA motif, typically a restriction site sequence (e.g., HindIII = AAGCTT, NcoI = CCATGG, Dpn1 = GATC).\\n\";\n    print \"range:\\tA number representing how many bp around the sequence to include.  Recommend 500 based on Yaffe & Tanay, Nat. Genetics 2011.\\n\\n\";\n    print \"OUTPUT FILES:\\n\";\n    print \"<fasta>.near_<motif>.<range>.bed\\n\";\n    print \"<fasta>.pos_of_<motif>.txt\\n\";\n    print \"\\n\";\n    exit;\n}\n\n\n\n# Get command-line arguments.\nmy ( $FASTA_in, $motif_seq, $range ) = @ARGV;\n\nmy $verbose = 0;\n\n# Convert the motif from a string into a regex.  Unroll the IUPAC codes from single letters into Perl-parseable regular expressions.\n# The sequence is upper-cased before matching, so the motif is upper-cased too.\nmy $motif_regex = uc $motif_seq;\n$motif_regex =~ s/R/\\[AG\\]/g;\n$motif_regex =~ s/Y/\\[CT\\]/g;\n$motif_regex =~ s/S/\\[CG\\]/g;\n$motif_regex =~ s/W/\\[AT\\]/g;\n$motif_regex =~ s/K/\\[GT\\]/g;\n$motif_regex =~ s/M/\\[AC\\]/g;\n$motif_regex =~ s/B/\\[CGT\\]/g;\n$motif_regex =~ s/D/\\[AGT\\]/g;\n$motif_regex =~ s/H/\\[ACT\\]/g;\n$motif_regex =~ s/V/\\[ACG\\]/g;\n$motif_regex =~ s/N/\\[ACGT\\]/g;\n\n\n# Derive the output filenames.\nmy $BED_out = \"$FASTA_in.near_$motif_seq.$range.bed\";\nmy $POS_out = \"$FASTA_in.pos_of_$motif_seq.txt\";\n\n\n# Number of trailing characters that must be carried over to the next line in order to find instances of the motif that bridge lines in the fasta.\nmy $N_prev_chars = length($motif_seq) - 1;\n\n\nmy $contig_name = '';\nmy $offset = 0;          # number of bases of the current contig read so far\nmy $prev_chars = '';     # upper-cased tail of the sequence read so far (at most $N_prev_chars characters)\nmy @motif_positions;     # 0-indexed motif start positions on the current contig, in ascending order\nmy $N_motifs_found = 0;\n\n\n# Write the BED lines for one finished contig/chromosome.\n#   $bed_fh     - output filehandle\n#   $name       - contig/chromosome name\n#   $contig_len - number of bases read for this contig\n#   $range      - number of bp to include on each side of a motif\n#   $positions  - array ref of ascending 0-indexed motif positions\n# A new interval is started whenever a position lies more than 2*$range after the previous one.\n# Nothing is written when $positions is empty.\nsub write_bed_intervals {\n    my ( $bed_fh, $name, $contig_len, $range, $positions ) = @_;\n    return unless @$positions;\n\n    my $print_interval = sub {\n        my ( $start, $end ) = @_;\n        $start = $range               if $start < $range;\n        $end   = $contig_len - $range if $end > $contig_len - $range; # prevent overflow past the end of the contig/chromosome\n        print {$bed_fh} \"$name\\t\", $start - $range, \"\\t\", $end + $range, \"\\n\";\n    };\n\n    my ( $start, $end ) = ( $positions->[0], $positions->[0] );\n    foreach my $pos (@$positions) {\n        if ( $end + 2*$range < $pos ) {\n            $print_interval->( $start, $end );\n            $start = $pos;\n        }\n        $end = $pos;\n    }\n\n    # Print the final BED line for this contig/chromosome.\n    $print_interval->( $start, $end );\n    return;\n}\n\n\n# Open the input fasta file and read through it line-by-line.\nprint localtime() . \": Reading file $FASTA_in...\\n\";\nopen( my $in,     '<', $FASTA_in ) or die \"Can't find file `$FASTA_in'\";\nopen( my $bed_fh, '>', $BED_out )  or die \"Can't write file `$BED_out': $!\";\nopen( my $pos_fh, '>', $POS_out )  or die \"Can't write file `$POS_out': $!\";\n\nwhile ( my $line = <$in> ) {\n    chomp $line;\n    \n    # If this is a header line, we're done with this contig/chromosome (unless we just started), and start a new contig/chromosome.\n    if ( $line =~ /^\\>(\\S+)/ ) {\n        my $new_name = $1;\n        \n        # Convert the positions found on the (now complete) old contig to BED lines.\n        write_bed_intervals( $bed_fh, $contig_name, $offset, $range, \\@motif_positions );\n        \n        # Get the new contig's name.\n        $contig_name = $new_name;\n        print localtime() . \": $contig_name\\n\" if $verbose;\n        print {$pos_fh} \">$contig_name\\n\";\n        \n        # Reset other contig-related variables.\n        $offset = 0;\n        $prev_chars = '';\n        @motif_positions = ();\n    }\n    \n    # Otherwise, read through this contig/chromosome.\n    else {\n        \n        # Look for instances of this motif in this line of the fasta (including the overlap characters from the previous line, tacked on at the beginning.)\n        my $target_str = $prev_chars . uc $line;\n        \n        while ( $target_str =~ /$motif_regex/g ) {\n            \n            # Every iteration in this loop represents a new match to the motif regex in the target string.\n            my $motif_loc = $-[0];\n            \n            # Adjust the location so it properly describes the 0-indexed motif position in this contig.\n            # Then add it to the list of contig positions at which the motif has been seen.\n            $N_motifs_found++;\n            my $true_motif_loc = $motif_loc + $offset - length $prev_chars;\n            push @motif_positions, $true_motif_loc;\n            \n            print \"$contig_name\\t$offset\\t$prev_chars\\t->\\t$motif_loc\\n\" if $verbose;\n            print {$pos_fh} \"$true_motif_loc\\n\";\n        }\n        \n        # Save the last few characters read so far, so that they can be prepended to the next line in a search for the sequence.\n        # They are taken from the upper-cased target string, so lowercase (soft-masked) sequence bridging two lines is still matched,\n        # and lines shorter than the motif keep the context of the lines before them.\n        if ( $N_prev_chars > 0 ) {\n            $prev_chars = length($target_str) > $N_prev_chars ? substr( $target_str, -$N_prev_chars ) : $target_str;\n        }\n        $offset += length $line;\n    }\n}\n\n# Process the last fasta record.\nwrite_bed_intervals( $bed_fh, $contig_name, $offset, $range, \\@motif_positions );\n\nclose $in;\nclose($bed_fh) or die \"Can't close file `$BED_out': $!\";\nclose($pos_fh) or die \"Can't close file `$POS_out': $!\";\n\n\nprint localtime() . \": Done!  Found $N_motifs_found total instances of motif $motif_seq.  Created files:\\n\";\nprint \"$BED_out\\n$POS_out\\n\";\n"
  },
  {
    "path": "scripts/mc_bam.pl",
    "content": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:a:r:\";\n\n\nif ((!defined $opt_b)|| (!defined $opt_a) ||(!defined $opt_r)) {\n    die \"************************************************************************\n    Usage: mc_bam.pl -b mapping.bam -r groups.asm.fasta -a agp\n      -h : help and usage.\n           This script is used for modification the coordinates \n           in bam based on agp file\n      -b : mapping.bam\n      -r : reference genome, fasta format\n      -a : agp file\n************************************************************************\\n\";\n\n}\n\nmy %posidb = ();\nopen(IN, $opt_a) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\tnext if($data[-1] eq \"map\");\n\tmy $ga   = $data[1]; \n\tmy $gb   = $data[2]; \n\tmy $ta   = $data[6];\n\tmy $tb   = $data[7];\n\tmy $ctg  = $data[5];\n\tif($data[8] eq \"+\"){\n\t\tmy $gi = $ga;\n\t\tforeach my $ti($ta..$tb){\n\t\t\t$posidb{$ctg}->{$ti} = $data[0].\",\".$gi;\n\t\t\t$gi++;\n\t\t\t}\n\t }elsif($data[8] eq \"-\"){\n\t  my $gi = $gb;\n\t  foreach my $ti ($ta..$tb){\n\t  \t\t$posidb{$ctg}->{$ti} = $data[0].\",\".$gi;\n\t  \t\t$gi--;\n\t  \t\t}\n\t  \t}\n\n\t}\nclose IN;\n\n#open(OUT, \"> posi.txt\") or die\"\";\n#foreach my $ctg (keys %posidb){\n#  foreach my $i(sort {$a<=>$b} keys %{$posidb{$ctg}}){\n#\t  print OUT \"$ctg\t$i\t$posidb{$ctg}->{$i}\\n\";\n#\t  }\n#\t}\n#close OUT;\n\nmy $outsam = \"mc.sam\";\nmy $outbam = \"mc.bam\";\nopen(OUT, \"> $outsam\") or die\"\";\nopen(IN, \"samtools view $opt_b |\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\tnext if($data[6] eq \"=\");\n\tmy $ctgA = $data[2]; \n\tmy $tiA  = $data[3]; \n\tmy $ctgB = ($data[6] eq \"=\")?$data[2]:$data[6];\n\tmy $tiB  = $data[7];\n\tmy $gidA; my $giA; my $gidB; my $giB;\n\tnext if(!exists($posidb{$ctgA}->{$tiA}));\n\tnext if(!exists($posidb{$ctgB}->{$tiB}));\n  ($gidA,$giA) = split(/,/,$posidb{$ctgA}->{$tiA});\n\t($gidB,$giB) = 
split(/,/,$posidb{$ctgB}->{$tiB});\n\t$data[2]       = $gidA;\n\t$data[3]       = $giA;\n\t$data[6]       = ($gidB eq $gidA)?\"=\":$gidB;\n\t$data[7]       = $giB;\n\tmap {print OUT \"$_\t\"} @data;\n\tprint OUT \"\\n\";\n\t}\nclose IN;\nclose OUT;\n\nsystem(\"samtools faidx $opt_r\");\nmy $fai = $opt_r.\".fai\";\nsystem(\"samtools view -bt $fai $outsam > $outbam\");\n"
  },
  {
    "path": "scripts/odering2tour.pl",
    "content": "#!/usr/bin/perl -w\nuse strict;\n\n# Convert every \"*_orderings.txt\" file in the current directory into a tour file\n# named \"<prefix>.tour\". The first two columns of each line (contig name and,\n# presumably, its orientation sign) are joined and written on a single line, each\n# entry followed by a tab.\n\nforeach my $file (glob \"*_orderings.txt\"){\n\t# Strip only the literal trailing suffix (dot escaped, anchored at the end).\n\t(my $name = $file) =~ s/_orderings\\.txt$//;\n\t$name .= \".tour\";\n\topen(my $fh,  '<', $file) or die \"Cannot open $file: $!\\n\";\n\topen(my $out, '>', $name) or die \"Cannot write $name: $!\\n\";\n\twhile(my $line = <$fh>){\n\t\tchomp $line;\n\t\tmy ($ctg,$dir) = (split(' ', $line))[0,1];\n\t\tif(!defined($ctg) or !defined($dir)){\n\t\t\t# Blank lines are skipped silently; a line with one column is reported.\n\t\t\twarn \"Skipping incomplete line $. of $file\\n\" if(defined($ctg));\n\t\t\tnext;\n\t\t\t}\n\t\tprint $out \"$ctg$dir\\t\";\n\t\t}\n\tclose $fh;\n\tclose($out) or die \"Cannot close $name: $!\\n\";\n\t}\n\n"
  },
  {
    "path": "scripts/partition.pl",
    "content": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"g:d:b:r:\";\n\n\nif ((!defined $opt_g)|| (!defined $opt_r)) {\n    die \"************************************************************************\n    Usage: perl $0 -g Allele.gene.table -r draft.asm.fasta\n      -h : help and usage.\n      -g : Allele.gene.table \n      -b : optional,default prunning.bam\n      -r : reference ctg assembly\n      -d : optional, default wrk_dir\n************************************************************************\\n\";\n}\n\nmy $bam    = (defined $opt_b)?$opt_b:\"prunning.bam\";\nmy $table  = $opt_g;\nmy $wrkd   = (defined $opt_d)?$opt_d:\"wrk_dir\";\nmy $refSeq = $opt_r;\n\n### Read referece ctg fasta\nmy %refdb = ();\nmy $ctgn;\nopen(IN, $refSeq) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tif(/>/){\n\t\t$ctgn = $_;\n\t\t$ctgn =~ s/>//g;\n\t\t$ctgn =~ s/\\s+//g;\n\t}else{\n\t\t$refdb{$ctgn} .= $_;\n\t\t}\n\t}\nclose IN;\n\nforeach $ctgn (keys %refdb){\n\t$refdb{$ctgn} =~ s/\\s+//g;\n\t}\n\n### Read prunning BAM file\nmy %bamdb = ();\nmy $count = 1;\nmy %rdb;\nopen(IN, \"samtools view $bam |\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy $rname = (split/\\s+/,$_)[0];\n\tnext if(exists($rdb{$rname}));    ### only retain single-end reads\n\t$rdb{$rname}++;        \n\t$bamdb{$count++} = $_;\n\t}\nclose IN;\n### Assign ctgs to pre-defined clusters\n\nmy %ctgdb;\nopen(IN, $table) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\tmy $chrn = $data[1];\n\tforeach my $i(3..$#data){\n\t\tmy $ctg = (split/,/,$data[$i])[1];\n\t\t$ctgdb{$ctg}->{$chrn}++;\n\t\t}\n\t}\nclose IN;\n\nmy %chrdb; ### pre-defined cluster based on chromosomes of close-releative species\nforeach my $ctg (keys %ctgdb){\n\tmy $count = 0;\n\tforeach my $chrn (sort {$ctgdb{$ctg}->{$b}<=>$ctgdb{$ctg}->{$a}} keys %{$ctgdb{$ctg}}){\n\t\t$count++;\n\t\tnext if($count>1);\n#\t\tprint \"$ctg\t$chrn\t$ctgdb{$ctg}->{$chrn}\\n\";\n\t\t$chrdb{$chrn} .= $ctg.\",\";\n\t\t}\n\t}\n\nsystem(\"rm -rf 
$wrkd\");\nsystem(\"mkdir $wrkd\");\nforeach my $chrn (keys %chrdb){\n\tnext if($chrn=~/tig/);\n\tnext if($chrn=~/ctg/);\n\tsystem(\"rm -rf $wrkd/$chrn\");\n\tsystem(\"mkdir $wrkd/$chrn\");\n\tmy @ctgdb  = split(/,/,$chrdb{$chrn});\n\tmy %tmpdb = (); $tmpdb{'='}++; ### need retain intra-contig links\n### output ctg list\tto each cluster\n\topen(my $out, \">$wrkd/$chrn/ctg.list\") or die\"\";\n\tmap {print $out \"$_\\n\";$tmpdb{$_}++} @ctgdb;\n\tclose $out;\n### output ctg sequence to each cluster\n\topen(my $faout, \">$wrkd/$chrn/seq.fasta\") or die\"\";\n\tmap {chomp;print $faout \">$_\\n$refdb{$_}\\n\" if(exists($refdb{$_}))} @ctgdb;\n\tclose $faout;\n### output bam file to each cluster\n\topen(my $bamout, \"> $wrkd/$chrn/sample.clean.sam\") or die\"\";\n\tforeach my $i(keys %bamdb){\n\t\tmy ($c1,$c2) = (split/\\s+/,$bamdb{$i})[2,6];\n\t\tnext if(!exists($tmpdb{$c1}) or !exists($tmpdb{$c2}));\n\t\tprint $bamout \"$bamdb{$i}\\n\";\n\t\t}\n\tclose $bamout;\n\tsystem(\"samtools faidx $wrkd/$chrn/seq.fasta\");\n\tsystem(\"samtools view -bt $wrkd/$chrn/seq.fasta.fai $wrkd/$chrn/sample.clean.sam > $wrkd/$chrn/sample.clean.bam\");\n\tsystem(\"rm $wrkd/$chrn/sample.clean.sam\");\n\t\n\t}\n\n\n\n\n\n"
  },
  {
    "path": "scripts/partition_gmap.pl",
    "content": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"g:d:b:r:l:\";\n\n\nif ((!defined $opt_g)|| (!defined $opt_r)) {\n    die \"************************************************************************\n    Usage: perl $0 -g Allele.ctg.table -r draft.asm.fasta\n      -h : help and usage.\n      -g : Allele.ctg.table \n      -b : optional,default prunning.bam\n      -r : reference ctg assembly\n      -d : optional, default wrk_dir\n      -l : chrn.list\n************************************************************************\\n\";\n}\n\nmy $bam    = (defined $opt_b)?$opt_b:\"prunning.bam\";\nmy $table  = $opt_g;\nmy $wrkd   = (defined $opt_d)?$opt_d:\"wrk_dir\";\nmy $refSeq = $opt_r;\n\nif(!defined $opt_l){\n  system(\"cut -f1 $table |sort -u > chrn.list\");\n  $opt_l = \"chrn.list\";\n  }\nmy %chrnListdb;\nopen(IN, $opt_l) or die\"\";\nwhile(<IN>){\n  chomp;\n  my $chrn = (split/\\s+/,$_)[0];\n  $chrnListdb{$chrn}++;\n  }\nclose IN;\n\n### Read referece ctg fasta\nmy %refdb = ();\nmy $ctgn;\nopen(IN, $refSeq) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tif(/>/){\n\t\t$ctgn = $_;\n\t\t$ctgn =~ s/>//g;\n\t\t$ctgn =~ s/\\s+.*//g;\n\t}else{\n\t\t$refdb{$ctgn} .= $_;\n\t\t}\n\t}\nclose IN;\n\nforeach $ctgn (keys %refdb){\n\t$refdb{$ctgn} =~ s/\\s+//g;\n\t}\n\n### Read prunning BAM file\nmy %bamdb = ();\nmy $count = 1;\nmy %rdb;\nopen(IN, \"samtools view $bam |\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy $rname = (split/\\s+/,$_)[0];\n\tnext if(exists($rdb{$rname}));    ### only retain single-end reads\n\t$rdb{$rname}++;        \n\t$bamdb{$count++} = $_;\n\t}\nclose IN;\n### Assign ctgs to pre-defined clusters\n\nmy %ctgdb;\nopen(IN, $table) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\tmy $chrn = $data[0];\n\tforeach my $i(2..$#data){\n\t\t#my $ctg = (split/,/,$data[$i])[1];\n\t\tmy $ctg  = $data[$i];\n                $ctgdb{$ctg}->{$chrn}++;\n\t\t}\n\t}\nclose IN;\n\nmy %chrdb; ### pre-defined cluster based on chromosomes of 
close-releative species\nforeach my $ctg (keys %ctgdb){\n\tmy $count = 0;\n\tforeach my $chrn (sort {$ctgdb{$ctg}->{$b}<=>$ctgdb{$ctg}->{$a}} keys %{$ctgdb{$ctg}}){\n\t\t$count++;\n\t\tnext if($count>1);\n#\t\tprint \"$ctg\t$chrn\t$ctgdb{$ctg}->{$chrn}\\n\";\n\t\t$chrdb{$chrn} .= $ctg.\",\";\n\t\t}\n\t}\n\nsystem(\"rm -rf $wrkd\");\nsystem(\"mkdir $wrkd\");\nforeach my $chrn (keys %chrdb){\n\tnext if(!exists($chrnListdb{$chrn}));\n\tprint \"Process $chrn ...\\n\";\n\tsystem(\"rm -rf $wrkd/$chrn\");\n\tsystem(\"mkdir $wrkd/$chrn\");\n\tmy @ctgdb  = split(/,/,$chrdb{$chrn});\n\tmy %tmpdb = (); $tmpdb{'='}++; ### need retain intra-contig links\n### output ctg list\tto each cluster\n\topen(my $out, \">$wrkd/$chrn/ctg.list\") or die\"\";\n\tmap {print $out \"$_\\n\";$tmpdb{$_}++} @ctgdb;\n\tclose $out;\n### output ctg sequence to each cluster\n\topen(my $faout, \">$wrkd/$chrn/seq.fasta\") or die\"\";\n\tmap {chomp;print $faout \">$_\\n$refdb{$_}\\n\" if(exists($refdb{$_}))} @ctgdb;\n\tclose $faout;\n### output bam file to each cluster\n\topen(my $bamout, \"> $wrkd/$chrn/prunning.sub.sam\") or die\"\";\n\tforeach my $i(keys %bamdb){\n\t\tmy ($c1,$c2) = (split/\\s+/,$bamdb{$i})[2,6];\n\t\tnext if(!exists($tmpdb{$c1}) or !exists($tmpdb{$c2}));\n\t\tprint $bamout \"$bamdb{$i}\\n\";\n\t\t}\n\tclose $bamout;\n\tsystem(\"samtools faidx $wrkd/$chrn/seq.fasta\");\n\tsystem(\"samtools view -bt $wrkd/$chrn/seq.fasta.fai $wrkd/$chrn/prunning.sub.sam > $wrkd/$chrn/prunning.sub.bam\");\n\tsystem(\"rm $wrkd/$chrn/prunning.sub.sam\");\n\t\n\t}\n\n\n\n"
  },
  {
    "path": "scripts/partition_gmap.py",
    "content": "#!/usr/bin/env python\n\"\"\"Split a contig assembly and its BAM file into one sub-directory per chromosome.\n\nEvery contig in the allele table is assigned to the chromosome it is listed under\nmost often. For each chromosome <chrn> the script writes <workdir>/<chrn>/<chrn>.fa\nand <workdir>/<chrn>/<chrn>.bam.\n\"\"\"\nimport sys\nimport os\nimport argparse\nimport multiprocessing\nimport subprocess\nimport pysam\n\n\ndef get_opt():\n\t\"\"\"Parse the command-line options and return the argparse namespace.\"\"\"\n\tgroup = argparse.ArgumentParser()\n\tgroup.add_argument('-r', '--ref', help='reference contig level assembly', required=True)\n\tgroup.add_argument('-g', '--alleletable', help='Allele.ctg.table', required=True)\n\tgroup.add_argument('-b', '--bam', help='bam file, default: prunning.bam', default='prunning.bam')\n\tgroup.add_argument('-d', '--workdir', help='work directory, default: wrk_dir', default='wrk_dir')\n\tgroup.add_argument('-t', '--thread', help='threads, default: 10', type=int, default=10)\n\treturn group.parse_args()\n\n\ndef read_fasta(in_fa):\n\t\"\"\"Return a dict mapping each sequence id (first word of its header, without '>') to its sequence.\"\"\"\n\tfa_db = {}\n\tseq_id = None\n\twith open(in_fa, 'r') as fin:\n\t\tfor line in fin:\n\t\t\tif line[0] == '>':\n\t\t\t\tseq_id = line.strip().split()[0][1:]\n\t\t\t\tfa_db[seq_id] = []\n\t\t\telif seq_id is not None:\n\t\t\t\t# text before the first header belongs to no record and is ignored\n\t\t\t\tfa_db[seq_id].append(line.strip())\n\tfor seq_id in fa_db:\n\t\tfa_db[seq_id] = ''.join(fa_db[seq_id])\n\n\treturn fa_db\n\n\ndef load_allele(allele_table):\n\t\"\"\"Assign every contig of the allele table to one chromosome.\n\n\tLines whose first column starts with tig/scaffold/utg/ctg, and blank lines, are\n\tignored. A contig (columns 3 and later) goes to the chromosome it is listed\n\tunder most often.\n\n\tReturns (ctg_on_chr, chr_contain_ctg): contig -> chromosome, and\n\tchromosome -> {contig: 1}.\n\t\"\"\"\n\tctg_on_chr = {}\n\tchr_contain_ctg = {}\n\twith open(allele_table, 'r') as fin:\n\t\tfor line in fin:\n\t\t\tdata = line.strip().split()\n\t\t\tif not data:\n\t\t\t\tcontinue\n\t\t\tchrn = data[0]\n\t\t\tif chrn.startswith(('tig', 'scaffold', 'utg', 'ctg')):\n\t\t\t\tcontinue\n\t\t\tfor ctg in data[2:]:\n\t\t\t\tcounts = ctg_on_chr.setdefault(ctg, {})\n\t\t\t\tcounts[chrn] = counts.get(chrn, 0) + 1\n\tfor ctg in ctg_on_chr:\n\t\tmax_chr = \"\"\n\t\tmax_cnt = 0\n\t\tfor chrn in ctg_on_chr[ctg]:\n\t\t\tif ctg_on_chr[ctg][chrn] > max_cnt:\n\t\t\t\tmax_cnt = ctg_on_chr[ctg][chrn]\n\t\t\t\tmax_chr = chrn\n\t\tctg_on_chr[ctg] = max_chr\n\t\tif max_chr not in chr_contain_ctg:\n\t\t\tchr_contain_ctg[max_chr] = {}\n\t\tchr_contain_ctg[max_chr][ctg] = 1\n\treturn ctg_on_chr, chr_contain_ctg\n\n\ndef split_files(chrn, allele_table, ref, bam_file, wrkdir):\n\t\"\"\"Write <wrkdir>/<chrn>/<chrn>.fa and <chrn>.bam for the contigs assigned to chrn.\n\n\tA read is kept when it maps to a contig of chrn and its mate's contig is also\n\tassigned to chrn. A group with no assigned contigs is skipped, and contigs\n\tmissing from the reference or from the BAM header are left out.\n\t\"\"\"\n\tctg_on_chr, chr_contain_ctg = load_allele(allele_table)\n\tif chrn not in chr_contain_ctg:\n\t\t# e.g. a contig-named first column, which load_allele() ignores\n\t\tprint(\"\\tSkipping %s: no contigs assigned\"%chrn)\n\t\treturn\n\n\twrk_dir = os.path.join(wrkdir, chrn)\n\tif not os.path.exists(wrk_dir):\n\t\tos.mkdir(wrk_dir)\n\n\tprint(\"\\tDealing %s\"%chrn)\n\tfa_db = read_fasta(ref)\n\n\tsub_bam = os.path.join(wrk_dir, chrn+'.bam')\n\tsub_fa = os.path.join(wrk_dir, chrn+'.fa')\n\twith open(sub_fa, 'w') as fout:\n\t\tfor ctg in chr_contain_ctg[chrn]:\n\t\t\tif ctg in fa_db:\n\t\t\t\tfout.write(\">%s\\n%s\\n\"%(ctg, fa_db[ctg]))\n\n\twith pysam.AlignmentFile(bam_file, 'rb') as fin:\n\t\t# fetch() raises for a contig name that is absent from the BAM header\n\t\tbam_refs = set(fin.references)\n\t\twith pysam.AlignmentFile(sub_bam, 'wb', template=fin) as fout:\n\t\t\tfor ctg in chr_contain_ctg[chrn]:\n\t\t\t\tif ctg not in bam_refs:\n\t\t\t\t\tcontinue\n\t\t\t\tfor read in fin.fetch(contig=ctg):\n\t\t\t\t\tmate_ctg = read.next_reference_name\n\t\t\t\t\tif mate_ctg and ctg_on_chr.get(mate_ctg) == chrn:\n\t\t\t\t\t\tfout.write(read)\n\n\ndef partition_gmap(ref, allele_table, bam, wrkdir, threads):\n\t\"\"\"Index the BAM if needed, then run split_files() for every group in a process pool.\n\n\tExits with status -1 when no group is found or the BAM cannot be indexed, and\n\traises Exception after all groups were tried if any of them failed.\n\t\"\"\"\n\tif not os.path.exists(wrkdir):\n\t\tos.mkdir(wrkdir)\n\n\tprint(\"Getting groups\")\n\t# Use the grouping split_files() relies on, so that only chromosomes which\n\t# actually received contigs are scheduled.\n\t_, chrn_db = load_allele(allele_table)\n\tif not chrn_db:\n\t\tprint(\"Fatal: no groups found in %s\"%allele_table)\n\t\tsys.exit(-1)\n\n\tbai = bam+'.bai'\n\tif not os.path.exists(bai):\n\t\tprint(\"BAI file not found, starting index...\")\n\t\t# argument list instead of a shell string: the path is passed through verbatim\n\t\ttry:\n\t\t\tret = subprocess.call(['samtools', 'index', bam])\n\t\texcept OSError:\n\t\t\tret = -1\n\t\tif ret==0:\n\t\t\tprint(\"Index success\")\n\t\telse:\n\t\t\tprint(\"Fatal: bam file must be sorted\")\n\t\t\tsys.exit(-1)\n\n\tprint(\"Splitting files\")\n\t# never more workers than groups, and at least one\n\tthreads = max(1, min(threads, len(chrn_db)))\n\tpool = multiprocessing.Pool(processes=threads)\n\tresult_list = list()\n\tfor chrn in chrn_db:\n\t\tresult_list.append([chrn, pool.apply_async(split_files, (chrn, allele_table, ref, bam, wrkdir,))])\n\tpool.close()\n\tpool.join()\n\n\terror_list = list()\n\tfor chrn, result in result_list:\n\t\ttry:\n\t\t\tresult.get()\n\t\texcept Exception as e:\n\t\t\tprint('Exception raised when dealing with {}: {}'.format(chrn, e))\n\t\t\terror_list.append(chrn)\n\n\tif error_list:\n\t\traise Exception(\"{} exception(s) detected in : {}\".format(len(error_list), ', '.join(error_list)))\n\n\tprint(\"Notice: If you got errors of \\\"Length mismatch\\\" during allhic extract, it is normal because we split bam with the same header, it will not effect the result\")\n\tprint(\"Finished\")\n\n\nif __name__ == '__main__':\n\topts = get_opt()\n\tref = opts.ref\n\tallele_table = opts.alleletable\n\tbam = opts.bam\n\twrkdir = opts.workdir\n\tthreads = opts.thread\n\tpartition_gmap(ref, allele_table, bam, wrkdir, threads)\n\n"
  },
  {
    "path": "scripts/prune.pl",
    "content": "#!/usr/bin/perl -w\n\n\nuse Getopt::Std;\ngetopts \"i:b:r:\";\n\n\nif ((!defined $opt_i)|| (!defined $opt_b)|| (!defined $opt_r)) {\n    die \"************************************************************************\n    Usage: perl $0 -i Allele.ctg.table -b bam.list -r draft.asm.fasta\n      -h : help and usage.\n      -i : Allele.ctg.table \n      -b : bam.list, a file contains input bam files\n      -r : draft.sam.fasta\n************************************************************************\\n\";\n}\n\nmy $bamfile = $opt_b;\nmy $table   = $opt_i;\nmy $refSeq  = $opt_r;\n### Read bam files\n\nmy %pairdb = ();\nmy %ctgdb  = ();\nmy %bamdb  = ();\nopen(IN, $bamfile) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy $bam = $_;\n\t   $bam =~ s/\\s+//g;\n\tnext if(!($bam =~ /.bam/));\n\t$bamdb{$bam}++;\n\topen(my $fh, \"samtools view $bam |\") or die\"\";\n\twhile(<$fh>){\n\t\tchomp;\n\t\tmy @data = split(/\\s+/,$_);\n\t\tmy $ctg1 = $data[2];\n\t\tmy $ctg2 = $data[6];\n\t\tnext if($ctg2 eq \"=\");\n\t\tmy ($sa,$sb) = sort ($ctg1,$ctg2);\n\t\t$pairdb{$sa}->{$sb}  .= $data[0].\",\";\n\t\t$ctgdb{$ctg1}++; $ctgdb{$ctg2}++;\n\t\t}\n\tclose $fh;\n\t}\nclose IN;\n\n### Read allele information\n### Remove signal between alleles\nopen(OUT1, \">removedb_Allele.txt\") or die\"\";\nopen(LOG, \"> log.txt\") or die\"\";\nopen(IN, $table) or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data     = split(/\\s+/,$_);\n\tnext if(@data<=3);\n\tmy %tmpdb    = (); ### Record alelle contigs\n\tmy $n        = $#data;\n\tfor(my $i=2;$i<$n;$i++){\n\t\tmy $ctg1 = $data[$i];\n\t\tfor(my $j=$i+1;$j<=$n;$j++){\n\t\t\tmy $ctg2 = $data[$j];\n\t\t\tmy ($sa,$sb) = sort ($ctg1,$ctg2);\n\t\t\tmy $key      = $sa.\",\".$sb;\n\t\t\t$tmpdb{$key}++;\n\t\t\tprint OUT1 \"$sa\t$sb\t$pairdb{$sa}->{$sb}\\n\" if(exists($pairdb{$sa}->{$sb}));\n\t\t\t}\n\t\t}\n\tprint LOG \">$_\\n\";\n\tforeach my $i(2..$#data){\n\t\tmy $ctg1    = $data[$i];\n\t\tforeach my $ctg2 (keys %ctgdb){\n\t\t\tmy ($sa,$sb) = sort 
($ctg1,$ctg2);\n\t\t\tmy $key      = $sa.\",\".$sb;\n\t\t\tnext if(exists($tmpdb{$key})); \n\t\t\tnext if(!exists($pairdb{$sa}->{$sb}));\n\t\t\tmy @rnamedb = split(/,/,$pairdb{$sa}->{$sb});\n\t\t\tmy $num_r   = @rnamedb;\n\t\t\tprint LOG \"$ctg2\t$ctg1\t$num_r\t$pairdb{$sa}->{$sb}\\n\";\n\t\t\t}\n\t\t}\n\t}\nclose IN;\nclose OUT1;\nclose LOG;\n\n### Remove signal which are not best match with listed alleles (ctgs)\nopen(OUT2, \"> removedb_nonBest.txt\") or die\"\";\nopen(IN, \"log.txt\") or die\"\";\n$/='>';\n<IN>;\nwhile(<IN>){\n\tchomp;\n\tmy %hashdb = ();\n\tmy ($name,$info) = split(/\\n/,$_,2);\n\tmy @linedb   = split(/\\n/,$info);\n\tforeach my $line(@linedb){\n\t\tmy @data   = split(/\\s+/,$line);\n\t\tif(!exists($hashdb{$data[0]})){\n\t\t  $hashdb{$data[0]}->{'retain'} = $data[1];\n\t\t  $hashdb{$data[0]}->{'num'}    = $data[2];\t\t\t\t\n\t\t}elsif(exists($hashdb{$data[0]}) and $data[2]>$hashdb{$data[0]}->{'num'}){\n\t\t  $hashdb{$data[0]}->{'retain'} = $data[1];\n\t\t  $hashdb{$data[0]}->{'num'}    = $data[2];\t\t\t\t\n\t \t}\n\t }\n\tforeach $line (@linedb){\n\t\t@data = split(/\\s+/,$line);\n\t\tif($hashdb{$data[0]}->{'retain'}  eq $data[1]){\n#\t\t\tprint OUT2 \"$data[0]\t$data[1]\t$data[2]\tretain\t$data[3]\\n\";\n      next;\n\t\t}else{\n\t\t\tprint OUT2 \"$data[0]\t$data[1]\t$data[2]\tremove\t$data[3]\\n\";\n\t\t\t}\n\t\t}\t\n\t}\nclose IN;\nclose OUT2;\nsystem(\"remove_reads.pl\");\n### Reading removed reads\n#my %removedb = ();\n#open(IN, \"removedb_Allele.txt\") or die\"\";\n#my $content = <IN>;\n#my @linedb  = split(/\\n/,$content);\n#foreach my $line (@linedb){\n#\tmy $info    = (split/\\s+/,$line)[2];\n#\tmy @rnamedb = split(/,/,$info);\n#\tmap {$removedb{$_}++} @rnamedb;\n#\t\n#\t}\n#close IN;\n#\n#open(IN, \"removedb_nonBest.txt\") or die\"\";\n#$content = <IN>;\n#@linedb  = split(/\\n/,$content);\n#foreach my $line (@linedb){\n#\tmy $info  = (split/\\s+/,$line)[4];\n#\tmy @rnamedb = split(/,/,$info);\n#\tmap {$removedb{$_}++} 
@rnamedb;\t\n#\t}\n#close IN;\n\n#my $num_of_remove_reads = keys %removedb;\n#print \"Removing $num_of_remove_reads reads\\n\";\n\n#open(OUT, \"> prunning.sam\") or die\"\";\n#foreach my $bam (keys %bamdb){\n#\topen(my $fh, \"samtools view $bam |\") or die\"\";\n#\t$content = <$fh>;\n#\t@linedb  = split(/\\n/,$content);\n#\tforeach my $line (@linedb){\n#\t\tmy $rname = (split/\\s+/,$line)[0];\n#\t\tnext if(exists($removedb{$rname}));\n#\t\tprint OUT \"$line\\n\";\n#\t\t}\n#\tclose $fh;\n#\t}\n#close OUT;\n\nsystem(\"samtools faidx $refSeq\");\nmy $fai  = $refSeq.\".fai\";\nsystem(\"samtools view -bt $fai prunning.sam > prunning.bam\");\n\n\n\n\n"
  },
  {
    "path": "scripts/ragoo2ALLHiC.pl",
    "content": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"l:r:b:e:\";\n\n\nif ((!defined $opt_l)|| (!defined $opt_r) ||(!defined $opt_b)) {\n    die \"************************************************************************\n    Usage: perl ragoo2ALLHiC -l orderings.list -r draft.asm.fasta -b sample.clean.bam \n      -h : help and usage.\n      -l : ordering.list contains a list of output files from ragoo\n      -r : draft contig assembly\n      -b : sample.clean.bam\n      -e : restriction sites, optional, default GATC\n           MboI: GATC; HindIII: AAGCTT\n************************************************************************\\n\";\n}else{\n  print \"************************************************************************\\n\";\n  print \"Version demo\\n\";\n  print \"Copyright to Tanger\\n\";\n  print \"RUNNING...\\n\";\n  print \"************************************************************************\\n\";\n\t\n\t}\n\n$opt_e = (defined $opt_e)?$opt_e:\"GATC\";\n\n\nif(!(-e \"draft.asm.fasta\")){\n\tsystem(\"ln -s $opt_r ./draft.asm.fasta\");\n}else{\n\tprint \"check draft.asm.fasta file, exist\\n\";\n\t}\n\nif(!(-e \"sample.clean.bam\")){\n\tsystem(\"ln -s $opt_b ./sample.clean.bam\");\n}else{\n\tprint \"check sample.clean.bam file, exist\\n\";\n\t}\n\n\nmy $num_g = 0;\nmy %cntdb = ();\nopen(IN, $opt_l) or die\"\";\nwhile(<IN>){\n\tchomp;\n\t$num_g++;\n\tmy @linedb = split(/\\n/,$_);\n\tforeach my $file (@linedb){\n\t\t$gid = (split/\\//,$file)[-1];\n\t\t$gid =~ s/_orderings.txt//g;\n\t\topen(my $fh, $file) or die\"\";\n\t\twhile(<$fh>){\n\t\t\tchomp;\n\t\t\tmy $ctg = (split/\\s+/,$_)[0];\n\t\t\t$cntdb{$gid}->{$ctg}++;\n\t\t\t}\n\t\tclose $fh;\n\t\t}\n\t}\nclose IN;\n\n\nopen(OUT, \">clusters.txt\") or die\"\";\nprint OUT \"#Group\tnContigs\tContigs\\n\";\nforeach my $g (sort keys %cntdb){\n\tmy $num = keys %{$cntdb{$g}};\n\tprint OUT \"$g\t$num\t\";\n\tforeach my $c (keys %{$cntdb{$g}}){\n\t\tprint OUT \"$c\t\";\n\t\t}\n\tprint OUT 
\"\\n\";\n\t}\nclose OUT;\n\nprint \"#### Counting restriction sites from draft assembly\\n\";\nprint \"allhic extract sample.clean.bam draft.asm.fasta --RE $opt_e\\n...\\n\\n\";\nsystem(\"allhic extract sample.clean.bam draft.asm.fasta --RE $opt_e\");\n\nprint \"### Rescue unanchored contigs\\n\";\nmy $countRE = \"sample.clean.counts_\".$opt_e.\".txt\";\nprint \"ALLHiC_rescue -r draft.asm.fasta -b sample.clean.bam -c clusters.txt -i $countRE\\n...\\n\\n\";\nsystem(\"ALLHiC_rescue -r draft.asm.fasta -b sample.clean.bam -c clusters.txt -i $countRE -m 1\");\n\nforeach my $i (1..$num_g){\n\tmy $gn = \"group\".$i.\".txt\";\n\tprint \"### Scaffolding $gn\\n\";\n\tprint \"allhic optimize $gn sample.clean.clm\\n...\\n\\n\";\n\tsystem(\"allhic optimize $gn\tsample.clean.clm\");\n\t}\n\nprint \"### Build ALLHiC assembly\\n\";\nsystem(\"ALLHiC_build draft.asm.fasta\");\n\nsystem(\"Done ...\\n\");\n\n"
  },
  {
    "path": "scripts/release3DDNA.pl",
    "content": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 No_of_chr seq.FINAL.fasta\\n\" if(!defined ($ARGV[0]) or !defined($ARGV[1]));\nmy $Kchr = $ARGV[0];\n\nopen(IN, $ARGV[1]) or die\"\";\n$/='>';\n<IN>;\nwhile(<IN>){\n\tchomp;\n\tmy ($name,$seq) = split(/\\n/,$_,2);\n\t$seq =~ s/\\s+//g;\n\tmy $len = length $seq;\n\t$infordb{$name}->{'seq'} = $seq;\n\t$infordb{$name}->{'len'} = $len;\n\t}\nclose IN;\n\nopen(OUT, \"> chr.fasta\") or die\"\";\nmy $count = 0;\nforeach my $scaf (sort {$infordb{$b}->{'len'}<=>$infordb{$a}->{'len'}} keys %infordb){\n\t$count++;\n\tmy $chrname = \"\";\n\tif($count<=$Kchr){\n\t\t$chrname = 'Chr'.$count;\n\t}else{\n\t\t$chrname = 'scaffold'.$count;\n\t\t}\n\tprint OUT \">$chrname\\n$infordb{$scaf}->{'seq'}\\n\";\n\t}\n\nclose OUT;\n\n\nmy $ctgn = 0;\nopen(OUT, \">tig.HiCcorrected.fasta\") or die\"\";\nopen(IN, \"chr.fasta\") or die\"\";\n$/='>';\n<IN>;\nwhile(<IN>){\n\tchomp;\n\tmy ($chrn,$seq) = split(/\\n/,$_,2);\n\tprint \"Process $chrn\\n\";\t\n\t$seq            =~ s/N/\\n/g;\n\tmy $tour        = \"\";\n\tmy $ctgname     = \"\";\n\tmy $otour       = $chrn.\".tour\";\n\tmy @seqdb = split(/\\n/,$seq);\n\tforeach my $i (0..$#seqdb){\n\t\tnext if ($seqdb[$i] eq \"\");\n\t\t$ctgn++;\n\t\t$ctgn = sprintf(\"%07d\",$ctgn);\n\t\t$ctgname = \"tig\".$ctgn;\n\t\t$tour      .= $ctgname.\"+ \";\n\t\tprint OUT \">$ctgname\\n$seqdb[$i]\\n\";\n\t\t}\n\tnext if($chrn =~ /scaffold/);\n\topen(my $out, \">$otour\") or die\"\";\n\tprint $out \">$chrn\\n$tour\\n\";\n\tclose $out;\n\t}\nclose IN;\nclose OUT;\n\nsystem(\"ALLHiC_build tig.HiCcorrected.fasta\");\n"
  },
  {
    "path": "scripts/remove_reads.pl",
    "content": "#!/usr/bin/perl -w\n\nmy %bamdb  = ();\nopen(IN, \"bam.list\") or die\"\";\nwhile(<IN>){\n        chomp;\n        my $bam = $_;\n           $bam =~ s/\\s+//g;\n        next if(!($bam =~ /.bam/));\n        $bamdb{$bam}++;\n\n        }\nclose IN;\n\n\nmy %removedb = ();\nopen(IN, \"removedb_Allele.txt\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy $info    = (split/\\s+/,$_)[2];\n\tmy @rnamedb = split(/,/,$info);\n\tmap {$removedb{$_}++} @rnamedb;\n\t}\nclose IN;\n\nopen(IN, \"removedb_nonBest.txt\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy $info  = (split/\\s+/,$_)[4];\n\tmy @rnamedb = split(/,/,$info);\n\tmap {$removedb{$_}++} @rnamedb;\n\t}\nclose IN;\n\nmy $num_of_remove_reads = keys %removedb;\nprint \"Removing $num_of_remove_reads reads\\n\";\n\n\nopen(OUT, \"> prunning.sam\") or die\"\";\nforeach my $bam (keys %bamdb){\n        open(my $fh, \"samtools view $bam|\") or die\"\";\n        while(<$fh>){\n        \tchomp;\n                my @data  = split(/\\s+/,$_);\n        \tmy $rname = (split/\\s+/,$_)[0];\n                my $ctg2  = $data[6];\n                next if($ctg2 eq \"*\");\n        \tprint OUT \"$_\\n\" if(!exists($removedb{$rname}));\n        \t}\n        close $fh;\n        }\nclose OUT;\n\n\n\n"
  },
  {
    "path": "scripts/remove_small_contigs.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\n# Author: Xiaofei Zeng\n# Email: xiaofei_zeng@whu.edu.cn\n# Created Time: 2021-04-16 18:16\n\nimport argparse\n\n\ndef assembly_to_groups(assembly, len_cutoff):\n    ctg_dict = dict()\n    cluster_list = list()\n    small_frag = set()\n    with open(assembly) as f:\n        for line in f:\n            if not line.strip():\n                continue\n            cols = line.split()\n            if line.startswith('>'):\n                ctg_dict[cols[1]] = cols[0][1:]\n                if int(cols[2]) < len_cutoff:\n                    small_frag.add(cols[1])\n            else:\n                cluster_list.append([num.strip('-') for num in cols if num.strip('-') not in small_frag])\n    return ctg_dict, cluster_list\n\n\ndef output_clusters(ctg_dict, cluster_list):\n    with open('prunning.clusters.txt', 'w') as f:\n        f.write('#Group\\tnContigs\\tContigs\\n')\n        ngroup = len(cluster_list)\n        for n, nums in enumerate(cluster_list, 1):\n            f.write('{0}g{1}\\t{2}\\t{3}\\n'.format(ngroup, n, len(nums), ' '.join([ctg_dict[num] for num in nums])))\n\n\ndef output_counts(ctg_dict, counts):\n    with open(counts) as fin, open('sub.'+counts, 'w') as fout:\n        for line in fin:\n            if line.startswith('#'):\n                fout.write(line)\n            else:\n                cols = line.split()\n                if cols[0] in ctg_dict.values():\n                    fout.write(line)\n\n\ndef output_fasta(ctg_dict, fasta):\n    output = False\n    with open(fasta) as fin, open('sub.'+fasta, 'w') as fout:\n        for line in fin:\n            if line.startswith('>'):\n                if line.split()[0][1:] in ctg_dict.values():\n                    output = True\n                    fout.write(line)\n                else:\n                    output = False\n            elif output:\n                fout.write(line)\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n    
parser.add_argument('assembly', help='*.review.assembly (output file of juicebox manual grouping), used to generate new prunning.clusters.txt')\n    parser.add_argument('--fasta', default=None, help='input fasta file of contigs, this parameter will remove contigs not in .review.assembly, optional')\n    parser.add_argument('--counts', default=None, help='input prunning.counts_RE.txt, this parameter will remove contigs not in .review.assembly, optional')\n    parser.add_argument('--len_cutoff', default=100, type=float, help='length cutoff, default: %(default)s Kbp')\n    args = parser.parse_args()\n\n    ctg_dict, cluster_list = assembly_to_groups(args.assembly, args.len_cutoff*1000)\n    output_clusters(ctg_dict, cluster_list)\n    if args.fasta:\n        output_fasta(ctg_dict, args.fasta)\n    if args.counts:\n        output_counts(ctg_dict, args.counts)\n\n\nif __name__ == '__main__':\n    main()\n\n"
  },
  {
    "path": "scripts/simuCTG.pl",
    "content": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"i:m:s:\";\n\nif ((!defined $opt_i)|| (!defined $opt_m)  || (!defined $opt_s)) {\n    die \"************************************************************************\n    Usage: perl $0 -i input.fasta -m mean -s SD\n      -h : help and usage.\n      -i : input.fasta, chromosome assembly\n      -m : mean length\n      -s : sd\n************************************************************************\\n\";\n}else{\n  print \"************************************************************************\\n\";\n  print \"Version 1.1\\n\";\n  print \"Copyright to Tanger\\n\";\n  print \"RUNNING...\\n\";\n  print \"************************************************************************\\n\";\n        \n        }\n\nmy $mean = lc $opt_m;\nmy $sd   = lc $opt_s;\n\nif($mean =~ /m/){\n\t$mean =~ s/m//g;\n\t$mean = $mean * 1000000;\n}elsif($mean =~ /k/){\n\t$mean =~ s/k//g;\n\t$mean = $mean * 1000;\n\t}\n\n\nif($sd =~ /m/){\n\t$sd =~ s/m//g;\n\t$sd = $sd * 1000000;\n}elsif($sd =~ /k/){\n\t$sd =~ s/k//g;\n\t$sd = $sd * 1000;\n\t}\n\nprint \"1. 
generate a contig assembly with Average length = $mean bp ...\\n\";\n\nmy %chrdb;\nopen(CTG, \"> chrUn.fasta\") or die\"\";\nopen(OUT, \"> new_genome.posi.bed\") or die\"\";\nopen(IN, $opt_i) or die\"\";\n$/='>';\n<IN>;\nwhile(<IN>){\n\tchomp;\n\tmy ($gene,$seq) = split(/\\n/,$_,2);\n\t$seq =~ s/\\s+//g;\n\tif($gene=~/[C|c]hrUn/){\n\t\tprint CTG \">$gene\\n$seq\\n\";\n\t\tnext;\n\t\t}\n\t$chrdb{$gene}   = $seq;\n\tmy $total_len   = length $seq;\n\tmy $num_seq     = int $total_len/$sd + 500;\n  system(\"echo \\\"data<-rnorm($num_seq,mean=$mean,sd=$sd)\\\" >>Rscript.txt\");\n  system(\"echo \\\"write.table\\(data,file\\=\\'x.txt\\'\\) \\\" >> Rscript.txt\");\n  system(\"chmod +x Rscript.txt\");\n  my $Rcmd = \"R CMD BATCH --no-save ./Rscript.txt\";\n  system($Rcmd);\n  my $start = 0; my $l  =  0; my $end = 0;\n  open(F, \"x.txt\") or die\"\";\n  my $content = <F>;\n  my @linedb = split(/\\n/,$content);\n  foreach my $i(1..$#linedb){\n  \tmy $line     = $linedb[$i];\n  \t$start       = $end+1;\n  \t$l           = (split/\\s+/,$line)[1];\n  \t$l           = int $l;\n#  \tnext if($l<=0);\n    $l           = 0 - $l if($l<0);\n  \tif($end>$total_len){\n  \t\t$end       = $total_len;\n  \t}else{\n  \t\t$end       = $start + $l - 1;\n  \t\t}\n  \tnext if($start>=$total_len);\n  \tprint OUT \"$gene\t$start\t$end\\n\";\n  \t}\n  close F;\n  system(\"rm x.txt\");\n  system(\"rm Rscript.*\");\n\t}\nclose IN;\nclose OUT;\nclose CTG;\n\nmy $count = 0;\nmy %tdb;\nopen(OUT, \"> ctg.tmp.fasta\") or die\"\";\nopen(IN, \"new_genome.posi.bed\") or die\"\";\n$content = <IN>;\n@linedb  = split(/\\n/,$content);\nforeach $line(@linedb){\n\tmy ($chrn,$a,$b) = split(/\\s+/,$line);\n\tmy $L            = $b - $a + 1;\n\tmy $subseq       = substr($chrdb{$chrn},$a-1,$L);\n\tif(!exists($tdb{$chrn})){\n\t\t$count = 0;\n\t\t$tdb{$chrn}++;\n\t\t$count++;\n\t\t$outname       = $chrn.\".ctg\".$count;\n\t}else{\n\t\t$count++;\n\t\t$outname       = $chrn.\".ctg\".$count;\t\n\t\t}\n\tprint OUT 
\">$outname\\n$subseq\\n\";\t\n\t}\nclose OUT;\n\nsystem(\"cat ctg.tmp.fasta chrUn.fasta > ctg.fasta\");\nsystem(\"rm ctg.tmp.fasta\");\n\nprint \"2. get statistics for the contig assembly ...\\n\";\nsystem(\"perl ~/software/script/faSize.pl ctg.fasta\");\n\n$content = `perl ~/software/script/faSize.pl ctg.fasta`;\nmy $N50  = $1 if($content=~/N50:\\s+(\\d+)/);\nmy $ave  = $1 if($content=~/Average\\s+length:\\s+(\\d+)/);\nmy $ctgname = \"ctg.\".\"n\".$N50.\"_m\".$ave.\".fasta\";\nsystem(\"mv ctg.fasta ./$ctgname\");\n"
  },
  {
    "path": "scripts/statAGP.pl",
    "content": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 chr.agp\\n\" if(!defined $ARGV[0]);\nmy $agp = $ARGV[0];\nmy %uctgdb;\nmy %actgdb;\nmy %chrdb;\nmy $sumL = 0;\nmy $sumC = 0;\nmy $sumU = 0;\nopen(IN, \"grep -v 'contig' $agp |grep -v '#'|\") or die\"\";\nwhile(<IN>){\n\tchomp;\n\tmy @data = split(/\\s+/,$_);\n\t$sumC++;\n\t$sumL   += $data[7];\n\tif($data[0] eq $data[5]){\n\t  $uctgdb{$data[5]} = \"Unanchor\"; \n\t  $sumU += $data[7];\n\t}else{\n\t\t$actgdb{$data[5]} = \"Anchor\";\n\t\t$chrdb{$data[0]}->{'ctg'}++;\n\t\t$chrdb{$data[0]}->{'len'} = $data[2];\n\t\t}\n\t}\nclose IN;\n\nmy $numU = keys %uctgdb;\nmy $numA = keys %actgdb;\nmy $sumA = 0;\nprint \"ChrID\tAnchored_ctg\tLength\\n\";\nforeach my $chrn (sort {$chrdb{$b}->{'len'}<=>$chrdb{$a}->{'len'} } keys %chrdb){\n\t$sumA += $chrdb{$chrn}->{'len'};\n\tprint \"$chrn\t$chrdb{$chrn}->{'ctg'}\t$chrdb{$chrn}->{'len'}\\n\";\n\t}\n\nprint \"Total number of contigs (bp): $sumC\\n\";\nprint \"Total length of contigs (bp): $sumL\\n\";\nprint \"Total number of anchored contgis: $numA\\n\";\nprint \"Total length of chromosome level assembly (bp): $sumA\\n\";\nprint \"Number of unanchored contigs: $numU\\n\";\nprint \"Length of unanchored contigs: $sumU\\n\";\nmy $arate = (1-$sumU/$sumL)*100;\n$arate = sprintf(\"%.2f\",$arate);\nprint \"Anchor rate (%): $arate\\n\";\n"
  }
]