Full Code of tangerzhang/ALLHiC for AI

master 4710d96b2872 cached

37 files

10.5 MB

35.9k tokens

14 symbols

1 requests

Download .txt

Repository: tangerzhang/ALLHiC
Branch: master
Commit: 4710d96b2872
Files: 37
Total size: 10.5 MB

Directory structure:
gitextract_i521fsdf/

├── .gitmodules
├── README.md
├── allhic.v0.9.8
├── bin/
│   ├── ALLHiC_build
│   ├── ALLHiC_corrector
│   ├── ALLHiC_partition
│   ├── ALLHiC_pip.sh
│   ├── ALLHiC_plot
│   ├── ALLHiC_prune
│   ├── ALLHiC_rescue
│   └── allhic
└── scripts/
    ├── ALLHiC2ALLMAPS.pl
    ├── PreprocessSAMs.pl
    ├── agp2tour.pl
    ├── bam2CLM.pl
    ├── bam2CLM_simple.pl
    ├── bam2net.pl
    ├── bam_HiCplotter.py
    ├── blastn_parse.pl
    ├── classify.pl
    ├── filterBAM_forHiC.pl
    ├── gmap2AlleleTable.pl
    ├── gmap2AlleleTableBED.pl
    ├── link_superscaffold.pl
    ├── make_bed_around_RE_site.pl
    ├── mc_bam.pl
    ├── odering2tour.pl
    ├── partition.pl
    ├── partition_gmap.pl
    ├── partition_gmap.py
    ├── prune.pl
    ├── ragoo2ALLHiC.pl
    ├── release3DDNA.pl
    ├── remove_reads.pl
    ├── remove_small_contigs.py
    ├── simuCTG.pl
    └── statAGP.pl

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitmodules
================================================
[submodule "src"]
	path = src
	url = https://github.com/tanghaibao/allhic


================================================
FILE: README.md
================================================
# ALLHiC
ALLHiC: phasing and scaffolding polyploid genomes based on Hi-C data  
See wiki for details (https://github.com/tangerzhang/ALLHiC/wiki).
# Note
Please be aware that ALLHiC is no longer maintained. We recommend using two recently released algorithm packages developed by our team, which are reference-free and much faster: 
- **C-Phasing**: [C-Phasing GitHub Repository](https://github.com/wangyibin/CPhasing)  
- **HapHiC**: [HapHiC GitHub Repository](https://github.com/zengxiaofei/HapHiC)


================================================
FILE: bin/ALLHiC_build
================================================
#!/usr/bin/perl -w
# ALLHiC_build: turn every *.tour file in the current directory into AGP
# records (groups.agp) and concatenated group sequences (groups.asm.fasta).
# Contigs that are not placed by any tour are appended as single-contig
# records at the end of both files.
use strict;

die "Usage: perl $0 refSeq.fasta\n" if(!(defined $ARGV[0]));

print "1. tour format to agp ...\n";

my $refSeq = $ARGV[0];
my $gapLen = 100;              # size of the gap written between two contigs
my $Nseq   = "N" x $gapLen;
my %anchordb;                  # contigs placed in at least one tour
my %seqdb;                     # contig name -> sequence without whitespace

# Read the FASTA record by record using '>' as the input record separator.
# The separator is localized so that the line-based reads below still see
# newline-terminated lines.
{
	local $/ = '>';
	open(my $in, '<', $refSeq) or die "Error: cannot open $refSeq: $!\n";
	<$in>;                     # discard everything up to the first '>'
	while(my $record = <$in>){
		chomp $record;         # removes the trailing '>' separator
		my ($ctg,$seq) = split(/\n/,$record,2);
		next if(!defined $ctg);
		$seq = "" if(!defined $seq);
		$ctg =~ s/\s+.*//g;    # keep only the first word of the header line
		$seq =~ s/\s+//g;
		$seqdb{$ctg} = $seq;
		}
	close $in;
}

open(my $agp_out, '>', "groups.agp")       or die "Error: cannot write groups.agp: $!\n";
open(my $seq_out, '>', "groups.asm.fasta") or die "Error: cannot write groups.asm.fasta: $!\n";
foreach my $tour (glob "*.tour"){
	print "Processing $tour ...\n";
	my $gid = $tour;
	$gid    =~ s/\.tour$//;    # strip only the literal file extension

	# The ordering that is used is the last non-blank line of the tour file.
	my $last_line = "";
	open(my $tin, '<', $tour) or die "Error: cannot open $tour: $!\n";
	while(my $line = <$tin>){
		$last_line = $line if($line =~ /\S/);
		}
	close $tin;

	# Each token is a contig name followed by its orientation (+ or -).
	# Tokens that cannot be parsed or name an unknown contig are reported
	# and skipped instead of producing malformed AGP rows.
	my @placed;
	foreach my $item (split(' ',$last_line)){
		if($item =~ /^(.+)([+-])$/){
			my ($ctg,$dir) = ($1,$2);
			if(!exists($seqdb{$ctg})){
				warn "WARNING: $ctg in $tour not found in $refSeq, skipped\n";
				next;
				}
			push @placed, [$ctg,$dir];
			}
		else{
			warn "WARNING: cannot parse '$item' in $tour, skipped\n";
			}
		}

	my $end     = 0;           # last coordinate written so far
	my $count   = 0;           # AGP part number within this group
	my $fullSeq = "";
	foreach my $i (0..$#placed){
		my ($ctg,$dir) = @{$placed[$i]};
		my $len   = length $seqdb{$ctg};
		my $start = $end + 1;
		$end      = $start + $len - 1;
		$count++;
		$anchordb{$ctg}++;
		print $agp_out join("\t",$gid,$start,$end,$count,"W",$ctg,1,$len,$dir),"\n";
		my $seq = uc $seqdb{$ctg};
		if($dir eq "-"){
			# reverse complement for contigs placed on the minus strand
			$seq = reverse $seq;
			$seq =~ tr/ATGC/TACG/;
			}
		$fullSeq .= $seq;
		next if($i == $#placed);   # no gap after the last contig
		$start = $end + 1;
		$end   = $start + $gapLen - 1;
		$count++;
		print $agp_out join("\t",$gid,$start,$end,$count,"U",$gapLen,"contig","yes","map"),"\n";
		$fullSeq .= $Nseq;
		}
	print $seq_out ">$gid\n$fullSeq\n";
	}

# Contigs that were not anchored are written out unchanged.
foreach my $ctg (keys %seqdb){
	next if(exists($anchordb{$ctg}));
	my $len = length $seqdb{$ctg};
	print $agp_out join("\t",$ctg,1,$len,1,"W",$ctg,1,$len,"+"),"\n";
	print $seq_out ">$ctg\n$seqdb{$ctg}\n";
	}

close $agp_out or die "Error: cannot close groups.agp: $!\n";
close $seq_out or die "Error: cannot close groups.asm.fasta: $!\n";



================================================
FILE: bin/ALLHiC_corrector
================================================
#!/usr/bin/env python
import sys
import multiprocessing
import math
import numpy as np
import pysam
import time
import argparse


def time_print(str):
	"""Print a message preceded by the current local time shown in green."""
	stamp = time.strftime('[%H:%M:%S]', time.localtime(time.time()))
	print("\033[32m%s\033[0m %s"%(stamp, str))


def get_opt():
	"""Declare the command-line options and return the parsed arguments."""
	parser = argparse.ArgumentParser()

	# Mandatory input/output files.
	required_opts = (
		("-m", "--mapping", "Input mapping file"),
		("-r", "--reference", "Contig fasta file"),
		("-o", "--output", "Corrected fasta file"),
	)
	for short_flag, long_flag, text in required_opts:
		parser.add_argument(short_flag, long_flag, help=text, required=True)

	# Tunable parameters: (short, long, type, help, default).
	tuning_opts = (
		("-p", "--percent", float, "Percent of the map to saturate, default is 0.95", 0.95),
		("-s", "--sensitive", float, "sensitivity to depletion score, default is 0.5", 0.5),
		("-q", "--mapq", int, "MAPQ of mapping lower bound, default is 1", 1),
		("-w", "--wide", int, "Resolution for first pass search of mismatches, default is 25000 bp", 25000),
		("-n", "--narrow", int, "Resolution for the precise mismatch localizaton, n<w default is 1000 bp", 1000),
		("-d", "--depletion", int, "The size of the region to aggregate the depletion score in the wide path, d >= 2*w, default is 100000 bp", 100000),
		("-t", "--threads", int, "Threads, default is 1", 1),
	)
	for short_flag, long_flag, kind, text, fallback in tuning_opts:
		parser.add_argument(short_flag, long_flag, type=kind, help=text, default=fallback)

	return parser.parse_args()


def get_ctg_len(bam):
	"""Return a dict mapping each 'SN' name in bam.header["SQ"] to its 'LN' value."""
	return {
		record['SN']: record['LN']
		for record in (dict(entry) for entry in bam.header["SQ"])
	}


def get_pos_list(bam_fetch, min_mapq):
	"""Collect [read start, mate start] for usable intra-contig read pairs.

	A record is kept only when both start positions are set (not -1), the
	read and its mate are on the same reference, and its mapq is at least
	min_mapq.
	"""
	def usable(read):
		if read.reference_start == -1 or read.next_reference_start == -1:
			return False
		if read.reference_name != read.next_reference_name:
			return False
		return not read.mapq < min_mapq

	return [
		[read.reference_start, read.next_reference_start]
		for read in bam_fetch
		if usable(read)
	]


def get_hic_list(pos_list, bin_size):
	"""Count position pairs per bin pair.

	Every coordinate is floored to a multiple of bin_size; the result is a
	list of [bin1, bin2, count] in order of first appearance.
	"""
	binned = np.matrix(pos_list)
	binned = binned//bin_size*bin_size

	tally = {}
	for row in range(len(binned)):
		pair = (binned[row, 0], binned[row, 1])
		tally[pair] = tally.get(pair, 0) + 1

	return [[first, second, hits] for (first, second), hits in tally.items()]


def calc_sat_level(hic_list, pct):
	"""Return the pct quantile of the off-diagonal counts in hic_list.

	Entries whose two bins are equal are ignored.  NaN counts are sorted as
	0 and then put back as NaN at the front of the sorted list.  Returns -1
	when no off-diagonal entry exists.
	"""
	off_diag = [rec[2] for rec in hic_list if rec[0] != rec[1]]
	if not off_diag:
		return -1

	nan_total = sum(1 for val in off_diag if math.isnan(val))
	ranked = sorted(0 if math.isnan(val) else val for val in off_diag)
	ranked[:nan_total] = [float('nan')]*nan_total

	if len(ranked) == 1:
		return ranked[0]

	rank = pct*(len(ranked)+1)
	if rank < 1:
		return ranked[0]
	if rank >= len(ranked):
		return ranked[-1]

	# Linear interpolation between the two neighbouring ranks.
	lower = int(rank)
	frac = rank-lower
	return ranked[lower-1]+frac*(ranked[lower]-ranked[lower-1])


def precompute_dep_score(hic_list, bin_size, dep_size, sat_level):
	"""Accumulate capped link counts on the bins strictly between each pair.

	Entries with a NaN count or with end-start larger than dep_size are
	skipped; counts are capped at sat_level.  Returns (scores, smallest
	position, largest position), with 0, 0 when nothing was scored.
	"""
	totals = {}
	for start, end, count in hic_list:
		if math.isnan(count) or end-start > dep_size:
			continue
		capped = sat_level if count >= sat_level else count
		for inner in range(start+bin_size, end, bin_size):
			totals[inner] = totals.get(inner, 0) + capped

	if len(totals) == 0:
		return totals, 0, 0
	return totals, min(totals.keys()), max(totals.keys())


def get_sub_score_db(score_db, min_pos, max_pos, bin_size, dep_size):
	"""Return the scores on a regular grid of positions, using 0 where score_db has no entry.

	The grid runs from min_pos+dep_size-2*bin_size up to (but excluding)
	max_pos-dep_size+3*bin_size in steps of bin_size.
	"""
	first = min_pos+dep_size-2*bin_size
	stop = max_pos-dep_size+3*bin_size
	return {pos: score_db.get(pos, 0) for pos in range(first, stop, bin_size)}


def get_wide_mismatch(score_db, thr, bin_size):
	"""Return [start, end] runs of consecutive sorted positions whose score is below thr.

	A run ends at the first position whose score is not below thr; a run
	still open after the last position ends at that position plus bin_size.
	"""
	regions = []
	run_start = None
	last_pos = None
	for pos in sorted(score_db):
		last_pos = pos
		is_low = score_db[pos] < thr
		if is_low and run_start is None:
			run_start = pos
		elif not is_low and run_start is not None:
			regions.append([run_start, pos])
			run_start = None
	if run_start is not None:
		regions.append([run_start, last_pos+bin_size])
	return regions
	

def get_mismatch(hic_list, bin_size, dep_size, pct, sens, is_wide):
	"""Score the bins of hic_list and, in wide mode, return the low-score regions.

	Returns [] when no saturation level can be computed.  Otherwise returns
	the list of [start, end] regions below the threshold when is_wide is
	true, or the per-position score dict when it is false.
	"""
	sat_level = round(calc_sat_level(hic_list, pct), 5)
	if sat_level == -1:
		return []

	thr = sens*sat_level*0.5*dep_size/bin_size*(dep_size/bin_size-1)
	score_db, min_pos, max_pos = precompute_dep_score(hic_list, bin_size, dep_size, sat_level)
	if score_db:
		score_db = get_sub_score_db(score_db, min_pos, max_pos, bin_size, dep_size)

	if not is_wide:
		return score_db
	if not score_db:
		return []
	return get_wide_mismatch(score_db, thr, bin_size)


def merge_region(wide_list, narrow_score, bin_size):
	"""Refine wide mismatch regions using narrow-resolution scores.

	wide_list is a list of [start, end] pairs, narrow_score a dict mapping
	position -> score, and bin_size the step between narrow positions.
	Inside each wide region the narrow bins whose score equals the tracked
	minimum are selected, and touching bins are merged into [start, end]
	pairs.  wide_list itself is returned when narrow_score is empty or when
	no narrow bin is selected.
	"""
	idx_wide = 0
	min_val = 0
	tmp_list = []
	if narrow_score == {}:
		return wide_list
	for pos in sorted(narrow_score):
		if idx_wide >= len(wide_list):
			break
		# The running minimum is reset while pos has not passed the start of
		# the current wide region, and lowered afterwards.
		if pos <= wide_list[idx_wide][0]:
			min_val = narrow_score[pos]
		else:
			if narrow_score[pos] < min_val:
				min_val = narrow_score[pos]
		# Narrow bins that end at or before the region start are ignored.
		if pos+bin_size <= wide_list[idx_wide][0]:
			continue
		# Once pos reaches the region end, keep every narrow bin of the region
		# whose score equals the minimum and move on to the next region.
		if pos >= wide_list[idx_wide][1]:
			for i in range(wide_list[idx_wide][0], wide_list[idx_wide][1], bin_size):
				if i in narrow_score and narrow_score[i] == min_val:
					tmp_list.append([i, i+bin_size])
			idx_wide += 1
	# The narrow positions ran out before the current region was closed:
	# apply the same selection to that one region.
	if idx_wide < len(wide_list):
		for i in range(wide_list[idx_wide][0], wide_list[idx_wide][1], bin_size):
				if i in narrow_score and narrow_score[i] == min_val:
					tmp_list.append([i, i+bin_size])
	if tmp_list == []:
		return wide_list
	# Merge selected bins that touch each other (end == next start).
	narrow_mismatch = []
	last_e = 0  # 0 doubles as the "no bin seen yet" marker
	for s, e in tmp_list:
		if last_e == 0:
			narrow_mismatch.append([s])
			last_e = e
		else:
			if s != last_e:
				narrow_mismatch[-1].append(last_e)
				narrow_mismatch.append([s])
		last_e = e
	
	# Close the last open interval.
	narrow_mismatch[-1].append(last_e)

	return narrow_mismatch
		

def pipeline(in_bam, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, ctg):
	"""Find mismatch regions on one contig: a wide pass followed by a narrow refinement.

	Returns a list of [start, end] regions, or [] when the contig has no
	usable read pairs or no wide mismatch is found.
	"""
	time_print("\tContig: %s Getting mapping list"%ctg)
	with pysam.AlignmentFile(in_bam, 'rb') as handle:
		pairs = get_pos_list(handle.fetch(contig=ctg), mapq)

	if not pairs:
		time_print("\tContig: %s Could not found mapping list"%ctg)
		return []

	# Wide pass: bins of bin_size, depletion window of dep_size.
	time_print("\tContig: %s Getting hic list with bin size: %d"%(ctg, bin_size))
	wide_hic = get_hic_list(pairs, bin_size)

	time_print("\tContig: %s Getting wide mismatch"%ctg)
	coarse = get_mismatch(wide_hic, bin_size, dep_size, percent, sensitive, True)
	if not coarse:
		time_print("\tContig: %s Could not found mismatch"%ctg)
		return []

	# Narrow pass: bins of narrow_bin_size, and the wide bin size becomes the
	# depletion window.
	time_print("\tContig: %s Getting narrow score with bin size: %d"%(ctg, narrow_bin_size))
	narrow_hic = get_hic_list(pairs, narrow_bin_size)

	fine_scores = get_mismatch(narrow_hic, narrow_bin_size, bin_size, percent, sensitive, False)

	time_print("\tContig: %s Getting narrow mismatch"%ctg)
	refined = merge_region(coarse, fine_scores, narrow_bin_size)
	if refined == coarse:
		time_print("\tContig: %s Wide mismatch without update"%ctg)
	return refined


def _split_contig(ctg, seq, regions):
	"""Cut seq at the given [start, end] regions and return (name, sub-sequence) pairs.

	Each piece is named <ctg>_<first>_<last> using 1-based inclusive
	coordinates.  Region bounds are clamped to the sequence and zero-length
	pieces are dropped so that no empty FASTA record is produced.
	"""
	pieces = []
	base = 0
	for s, e in regions:
		s = min(max(s-1, base), len(seq))
		e = min(max(e-1, s), len(seq))
		if s > base:
			pieces.append(("%s_%d_%d"%(ctg, base+1, s), seq[base: s]))
		if e > s:
			pieces.append(("%s_%d_%d"%(ctg, s+1, e), seq[s: e]))
		base = e
	if base < len(seq):
		# The tail starts one base after the previous piece, hence base+1.
		pieces.append(("%s_%d_%d"%(ctg, base+1, len(seq)), seq[base:]))
	return pieces


def ALLHiC_correct(in_bam, in_fa, out_fa, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, thread):
	"""Detect mismatch regions per contig and write the contigs split at those regions.

	in_bam   -- BAM file; pipeline() fetches reads from it per contig
	in_fa    -- contig FASTA file
	out_fa   -- output FASTA; contigs without mismatches are copied as they are
	mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive
	         -- passed unchanged to pipeline()
	thread   -- number of worker processes
	"""
	time_print("Reading mapping")
	with pysam.AlignmentFile(in_bam, 'rb') as bam:
		ctg_len = get_ctg_len(bam)

	time_print("Running pipeline")
	pool = multiprocessing.Pool(processes=thread)
	res = []
	for ctg in ctg_len:
		r = pool.apply_async(pipeline, (in_bam, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, ctg,))
		res.append([ctg, r])
	pool.close()
	pool.join()

	narrow_mismatch = {}
	for ctg, r in res:
		sub_mismatch = r.get()
		if sub_mismatch != []:
			narrow_mismatch[ctg] = sub_mismatch

	time_print("Found all mismatches")

	time_print("Reading origin fasta")
	fa_db = {}
	seq_id = None
	with open(in_fa, 'r') as fin:
		for line in fin:
			if line[0] == '>':
				seq_id = line.strip().split()[0][1:]
				fa_db[seq_id] = []
			elif seq_id is not None:
				# Text before the first header cannot belong to any record.
				fa_db[seq_id].append(line.strip())

	for seq_id in fa_db:
		fa_db[seq_id] = ''.join(fa_db[seq_id])

	time_print("Writing result")
	with open(out_fa, 'w') as fout:
		for ctg in sorted(fa_db):
			if ctg not in narrow_mismatch:
				fout.write(">%s\n%s\n"%(ctg, fa_db[ctg]))
				continue
			for name, sub_seq in _split_contig(ctg, fa_db[ctg], narrow_mismatch[ctg]):
				fout.write(">%s\n%s\n"%(name, sub_seq))

	time_print("Finished")


if __name__ == "__main__":
	# Parse the command line and hand each option to the matching parameter
	# of ALLHiC_correct (depletion -> dep_size, wide -> bin_size,
	# narrow -> narrow_bin_size).
	opts = get_opt()
	ALLHiC_correct(
		opts.mapping,
		opts.reference,
		opts.output,
		opts.mapq,
		opts.depletion,
		opts.wide,
		opts.narrow,
		opts.percent,
		opts.sensitive,
		opts.threads,
	)




================================================
FILE: bin/ALLHiC_partition
================================================
#!/usr/bin/perl -w
# ALLHiC_partition: run "allhic extract" on a bam file and then
# "allhic partition" on the counts/pairs files derived from its name.
use strict;
use Getopt::Std;

our ($opt_h, $opt_b, $opt_r, $opt_e, $opt_k, $opt_m);
getopts "hb:r:e:k:m:";


if ( $opt_h || (!defined $opt_r)|| (!defined $opt_e)|| (!defined $opt_k)) {
    die "************************************************************************
    Usage: ALLHiC_partition -r draft.asm.fasta -e enzyme_sites -k Num of groups
      -h : help and usage.
      -b : prunned bam (optional, default prunning.bam)
      -r : draft.asm.fasta
      -e : enzyme_sites (HindIII: AAGCTT; MboI: GATC, Arima)
      -k : number of groups (user defined K value)
      -m : minimum number of restriction sites (default, 25)
************************************************************************\n";
}

# Print a shell command, run it, and stop when it reports failure so that the
# next step is not started on missing input.
sub run_cmd {
    my ($cmd) = @_;
    print "CMD: $cmd\n";
    system($cmd) == 0
        or die "Error: command failed (exit status ".($? >> 8)."): $cmd\n";
    return;
}

my $bam     = (defined $opt_b)?$opt_b:"prunning.bam";
my $refSeq  = $opt_r;
my $esites  = uc $opt_e;
$esites     = "AAGCTT" if($esites eq "HINDIII");
$esites     = "GATC" if($esites eq "MBOI");


my $K       = $opt_k;
my $minRes  = (defined $opt_m)?$opt_m:25;
my $runcmd  = "";
print "Extract function: calculate an empirical distribution of Hi-C link size based on intra-contig links\n";
if ($esites eq "ARIMA") {
  $runcmd     = "allhic extract ".$bam." ".$refSeq." --RE='GATCGATC,GANTGATC,GANTANTC,GATCANTC'";
  # the four sites joined by '_' are used in the counts file name below
  $esites = "GATCGATC_GANTGATC_GANTANTC_GATCANTC";
}
else {
  $runcmd     = "allhic extract ".$bam." ".$refSeq." --RE ".$esites;
}
run_cmd($runcmd);


print "Partition contigs based on prunning bam file\n";
# Strip only a literal trailing ".bam"; the dot is escaped and anchored so
# that names such as "sample_bam.bam" keep their stem intact.
(my $prefix = $bam) =~ s/\.bam$//;
my $counts_file = $prefix.".counts_".$esites.".txt";
my $pairs_file  = $prefix.".pairs.txt";
$runcmd         = "allhic partition $counts_file $pairs_file ".$K." --minREs ".$minRes;
run_cmd($runcmd);




================================================
FILE: bin/ALLHiC_pip.sh
================================================
#!/bin/bash

# Print the command-line synopsis and leave the script.
usage()
{
	cat <<EOF
    Usage: `basename $0` -r reference -1 R1.fq -2 R2.fq -k group_count [-e enzyme] [-t threads] [-b bin_size]
          -r: reference genome
          -1: Lib_R1.fq.gz
          -2: Lib_R2.fq.gz
          -k: group_count
          -e: enzyme_sites (HindIII: AAGCTT; MboI: GATC), default: HindIII
          -t: threads, default: 10
          -b: bin_size for hic heatmap, can be divided with comma, default: 500k
EOF
	exit 0
}

### get options
while getopts ':r:1:2:k:e:t:b:' OPT; do
	case $OPT in
		r)
			ref="$OPTARG";;
		1)
			R1="$OPTARG";;
		2)
			R2="$OPTARG";;
		e)
			enzyme="$OPTARG";;
		k)
			group_count="$OPTARG";;
		t)
			threads="$OPTARG";;
		b)
			bin_size="$OPTARG";;
		?)
			usage;;
	esac
done

### check required variants (quoted so that paths with spaces are handled)
if [ -z "$ref" ] || [ -z "$R1" ] || [ -z "$R2" ] || [ -z "$group_count" ]; then
	usage
fi

### set default values while optional variants were not set
threads="${threads:-10}"
bin_size="${bin_size:-500k}"
enzyme="${enzyme:-AAGCTT}"

### accept enzyme names as well as recognition sites, case-insensitively
enzyme=`echo "$enzyme" | tr 'a-z' 'A-Z'`

if [ "$enzyme" = HINDIII ]; then
	enzyme=AAGCTT
fi

if [ "$enzyme" = MBOI ]; then
	enzyme=GATC
fi

### link required files
ln -s "${ref}" ./seq.fasta
ln -s "${R1}" ./Lib_R1.fastq.gz
ln -s "${R2}" ./Lib_R2.fastq.gz

### index reference genome
bwa index seq.fasta
samtools faidx seq.fasta


### 1st round of mapping
bwa mem -SP5M -t "$threads" seq.fasta Lib_R1.fastq.gz Lib_R2.fastq.gz \
     | samtools view -hF 256 - \
     | samtools sort -@ "$threads" -o sorted.bam -T tmp.ali
samtools index sorted.bam

### correct contig
ALLHiC_corrector -m sorted.bam -r seq.fasta -o seq.HiCcorrected.fasta -t "$threads"

### 2nd round of mapping
bwa index seq.HiCcorrected.fasta
samtools faidx seq.HiCcorrected.fasta
bwa mem -SP5M -t "$threads" seq.HiCcorrected.fasta Lib_R1.fastq.gz Lib_R2.fastq.gz \
     | samtools view -hF 256 - \
     | samtools sort -@ "$threads" -o sample.bwa_mem.bam -T tmp.ali


### filter bam
samtools view -bq 40 sample.bwa_mem.bam  |samtools view -bt seq.HiCcorrected.fasta.fai > sample.unique.bam
PreprocessSAMs.pl sample.unique.bam seq.HiCcorrected.fasta "$enzyme"

### partition
ALLHiC_partition -r seq.HiCcorrected.fasta -e "$enzyme" -k "$group_count" -b sample.unique.REduced.paired_only.bam

### optimize
# -f: the list does not exist on a first run, which is not an error
rm -f cmd.list
for((K=1;K<=$group_count;K++));do echo "allhic optimize sample.unique.REduced.paired_only.counts_${enzyme}.${group_count}g${K}.txt sample.unique.REduced.paired_only.clm" >> cmd.list;done
ParaFly -c cmd.list -CPU "$threads"

### build
ALLHiC_build seq.HiCcorrected.fasta

### plot
samtools faidx groups.asm.fasta
cut -f1,2 groups.asm.fasta.fai|grep sample > chrn.list
# ALLHiC_plot takes its inputs as named options (-b/-a/-l are required);
# positional arguments are rejected by its argument parser.
ALLHiC_plot -b sample.bwa_mem.bam -a groups.agp -l chrn.list -s "$bin_size"




================================================
FILE: bin/ALLHiC_plot
================================================
#!/usr/bin/env python
import argparse
import numpy as np
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt
import pysam
import time
import os


def time_print(info):
    """Print info preceded by the current local time shown in green."""
    stamp = time.strftime('[%H:%M:%S]', time.localtime(time.time()))
    print("\033[32m%s\033[0m %s"%(stamp, info))


def get_opts():
    """Declare the command-line options and return the parsed arguments."""
    parser = argparse.ArgumentParser()

    # Mandatory inputs.
    required_opts = (
        ('-b', '--bam', 'Input bam file'),
        ('-a', '--agp', 'Input AGP file'),
        ('-l', '--list', 'Chromosome list, contain: ID\tLength'),
    )
    for short_flag, long_flag, text in required_opts:
        parser.add_argument(short_flag, long_flag, help=text, required=True)

    # Optional settings: (short, long, help, default).
    optional_opts = (
        ('-n', '--npz', "npz file of hic signal, optional, if not exist, it will be generate after reading hic signals, or it will be loaded for drawing other resolution of heatmap", ""),
        ('-m', '--min_size', "Minium bin size of heatmap, default=50k", "50k"),
        ('-s', '--size', "Bin size of heatmap, can be a list separated by comma, default=500k, notice: it must be n times of min_size (n is integer) or we will ajust it to nearest one", "500k"),
        ('-o', '--outdir', 'Output directory, default=workdir', 'workdir'),
    )
    for short_flag, long_flag, text, fallback in optional_opts:
        parser.add_argument(short_flag, long_flag, help=text, default=fallback)

    return parser.parse_args()


# Read the chromosome list: first column is the ID, second column the length
def get_chr_len(chr_list):
    """Return ({ID: length}, [IDs in file order]) parsed from a whitespace-separated list file."""
    lengths = {}
    order = []
    with open(chr_list, 'r') as handle:
        for raw in handle:
            fields = raw.strip().split()
            if not fields:
                # blank or whitespace-only line
                continue
            name = fields[0]
            order.append(name)
            lengths[name] = int(fields[1])
    return lengths, order


# Calc read counts on each bin
def calc_read_count_per_min_size(chr_len_db, chr_order, bam, agp, min_size):
    """Count read pairs per pair of min_size bins over all listed chromosomes.

    chr_len_db -- dict chromosome ID -> length
    chr_order  -- chromosome IDs in plotting order
    bam        -- path of the BAM file with the read pairs
    agp        -- path of the AGP file that places contigs on chromosomes
    min_size   -- bin size in bp

    Returns (bin_offset, matrix): bin_offset[i] is the index of the first bin
    of the i-th chromosome in the symmetric whole-genome count matrix.
    """
    long_bin_size = min_size

    # Index of the first occurrence of each chromosome, looked up once here
    # instead of scanning chr_order for every read.
    chr_index = {}
    for idx, chrn in enumerate(chr_order):
        chr_index.setdefault(chrn, idx)

    bin_count = [0 for i in range(0, len(chr_order)+1)]
    total_bin_count = 0
    for chrn in chr_len_db:
        bin_count_of_chr = int(round((chr_len_db[chrn]*1.0/long_bin_size+0.51)))
        total_bin_count += bin_count_of_chr
        bin_count[chr_index[chrn]+1] = bin_count_of_chr

    bin_offset = [0 for i in range(0, len(chr_order)+1)]
    for i in range(1, len(bin_count)):
        bin_offset[i] = bin_count[i]+bin_offset[i-1]

    # Allocate the matrix directly as an integer array rather than building a
    # list of lists and converting it afterwards.
    read_count_whole_genome = np.zeros((total_bin_count, total_bin_count), dtype=int)

    ctg_on_chr = {}
    with open(agp, 'r') as f_in:
        for line in f_in:
            if line.strip() == '' or line.strip().startswith('#'):
                continue
            data = line.strip().split()
            # 'U' and 'N' rows are gaps, not placed contigs
            if data[4] == 'U' or data[4] == 'N':
                continue
            chrn = data[0]
            start_pos = int(data[1])
            end_pos = int(data[2])
            ctg = data[5].replace('_pilon', '')
            direct = data[-1]
            ctg_on_chr[ctg] = [chrn, start_pos, end_pos, direct]

    with pysam.AlignmentFile(bam, 'rb') as fin:
        for line in fin:
            if line.is_unmapped or line.mate_is_unmapped:
                continue
            ctg1 = line.reference_name
            ctg2 = line.next_reference_name
            read_pos1 = line.reference_start+1
            read_pos2 = line.next_reference_start+1

            if ctg1 not in ctg_on_chr or ctg2 not in ctg_on_chr:
                continue
            chrn1, ctg_start_pos1, ctg_end_pos1, ctg_direct1 = ctg_on_chr[ctg1]
            chrn2, ctg_start_pos2, ctg_end_pos2, ctg_direct2 = ctg_on_chr[ctg2]
            # Convert contig coordinates to chromosome coordinates; contigs
            # placed in '-' orientation are counted from their end.
            if ctg_direct1 == '+':
                converted_pos1 = ctg_start_pos1 + read_pos1 - 1
            else:
                converted_pos1 = ctg_end_pos1 - read_pos1 + 1
            if ctg_direct2 == '+':
                converted_pos2 = ctg_start_pos2 + read_pos2 - 1
            else:
                converted_pos2 = ctg_end_pos2 - read_pos2 + 1
            if chrn1 not in chr_len_db or chrn2 not in chr_len_db:
                continue
            pos1_index = int(converted_pos1/long_bin_size)
            pos2_index = int(converted_pos2/long_bin_size)

            whole_pos1 = bin_offset[chr_index[chrn1]] + pos1_index
            whole_pos2 = bin_offset[chr_index[chrn2]] + pos2_index
            try:
                read_count_whole_genome[whole_pos1, whole_pos2] += 1
                read_count_whole_genome[whole_pos2, whole_pos1] += 1
            except IndexError:
                time_print("Index error on whole genome: index1: %d, index2: %d, bin counts: %d"%(whole_pos1, whole_pos2, total_bin_count))

    return np.array(bin_offset), read_count_whole_genome


def _short_size_label(size):
    """Format a bin size in bp with a K/M/G suffix; sizes that are not a multiple of 1000 stay plain digits."""
    text = str(int(size))
    for zeros, unit in (('000000000', 'G'), ('000000', 'M'), ('000', 'K')):
        if text[-len(zeros):] == zeros:
            return text[:-len(zeros)]+unit
    return text


def _rebin(matrix, ratio):
    """Sum ratio x ratio blocks of a square matrix and return only the complete blocks.

    The matrix is zero-padded to a multiple of ratio before the blocks are
    summed; the trailing, partially filled block is not returned.
    """
    total_cnt = len(matrix)
    ratio_cnt = int(round(total_cnt*1.0/ratio+0.51, 0))
    plt_cnt = int(total_cnt*1.0/ratio)
    pad = ratio_cnt*ratio-total_cnt

    binned = np.pad(matrix, ((0, pad), (0, pad)), 'constant', constant_values=0)
    binned = binned.reshape(-1, ratio_cnt, ratio).sum(axis=2)
    binned = binned.reshape(ratio_cnt, -1, ratio_cnt).sum(axis=1)
    return binned[: plt_cnt, : plt_cnt]


def draw_heatmap(read_count_whole_genome_min_size, bin_offset_min_size, ratio, chr_order, min_size):
    """Draw the whole-genome heatmap and one heatmap per chromosome as PDF files.

    read_count_whole_genome_min_size -- square count matrix at min_size resolution
    bin_offset_min_size              -- first bin index of each chromosome in that matrix
    ratio                            -- number of min_size bins merged into one plotted bin
    chr_order                        -- chromosome IDs in plotting order
    min_size                         -- bin size of the input matrix in bp

    Writes <size>_Whole_genome.pdf and <size>_all_chrs.pdf in the current directory.
    """
    short_bin_size = _short_size_label(ratio*min_size)

    data = _rebin(read_count_whole_genome_min_size, ratio)

    fn = "%s_Whole_genome.pdf"%short_bin_size
    cmap = plt.get_cmap("YlOrRd")
    cmap.set_over('black')
    ax = plt.gca()
    # log2(0) is -inf for empty bins; the resulting warning is silenced.
    with np.errstate(divide='ignore'):
        hmap = ax.imshow(np.log2(data), interpolation='nearest', origin='lower',cmap=cmap, aspect='equal')
    plt.colorbar(mappable=hmap, cax=None, ax=None, shrink=0.5)
    plt.tick_params(labelsize=6)
    for ticks in ax.get_xticklabels():
        ticks.set_rotation(90)
    for ticks in ax.get_yticklabels():
        ticks.set_rotation(0)
    title = 'Whole_genome_'+short_bin_size
    plt.xlabel("Bins ("+short_bin_size.lower()+"b per bin)", fontsize=8)
    plt.xticks([])
    plt.yticks([])
    plt.title(title, y=1.01, fontsize=12)
    plt.savefig(fn, bbox_inches='tight', dpi=200)
    plt.close('all')

    # One sub-plot per chromosome on a roughly square grid.
    chr_cnt = len(chr_order)
    row_cnt = int(round(np.sqrt(chr_cnt)+0.51))
    col_cnt = int(round(chr_cnt*1.0/row_cnt+0.51))
    all_fn = '%s_all_chrs.pdf'%short_bin_size
    plt.figure(figsize=(col_cnt*2, row_cnt*2))
    idx = 1
    for chrn in chr_order:
        sr = bin_offset_min_size[idx-1]
        er = bin_offset_min_size[idx]
        sub_data = _rebin(read_count_whole_genome_min_size[sr: er, sr: er], ratio)

        plt.subplot(row_cnt, col_cnt, idx)
        ax = plt.gca()
        cmap = plt.get_cmap('YlOrRd')
        cmap.set_over('black')
        with np.errstate(divide='ignore'):
            hmap = ax.imshow(np.log2(sub_data), interpolation='nearest', origin='lower', cmap=cmap, aspect='equal')
        plt.colorbar(mappable=hmap, cax=None, ax=None, shrink=0.5)
        plt.tick_params(labelsize=5)
        plt.title(chrn)
        idx += 1

    plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)
    plt.savefig(all_fn, bbox_inches='tight', dpi=200)
    plt.close('all')


def _size_to_int(text):
    """Convert a size such as '50k', '1M' or '2g' (suffix case-insensitive) to an integer."""
    digits = text.upper()
    digits = digits.replace('K', '000')
    digits = digits.replace('M', '000000')
    digits = digits.replace('G', '000000000')
    return int(digits)


def ALLHiC_plot(bam, agp, chrlist, npzfile, minsize, binsize, outdir):
    """Build (or load) the Hi-C count matrix and draw heatmaps for each requested bin size.

    bam, agp, chrlist -- input files
    npzfile           -- optional cache of the count matrix ("" disables it);
                         loaded when it exists, written otherwise
    minsize           -- resolution of the count matrix, e.g. "50k"
    binsize           -- comma-separated bin sizes to draw, e.g. "500k,1m"
    outdir            -- directory that receives the PDF files
    """
    bam = os.path.abspath(bam)
    agp = os.path.abspath(agp)
    chrlist = os.path.abspath(chrlist)
    if npzfile != "":
        npzfile = os.path.abspath(npzfile)
        # np.savez appends ".npz" when it is missing; use the same name for
        # the existence check so that a saved cache is found again.
        if not npzfile.endswith('.npz'):
            npzfile += '.npz'

    min_size = _size_to_int(minsize)

    bin_list = binsize.split(',')
    bin_ratio = []
    for bin_size in bin_list:
        long_bin_size = _size_to_int(bin_size)
        # Nearest multiple of min_size, but never fewer than one bin.
        bin_ratio.append(max(1, int(round(long_bin_size/min_size+0.01, 0))))

    origin_dir = os.getcwd()
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    os.chdir(outdir)
    try:
        time_print("Step1: Get chromosome length")
        chr_len_db, chr_order = get_chr_len(chrlist)

        time_print("Step2: Get signal matrix")
        if npzfile != "" and os.path.exists(npzfile):
            npzdata = np.load(npzfile)
            bin_offset_min_size = npzdata['bin_offset_min_size']
            read_count_whole_genome_min_size = npzdata['read_count_whole_genome_min_size']
        else:
            bin_offset_min_size, read_count_whole_genome_min_size = calc_read_count_per_min_size(chr_len_db, chr_order, bam, agp, min_size)
            if npzfile != "":
                np.savez(npzfile, bin_offset_min_size=bin_offset_min_size, read_count_whole_genome_min_size=read_count_whole_genome_min_size)

        time_print("Step3: Draw heatmap")

        for i in range(0, len(bin_ratio)):
            ratio = bin_ratio[i]
            time_print("Drawing with bin size %s"%bin_list[i])
            draw_heatmap(read_count_whole_genome_min_size, bin_offset_min_size, ratio, chr_order, min_size)
    finally:
        # Return to the directory the caller started in; '..' would be wrong
        # for nested or absolute output directories.
        os.chdir(origin_dir)
    time_print("Success")


if __name__ == "__main__":
    # Parse the command line and pass each option to the matching parameter.
    opts = get_opts()
    ALLHiC_plot(
        opts.bam,
        opts.agp,
        opts.list,
        opts.npz,
        opts.min_size,
        opts.size,
        opts.outdir,
    )


================================================
FILE: bin/ALLHiC_rescue
================================================
#!/usr/bin/perl -w
# ALLHiC_rescue: assign contigs that are absent from the clusters file to the
# group with which they share the highest Hi-C link density, and write one
# counts table (<group>.txt) per group containing original and rescued contigs.
use strict;
use Getopt::Std;

our ($opt_h, $opt_b, $opt_r, $opt_c, $opt_i, $opt_m);
getopts "hb:r:c:i:m:";



if ( $opt_h || (!defined $opt_b)|| (!defined $opt_r)|| (!defined $opt_c)|| (!defined $opt_i)) {
    die "**************************************************************************************
    Usage: ALLHiC_rescue -r draft.asm.fasta -b sample.clean.bam -c clusters.txt -i counts.file
      -h : help and usage.
      -b : sample.clean.bam (unpruned bam)
      -r : draft.asm.fasta
      -c : prunning.clusters.txt
      -i : prunning.counts_AAGCTT.txt
      -m : minimum single density for rescuing contigs (optional, default 0.01)
**************************************************************************************\n";
}

my $refSeq      = $opt_r;     # required option; not read by this script
my $clusters    = $opt_c;
my $counts_file = $opt_i;
my $minSig      = (defined $opt_m)?$opt_m:0.01;

print "Starting rescue ungrouped contigs\n";
print "Reading contig length\n";
my %ctgdb;                    # contig -> { RECounts, length }
open(my $cin, '<', $counts_file) or die "Error: cannot open $counts_file: $!\n";
while(<$cin>){
	chomp;
	next if(/^#/);            # header / comment lines only
	next if(/^\s*$/);
	my ($ctg, $RECounts, $len) = split(/\s+/,$_);
	$ctgdb{$ctg}->{'RECounts'} = $RECounts;
	$ctgdb{$ctg}->{'length'}   = $len;
	}
close $cin;

print "Reading link signals ...\n";
my %signaldb;                 # "ctgA,ctgB" (sorted) -> number of linking reads
my @bamList = split(/,/,$opt_b);
foreach my $bam (@bamList){
	print "Reading $bam\n";
	# List-form pipe open: the file name is passed to samtools as is and is
	# never interpreted by a shell.
	open(my $in, '-|', 'samtools', 'view', $bam) or die "Error: cannot run samtools view $bam: $!\n";
	while(<$in>){
		chomp;
		my ($reads,$ctga,$ctgb) = (split/\s+/,$_)[0,2,6];
		next if(!defined $ctgb);
		next if($ctgb eq "=");    # mate on the same contig
		next if($ctgb eq "*");    # mate not aligned
		my ($first,$second) = sort ($ctga,$ctgb);
		my $key    = $first.",".$second;
		$signaldb{$key}++;
		}
	close $in or warn "WARNING: samtools view $bam did not finish cleanly\n";
	}

print "find ungrouped contigs ...\n";
my %GROUPDB;                  # "groupN" -> { origin, rescued } contig lists
my %anchordb;                 # clustered contig -> group number
my $gid = 0;
open(my $clin, '<', $clusters) or die "Error: cannot open $clusters: $!\n";
while(<$clin>){
	chomp;
	next if(/^#/);
	next if(/^\s*$/);
	$gid++;
	my $g = "group".$gid;
	my @data = split(/\s+/,$_);
	# the first two columns are skipped; contig names start at the third
	foreach my $i(2..$#data){
		$anchordb{$data[$i]} = $gid;
		$GROUPDB{$g}->{'origin'} .= $data[$i]." ";
		}
	}
close $clin;

print "output HiC link signals ...\n";
open(my $sout, '>', "signals.txt") or die "Error: cannot write signals.txt: $!\n";
print $sout "#GID	unclustered_ctg	Linked_reads	Anchored_ctgs\n";
foreach my $key (keys %signaldb){
	my ($ca,$cb) = split(/,/,$key);
	my $a_anchored = exists($anchordb{$ca});
	my $b_anchored = exists($anchordb{$cb});
	# only links between one clustered and one unclustered contig are used
	next if($a_anchored and $b_anchored);
	next if(!$a_anchored and !$b_anchored);
	my ($ga,$ub) = $a_anchored ? ($ca,$cb) : ($cb,$ca);
	my $missing = 0;
	foreach my $c ($ca,$cb){
		next if(exists($ctgdb{$c}) and defined($ctgdb{$c}->{'length'}));
		print "WARNING: $c not found in $counts_file, PASS\n";
		$missing = 1;
		last;
		}
	next if($missing);
	my $lenA     = $ctgdb{$ga}->{'length'};
	my $lenB     = $ctgdb{$ub}->{'length'};
	next if($lenA+$lenB == 0);   # avoid a division by zero
	# link density: reads per kb of combined contig length
	my $sigD     = ($signaldb{$key}*1000)/($lenA+$lenB);
	print $sout "group$anchordb{$ga}	$ub	$sigD	$ga\n";
	}
close $sout or die "Error: cannot close signals.txt: $!\n";

my %infordb;                  # unclustered contig -> group -> summed density
my %groupdb;                  # groups that received at least one signal
open(my $sin, '<', "signals.txt") or die "Error: cannot open signals.txt: $!\n";
while(<$sin>){
	chomp;
	next if(/^#/);
	my ($group,$ctg,$value) = (split/\s+/,$_)[0,1,2];
	$infordb{$ctg}->{$group} += $value;
	$groupdb{$group}++;
	}
close $sin;

open(my $uout, '>', "unanchor.signal.txt") or die "Error: cannot write unanchor.signal.txt: $!\n";
print $uout "unanchored_contig	";
foreach my $group (sort keys %groupdb){
	print $uout "$group	";
	}
print $uout "best_group	best_ctg1	sigD	best_ctg2\n";

foreach my $ctg (sort keys %infordb){
	my $maxv = 0;
	print $uout "$ctg	";
	foreach my $group (sort keys %groupdb){
		my $v = exists($infordb{$ctg}->{$group}) ? $infordb{$ctg}->{$group} : 0;
		$maxv = $v if($v>$maxv);
		print $uout "$v	";
		}
	# group with the highest summed density for this contig
	my ($best_g) = sort {$infordb{$ctg}->{$b}<=>$infordb{$ctg}->{$a}} keys %{$infordb{$ctg}};
	if($maxv>$minSig){
		print $uout "$best_g	$maxv \n";
		$GROUPDB{$best_g}->{'rescued'} .= $ctg." ";
		}
	else{
		# a trailing '#' marks contigs whose best signal is too weak to rescue
		print $uout "$best_g	$maxv #\n";
		}
	}
close $uout or die "Error: cannot close unanchor.signal.txt: $!\n";

print "Output refined clusters \n";
foreach my $group (sort keys %GROUPDB){
	$GROUPDB{$group}->{'origin'}  = "" if(!defined($GROUPDB{$group}->{'origin'}));
	$GROUPDB{$group}->{'rescued'} = "" if(!defined($GROUPDB{$group}->{'rescued'}));
	my @odb = split(/\s+/,$GROUPDB{$group}->{'origin'});
	my @rdb = split(/\s+/,$GROUPDB{$group}->{'rescued'});
	my $no  = @odb;
	my $nr  = @rdb;
	print "Number of original contigs in $group: $no\n";
	print "Number of rescued contigs in $group: $nr\n";
	my $outfile = $group.".txt";
	open(my $out, '>', $outfile) or die "Error: cannot write $outfile: $!\n";
	print $out "#Contig	RECounts	Length\n";
	foreach my $ctg (@odb, @rdb){
		# a contig without counts would produce a row with empty fields
		if(!exists($ctgdb{$ctg}) or !defined($ctgdb{$ctg}->{'length'})){
			print "WARNING: $ctg not found in $counts_file, not written to $outfile\n";
			next;
			}
		print $out "$ctg	$ctgdb{$ctg}->{'RECounts'}	$ctgdb{$ctg}->{'length'}\n";
		}
	close $out or die "Error: cannot close $outfile: $!\n";
	}





================================================
FILE: bin/allhic
================================================
[File too large to display: 10.4 MB]

================================================
FILE: scripts/ALLHiC2ALLMAPS.pl
================================================
#!/usr/bin/perl -w
### Convert ALLHiC output AGP file to ALLMAPS input csv file
# For every placed contig two rows are written to hic.csv: one for each end
# of the contig, paired with the matching coordinate on the group.
use strict;

print "Convert ALLHiC output AGP file to ALLMAPS input csv file\n";
die "Usage: perl $0 groups.agp\n" if(!defined $ARGV[0]);

my $agp = $ARGV[0];
open(my $in,  '<', $agp)      or die "Error: cannot open $agp: $!\n";
open(my $out, '>', "hic.csv") or die "Error: cannot write hic.csv: $!\n";
print $out "Scafffold ID,scaffold position,LG,genetic position\n";
while(my $line = <$in>){
	chomp $line;
	next if($line =~ /^#/ or $line =~ /^\s*$/);
	my @data = split(/\s+/,$line);
	next if(@data < 9);
	# Gap rows are recognised by their component type (column 5), not by the
	# word "contig", which would also discard contigs named e.g. contig_12.
	next if($data[4] eq "N" or $data[4] eq "U");
	my ($first,$second);
	if($data[8] eq "-"){
		($first,$second) = ($data[7],$data[6]);
		}
	else{
		# "+" and any other orientation value are handled as forward
		($first,$second) = ($data[6],$data[7]);
		}
	print $out "$data[5],$first,$data[0],$data[1]\n";
	print $out "$data[5],$second,$data[0],$data[2]\n";
	}
close $in;
close $out or die "Error: cannot close hic.csv: $!\n";



================================================
FILE: scripts/PreprocessSAMs.pl
================================================
#!/usr/bin/perl -w
use strict;



# PreprocessSAMs.pl
#
# Syntax: PreprocessSAMs.pl <sam or bam filename> <draft assembly fasta> <enzyme: HINDIII/MBOI/Arima>
#
# This Perl script prepares a SAM/BAM file for use with Lachesis.
# Specifically, it pre-processes the file with bedtools, samtools, picard to remove redundant, chimeric, and/or uninformative read pairs.
# This creates a dataset of Hi-C links with as strong a signal as possible, and it's also as small as possible, so as to reduce I/O runtime in Lachesis.
# (NOTE: As of August 24, 2013, I'm no longer removing PCR duplicates.  Picard's MarkDuplicates is extremely slow and resource-intensive - far more so than
# the runtime benefit in Lachesis of having fewer reads.  I don't think it's removing PCR duplicates properly, nor do I think PCR duplicate removal is even
# necessary - http://seqanswers.com/forums/showthread.php?t=6854).
#
# This script will determine whether the file is a SAM or a BAM file, and then run the following commands:
#
# COMMAND                                OUTPUT FILENAME                               WHAT THE COMMAND DOES
# make_bed_around_RE_site.pl             <fasta>.near_<RE>.<range>.bed                 Prepare the bed file for bedtools intersect (next command)
# bedtools intersect                     <head>.REduced.bam                            Remove all reads that aren't within 500 bp of a restriction site
### picard SortSam.jar                     <head>.REduced.sort_coord.bam                 Sort the file in coordinate order so PCR duplicates can be removed
### picard MarkDuplicates.jar              <head>.REduced.sort_coord.nodups.bam          Remove PCR duplicates
### picard SortSam.jar                     <head>.REduced.nodups.bam                     Sort the file in query-name order so Lachesis can read it
# samtools view -F12                     <head>.REduced.paired_only.bam                Keep only the pairs in which both reads are aligned
# samtools flagstat                      <head>.REduced.paired_only.flagstat           Make a flagstat file that describes the contents of the BAM file
#
#
# The final output file will be <head>.REduced.paired_only.bam.  This is what should be entered into the Lachesis INI file under the key "SAM_FILES".
#
# To pre-process several SAM/BAM files in parallel, use the script PreprocessSAMs.sh, which can be submitted to a cluster via qsub.
#
# Josh Burton
# July 2013



################################
#                              #
#   USER-DEFINED PARAMETERS    #
#                              #
################################


my $dry_run = 0; # if true, just print the commands to be run - don't actually run them
# The restriction site is no longer hard-coded here; it is chosen further down
# from the third command-line argument.
#my $RE_site = 'AAGCTT'; # the restriction enzyme site at which the DNA was cut for the Hi-C experiment

# Paths to the necessary scripts and software packages.
# These are bare command names handed to the shell, so they must be reachable
# through PATH (or be replaced here by full paths).
my $make_bed_around_RE_site_pl = 'make_bed_around_RE_site.pl';
my $bedtools = 'bedtools';
my $samtools = 'samtools';
# Picard settings, only referenced by the commented-out duplicate-removal steps.
#my $mem = "16G";
#my $picard_head = "java -d64 -Xmx$mem -jar /net/shendure/vol10/jnburton/extern/picard-tools-1.50/";



################################
#                              #
#         SUBROUTINES          #
#                              #
################################

# Print and then run a command through the shell (unless $dry_run, in which
# case it is only printed).
# First argument: the command to run.
# Second argument (optional): the file to redirect stdout to.
# Returns nothing on a dry run, otherwise the wait status reported by
# system() (0 on success).  A failing command is reported on STDERR but does
# not stop the script.
sub run_cmd {

    my ($cmd,$redirect) = @_;

    print localtime() . ": PreprocessSAMs.pl: $cmd\n";

    return if $dry_run;

    my $full_cmd = $redirect ? "$cmd > $redirect" : $cmd;
    my $status   = system ( $full_cmd );
    if ( $status != 0 ) {
        print STDERR "PreprocessSAMs.pl: command failed (wait status $status): $full_cmd\n";
    }
    return $status;
}




################################
#                              #
#     CONTROL STARTS HERE      #
#                              #
################################


# Get the command-line arguments, or check syntax.
if ( @ARGV != 3 ) {
    print STDERR "\nPreprocessSAMs.pl: A script to prepare SAM or BAM files for use with Lachesis.\n\nSyntax: $0 <sam-or-bam-filename> <draft-assembly-fasta> enzyme(HINDIII/MBOI/Arima)\n\n";
    exit 1;   # a usage error is a failure, so report it through the exit status
}


# Get the input filenames, and check that they actually exist.
my ( $SAM, $fasta) = @ARGV;
unless ( -e $SAM ) {
    print STDERR "$0: Can't find input SAM/BAM file `$SAM`\n";
    exit 1;
}
unless ( -e $fasta) {
    print STDERR "$0: Can't find draft assembly file `$fasta`\n";
    exit 1;
}

# Translate the enzyme argument (name or recognition sequence, not
# case-sensitive) into the restriction site used below.  An unrecognised
# value stops the script here instead of continuing with no site at all.
$ARGV[2] = uc $ARGV[2];
my %RE_site_for = (
    HINDIII => 'AAGCTT',
    AAGCTT  => 'AAGCTT',
    MBOI    => 'GATC',
    GATC    => 'GATC',
    ARIMA   => 'arima',
);
my $RE_site = $RE_site_for{ $ARGV[2] };
unless ( defined $RE_site ) {
    print STDERR "$0: Unknown enzyme `$ARGV[2]`.\nEnzyme should be HINDIII, MBOI or Arima (not case-sensitive.)\n";
    exit 1;
}

# Find the input file's "head" and extension.
my ($head,$extension) = $SAM =~ /^(.*)\.(.*)$/;
# A filename without any dot has no extension; treat it as an unknown type.
$extension = '' unless defined $extension;


# Examine the extension to determine whether this is a SAM or a BAM file.  If it's a SAM, convert it to BAM.  If it doesn't seem to be either, throw an error.
if    ( uc($extension) eq 'SAM' ) { run_cmd( "$samtools view -bS $SAM -o $head.bam" ); }
elsif ( uc($extension) eq 'BAM' ) {}
else {
    print STDERR "$0: Can't determine file type for input file `$SAM`.\nFilename should end in '.SAM' or '.BAM' (not case-sensitive.)\n";
    exit 1;
}


print "$0 @ARGV\n\n";



# COMMAND                                OUTPUT FILENAME                               WHAT THE COMMAND DOES
# make_bed_around_RE_site.pl             <fasta>.near_<RE>.<range>.bed                 Prepare the bed file for bedtools intersect (next command)
#
# Make the BED file for the restriction sites on the draft assembly.  This only needs to be done once.
# Name of the BED file that lists the regions around the restriction sites.
my $BED_RE_file;
if ($RE_site ne "arima") {
    # Single recognition site: one BED file, built only when it is missing.
    $BED_RE_file = "$fasta.near_$RE_site.500.bed";
    run_cmd( "$make_bed_around_RE_site_pl $fasta $RE_site 500" ) unless -e $BED_RE_file;
}
else {
    # "arima": one BED file per motif (each built only when missing), then
    # all of them merged, sorted and de-duplicated into the combined file.
    $BED_RE_file = "$fasta.near_arima.500.bed";

    my @motif_beds;
    for my $motif (qw(GATC GAAT GACT GAGT GATT)) {
        my $motif_bed = "$fasta.near_$motif.500.bed";
        run_cmd( "$make_bed_around_RE_site_pl $fasta $motif 500" ) unless -e $motif_bed;
        push @motif_beds, $motif_bed;
    }

    run_cmd( "cat @motif_beds | sort -k1,1 -k2,2b -u > $BED_RE_file" );
}


# Do the pre-processing on this file.
#
# COMMAND                                OUTPUT FILENAME                               WHAT THE COMMAND DOES
# bedtools intersect                     <head>.REduced.bam                            Remove all reads that aren't within 500 bp of a restriction site
### picard SortSam.jar                     <head>.REduced.sort_coord.bam                 Sort the file in coordinate order so PCR duplicates can be removed
### picard MarkDuplicates.jar              <head>.REduced.sort_coord.nodups.bam          Remove PCR duplicates
### picard SortSam.jar                     <head>.REduced.nodups.bam                     Sort the file in query-name order so Lachesis can read it
# samtools view -F12                     <head>.REduced.paired_only.bam         Filter out all pairs in which both reads are not aligned
# samtools flagstat                      <head>.REduced.paired_only.flagstat    Make a flagstat file that describes the contents of the BAM file

my $opts = "VALIDATION_STRINGENCY=SILENT";
my $nodups = ""; # or ".nodups", if removing PCR duplicates

# File names produced by the steps below.
my $reduced_bam   = "$head.REduced.bam";
my $filter_input  = "$head.REduced$nodups.bam";
my $paired_bam    = "$head.REduced$nodups.paired_only.bam";
my $flagstat_file = "$head.REduced$nodups.paired_only.flagstat";

# 1. Intersect the reads with the restriction-site BED file.
run_cmd( "$bedtools intersect -abam $head.bam -b $BED_RE_file > $reduced_bam" );
# (Disabled Picard steps: coordinate sort, duplicate removal, query-name sort.)
#run_cmd( "${picard_head}SortSam.jar $opts I=$head.REduced.bam O=$head.REduced.sort_coord.bam SO=coordinate" );
#run_cmd( "${picard_head}MarkDuplicates.jar $opts I=$head.REduced.sort_coord.bam O=$head.REduced.sort_coord.nodups.bam M=$head.REduced.sort_coord.dup_metrics AS=true REMOVE_DUPLICATES=true" );
#run_cmd( "${picard_head}SortSam.jar $opts I=$head.REduced.sort_coord.nodups.bam O=$head.REduced.nodups.bam SO=queryname" );
# 2. Apply the samtools -F12 flag filter.
run_cmd( "$samtools view -F12 $filter_input -b -o $paired_bam" );
# 3. Summarise the final BAM file.
run_cmd( "$samtools flagstat $paired_bam > $flagstat_file" );


================================================
FILE: scripts/agp2tour.pl
================================================
#!/usr/bin/perl -w

die "Usage: perl $0 chr.agp\n" if(!defined $ARGV[0]);

# %infordb maps each chromosome/group name to the ordered list of
# "<contig><orientation>" entries found for it in the AGP file.  Each name
# owns its own list, so the order stays correct even when rows of different
# chromosomes are interleaved in the input.
my %infordb;
# Read the file directly instead of through a shell `grep`.
open(my $agp_fh, '<', $ARGV[0]) or die "Cannot read $ARGV[0]: $!\n";
while(my $line = <$agp_fh>){
	chomp $line;
	# Same filter as the former `grep -v contig`.
	next if($line =~ /contig/);
	my @data = split(/\s+/,$line);
	my $chrn = $data[0];
	next if(!defined $chrn);
	next if(!($chrn=~/Chr/) and !($chrn=~/group/));
	push @{$infordb{$chrn}}, $data[5]."".$data[8];
	}
close $agp_fh;


# One "<name>.tour" file per chromosome/group: the entries on a single line,
# each followed by a space.
foreach my $c (sort keys %infordb){
	my $outfile = $c.".tour";
	open(my $out, '>', $outfile) or die "Cannot write $outfile: $!\n";
	foreach my $entry (@{$infordb{$c}}){
		print $out "$entry ";
		}
	close $out or die "Cannot close $outfile: $!\n";
	}


================================================
FILE: scripts/bam2CLM.pl
================================================
#!/usr/bin/perl -w

use Getopt::Std;
getopts("b:r:d:");

# -b, -r and -d are all mandatory; stop with the usage banner otherwise.
unless (defined $opt_b and defined $opt_r and defined $opt_d) {
    my $usage = "************************************************************************
    Usage: perl $0 -b mapping.bam -r refSeq.fasta -d main_results/
      -h : help and usage.
      -b : mapping.bam
      -r : reference genome, fasta format
      -d : LACHESIS main_results/
************************************************************************\n";
    die $usage;
}



# Load the reference FASTA: %seqdb maps each contig name (first word of the
# header, with any "_pilon" removed) to its sequence with whitespace removed.
my %seqdb;
my $ctg;
open(my $ref_fh, '<', $opt_r) or die "Cannot read reference fasta $opt_r: $!\n";
while(my $line = <$ref_fh>){
	chomp $line;
	if($line =~ />/){
		$ctg = $line;
		$ctg =~ s/>//g;
		$ctg =~ s/\s+.*//g;
		$ctg =~ s/_pilon//g;
	}elsif(defined $ctg){
		# Sequence text found before the first header has no contig to
		# belong to and is ignored.  Whitespace is stripped while reading,
		# so no second pass over the sequences is needed.
		$line =~ s/\s+//g;
		$seqdb{$ctg} .= $line;
		}
	}
close $ref_fh;


######GET GROUP IDS######
print "a. Getting group ids ...\n";
print "Reading anchored contigs ...\n";
# %anchordb: contig -> { gid => group id, stat => "An" }
# %gidb:     group id -> { contig => "An" }
my %anchordb;
my %gidb;
while(my $file = glob "$opt_d/group*ordering"){
	# The group id comes from the file name.  A file whose name does not
	# match is skipped, so its contigs can never be filed under the id of
	# the previously read file.
	next unless($file=~/group(\d+).ordering/);
	my $gid = $1;
	open(my $in, '<', $file) or die "Cannot read $file: $!\n";
	while(<$in>){
		chomp;
		next if(/#/);
		# The contig name is the second whitespace-separated column.
		my $ctg = (split/\s+/,$_)[1];
		next if(!defined $ctg);
		$ctg    =~ s/_pilon//g;
		$anchordb{$ctg}->{'gid'}  = $gid;
		$anchordb{$ctg}->{'stat'} = "An";
		$gidb{$gid}->{$ctg}       = "An";
		}
	close $in;
	}

# Make sure the unanchored-signal table exists (an empty one is created when
# it is missing), so the read below always has a file to open.
my $ufile = "unanchor.signal.txt";
system("touch $ufile") unless(-e $ufile);

my $num_of_group  = keys %gidb;
print "Number of groups: $num_of_group\n";
print "Reading unanchored contigs ...\n";
open(IN, "unanchor.signal.txt") or die"";
<IN>;
# Column that holds the assigned group: it follows the contig column and one
# column per group.
my $gid_col = $num_of_group + 1;
while(my $row = <IN>){
	chomp $row;
	my ($ctg,$gid) = (split(/\s+/,$row))[0,$gid_col];
	$ctg =~ s/_pilon//g;
	$gid =~ s/group//g;
	# Unanchored contigs are tagged "Un" in both lookup tables.
	$anchordb{$ctg}->{'gid'}  = $gid;
	$anchordb{$ctg}->{'stat'} = "Un";
	$gidb{$gid}->{$ctg}       = "Un";
	}
close IN;

print "Output group ids ...\n";
# One "group<id>.ids" file per group: contig name and length, with an extra
# "recover" column for contigs tagged "Un".
for my $gid (sort keys %gidb){
	my $members = $gidb{$gid};
	my $outid   = "group".$gid.".ids";
	open(my $out, ">$outid") or die"";
	for my $ctg (keys %{$members}){
		my $len  = length $seqdb{$ctg};
		my $stat = $members->{$ctg};
		if($stat eq "An"){
			print $out "$ctg\t$len\n";
		}elsif($stat eq "Un"){
			print $out "$ctg\t$len\trecover\n";
			}
		}
	close $out;
	}

print "b. Getting CLM files ...\n";

print "Reading and filtering $opt_b file ...\n";
my %tmprdb = (); ###read names already seen
my %infordb;     ###oriented contig pairs (A+B+, A+B-, A-B+, A-B-) -> d/g/c
my $count = 0;   ###running counter, used later for sorting
open(IN, "samtools view $opt_b |awk \'\$7!=\"*\" && \$7!=\"=\"\' |") or die"";
while(my $rec = <IN>){
	chomp $rec;
	$rec =~ s/_pilon//g;
	my @data = split(/\s+/,$rec);
	my ($read,$ref,$pos,$mate_ref,$mate_pos) = @data[0,2,3,6,7];
	###only the first record of each read name is used
	next if(exists($tmprdb{$read}));
	$tmprdb{$read}++;
	my ($ctgA,$ctgB) = sort ($ref, $mate_ref);
	###both contigs must belong to the same group
	next if(!exists($anchordb{$ctgA}->{'gid'}));
	next if(!exists($anchordb{$ctgB}->{'gid'}));
	my $ctgAgid = $anchordb{$ctgA}->{'gid'};
	my $ctgBgid = $anchordb{$ctgB}->{'gid'};
	next if($ctgAgid ne $ctgBgid);
	###read positions on A and B, following the sorted contig order
	my ($RAP,$RBP) = ($ref le $mate_ref) ? ($pos,$mate_pos) : ($mate_pos,$pos);
	my $A_rest = length($seqdb{$ctgA}) - $RAP;   # position to the end of A
	my $B_rest = length($seqdb{$ctgB}) - $RBP;   # position to the end of B
	###distance for each of the four orientation combinations
	my @links = (
		["+", "+", $A_rest + $RBP],
		["+", "-", $A_rest + $B_rest],
		["-", "+", $RAP    + $RBP],
		["-", "-", $RAP    + $B_rest],
	);
	foreach my $link (@links){
		my ($oriA,$oriB,$dist) = @{$link};
		my $pair = $ctgA.$oriA." ".$ctgB.$oriB;
		$infordb{$pair}->{'d'} .= $dist." ";   #d means distance
		$infordb{$pair}->{'g'}  = $ctgAgid;    #g means group id
		$infordb{$pair}->{'c'}  = $count++;    #c means count
		}
	}
close IN;

###Get CLM FILES####
print "Output CLM files ...\n";
# Sort the pair keys once by their 'c' counter; all.clm and every per-group
# file use this same order, so the sort is not repeated for each group.
my @ordered_pairs = sort {$infordb{$a}->{'c'}<=>$infordb{$b}->{'c'}} keys %infordb;

# Format one CLM line: pair, number of links, list of distances.
my $clm_line = sub {
	my ($key)       = @_;
	my @t           = split(/\s+/,$infordb{$key}->{'d'});
	my $num_of_link = @t;
	return "$key\t$num_of_link\t$infordb{$key}->{'d'}\n";
};

my %pairs_of_group;   # group id -> pair keys, already in output order
open(my $allclm, '>', "all.clm") or die "Cannot write all.clm: $!\n";
foreach my $key (@ordered_pairs){
	print $allclm $clm_line->($key);
	push @{$pairs_of_group{$infordb{$key}->{'g'}}}, $key;
	}
close $allclm or die "Cannot close all.clm: $!\n";

# A file is written for every known group, even one without any link.
foreach my $gid (keys %gidb){
	my $outfile = "group".$gid.".clm";
	open(my $out, '>', $outfile) or die "Cannot write $outfile: $!\n";
	if(exists($pairs_of_group{$gid})){
		foreach my $key (@{$pairs_of_group{$gid}}){
			print $out $clm_line->($key);
			}
		}
	close $out or die "Cannot close $outfile: $!\n";
	}




================================================
FILE: scripts/bam2CLM_simple.pl
================================================
#!/usr/bin/perl -w

die "Usage: perl $0 mapping.bam refSeq.fasta\n" if((!defined $ARGV[0]) or (!defined $ARGV[1]));

# Load the reference FASTA: %seqdb maps each contig name (first word of the
# header, with any "_pilon" removed) to its sequence with whitespace removed.
my %seqdb;
my $ctg;
open(my $ref_fh, '<', $ARGV[1]) or die "Cannot read reference fasta $ARGV[1]: $!\n";
while(my $line = <$ref_fh>){
	chomp $line;
	if($line =~ />/){
		$ctg = $line;
		$ctg =~ s/>//g;
		$ctg =~ s/\s+.*//g;
		$ctg =~ s/_pilon//g;
	}elsif(defined $ctg){
		# Sequence text found before the first header has no contig to
		# belong to and is ignored.  Whitespace is stripped while reading.
		$line =~ s/\s+//g;
		$seqdb{$ctg} .= $line;
		}
	}
close $ref_fh;
	
print "Reading and filtering $ARGV[0] file ...\n";
my %tmprdb = (); ###read names already seen
my %infordb;     ###oriented contig pairs (A+B+, A+B-, A-B+, A-B-) -> d/c
my $count = 0;   ###running counter, used later for sorting
open(IN, "samtools view $ARGV[0] |awk \'\$7!=\"*\" && \$7!=\"=\"\' |") or die "Cannot run samtools view on $ARGV[0]: $!\n";
while(<IN>){
	chomp;
	$_                =~ s/_pilon//g;
	my @data          = split(/\s+/,$_);
	###only the first record of each read name is used
	next if(exists($tmprdb{$data[0]}));
	$tmprdb{$data[0]}++;
	my ($ctgA,$ctgB)  = sort ($data[2], $data[6]);
	my $ctgAL         = length $seqdb{$ctgA};
	my $ctgBL         = length $seqdb{$ctgB};
	###read positions on A and B, following the sorted contig order
	my $RAP           = ($data[2] le $data[6])?$data[3]:$data[7];
	my $RBP           = ($data[2] le $data[6])?$data[7]:$data[3];
	my $A2            = $ctgAL - $RAP;   # position to the end of A
	my $B2            = $ctgBL - $RBP;   # position to the end of B
	###distance for each of the four orientation combinations
	my @links = (
		["+", "+", $A2  + $RBP],
		["+", "-", $A2  + $B2],
		["-", "+", $RAP + $RBP],
		["-", "-", $RAP + $B2],
	);
	# No group id is stored per pair: this script has no grouping, and the
	# field used to be filled from a variable that was never set.
	foreach my $link (@links){
		my ($oriA,$oriB,$dist) = @{$link};
		my $pair = $ctgA.$oriA." ".$ctgB.$oriB;
		$infordb{$pair}->{'d'} .= $dist." ";   #d means distance
		$infordb{$pair}->{'c'}  = $count++;    #c means count
		}
	}
close IN;

###Get CLM FILES####
print "Output CLM files ...\n";
open(my $allclm, '>', "all.clm") or die "Cannot write all.clm: $!\n";
print $allclm "groupA\tgroupB\tnum_of_link\tAverage_distance\tsignalDensity\tdistance_list\n";
foreach my $key (sort {$infordb{$a}->{'c'}<=>$infordb{$b}->{'c'}} keys %infordb){
	my @t           = split(/\s+/,$infordb{$key}->{'d'});
	my $num_of_link = @t;
	my $sum = 0;
	$sum += $_ foreach @t;
	my $ave = sprintf("%.2f",$sum/$num_of_link);
	# The key is "<ctgA><ori> <ctgB><ori>".  Only the trailing orientation
	# sign is removed, so a '+', '-' or '|' inside a contig name is kept
	# and the length lookup still finds the contig.
	my ($g1,$g2) = split(/\s+/,$key);
	$g1 =~ s/[+-]$//;
	$g2 =~ s/[+-]$//;
	my $l1  = defined $seqdb{$g1} ? length $seqdb{$g1} : 0;
	my $l2  = defined $seqdb{$g2} ? length $seqdb{$g2} : 0;
	my $len = $l1 + $l2;
	# Links per kb of combined contig length; 0 when neither contig is in
	# the reference, instead of aborting on a division by zero.
	my $signalD = 0;
	if($len > 0){
		$signalD = $num_of_link/$len * 1000;
	}else{
		warn "No reference sequence found for '$g1' and '$g2'; signal density set to 0\n";
		}
	print $allclm "$key\t$num_of_link\t$ave\t$signalD\t$infordb{$key}->{'d'}\n";
	}
close $allclm or die "Cannot close all.clm: $!\n";


================================================
FILE: scripts/bam2net.pl
================================================
#!/usr/bin/perl -w
use Getopt::Std;
getopts("c:b:o:");

# -c, -b and -o are all mandatory; stop with the usage banner otherwise.
unless (defined $opt_c and defined $opt_b and defined $opt_o) {
    my $usage = "************************************************************************
    Usage: bam2net.pl -c draft.asm.fasta -b file.bam -o out.net
      -h : help and usage.
      -c : draft.asm.fasta
      -b : mapping.bam
      -o : output
************************************************************************\n";
    die $usage;
}
my ($bam, $refSeq) = ($opt_b, $opt_c);

# Load the draft assembly: %refdb maps each sequence name to its sequence
# with whitespace removed.
my %refdb;
my $name;
open(my $ref_fh, '<', $refSeq) or die "Cannot read $refSeq: $!\n";
while(my $line = <$ref_fh>){
	chomp $line;
	if($line =~ />/){
		$name = $line;
		$name =~ s/>//g;
		# Keep only the first word of the header.  The names looked up
		# later come from split BAM columns, which hold no whitespace, so
		# a name carrying the header description could never match.
		$name =~ s/\s+.*//g;
	}elsif(defined $name){
		# Sequence text before the first header has no name and is
		# ignored.  Whitespace is stripped while reading.
		$line =~ s/\s+//g;
		$refdb{$name} .= $line;
		}
	}
close $ref_fh;

# %infordb: contig -> partner contig -> number of alignment records linking
# them (the two names are stored in sorted order).
my %infordb;
# List-form pipe open: the BAM path is handed to samtools as one argument
# and is never interpreted by a shell.
open(my $bam_fh, '-|', 'samtools', 'view', $bam) or die "Cannot run samtools view on $bam: $!\n";
while(my $aln = <$bam_fh>){
	chomp $aln;
	my @data = split(/\s+/,$aln);
	# Skip truncated records that have no mate-reference column.
	next if(@data < 7);
	# "=" is a mate on the same contig, "*" a mate without a reference.
	next if($data[6] eq "=");
	next if($data[6] eq "*");
	my ($ctg1,$ctg2) = sort ($data[2],$data[6]);
	$infordb{$ctg1}->{$ctg2}++;
	}
close $bam_fh;

# Write the network table: one row per contig pair, with both contig sizes
# and the link count per 100 kb of combined length.
open(my $net_fh, '>', $opt_o) or die "Cannot write $opt_o: $!\n";
print $net_fh "ctg1\tctg1_size\tctg2\tctg2_size\tsignalDensity\n";
foreach my $ctg1(keys %infordb){
	my $len1 = defined $refdb{$ctg1} ? length $refdb{$ctg1} : 0;
	foreach my $ctg2(keys %{$infordb{$ctg1}}){
		my $len2  = defined $refdb{$ctg2} ? length $refdb{$ctg2} : 0;
		my $normL = ($len1 + $len2)/100000;
		# Neither contig is in the reference: there is no length to
		# normalise by, so skip the pair instead of dividing by zero.
		if($normL == 0){
			warn "No reference sequence found for '$ctg1' and '$ctg2'; pair skipped\n";
			next;
			}
		my $sigD  = sprintf("%.2f",$infordb{$ctg1}->{$ctg2}/$normL);
		print $net_fh "$ctg1\t$len1\t$ctg2\t$len2\t$sigD\n";
		}
	}
close $net_fh or die "Cannot close $opt_o: $!\n";


================================================
FILE: scripts/bam_HiCplotter.py
================================================
#!/usr/bin/env python
import os
import sys
import gc
from math import log
import time


# Get position of read based on contig with sam or bam file
def get_read_pos_with_sam_bam_file(sam_bam_file):
	"""Collect the position and mate position of every mapped read.

	A file name ending in "bam" is streamed through `samtools view`; any
	other file is read as plain SAM text.  Blank lines, header lines
	(starting with '@') and records whose reference or mate reference is
	'*' are skipped.  "_pilon" is removed from contig names, and a mate
	reference of '=' means the mate is on the same contig.

	Returns a dict: read id -> [contig, position, mate contig, mate position].
	A later record with the same read id replaces the earlier one.
	"""
	if sam_bam_file[-3:] == "bam":
		stream = os.popen("samtools view "+sam_bam_file, 'r')
	else:
		stream = open(sam_bam_file, 'r')

	positions = {}
	for raw in stream:
		stripped = raw.strip()
		if stripped == '' or raw[0] == '@':
			continue
		fields = stripped.split()
		ref_name, mate_name = fields[2], fields[6]
		if ref_name == '*' or mate_name == '*':
			continue
		ctg1 = ref_name.replace('_pilon', '')
		ctg2 = ctg1 if mate_name == '=' else mate_name.replace('_pilon', '')
		positions[fields[0]] = [ctg1, int(fields[3]), ctg2, int(fields[7])]
	stream.close()
	return positions


# Get chromosome length
def get_chr_len(chr_list):
	"""Read a chromosome list file.

	Every non-blank line holds a chromosome name followed by its length,
	separated by whitespace.

	Returns (lengths, order): a dict name -> int length, and the list of
	names in file order.
	"""
	lengths = {}
	order = []
	with open(chr_list, 'r') as handle:
		for raw in handle:
			fields = raw.strip().split()
			if not fields:
				continue
			name = fields[0]
			order.append(name)
			lengths[name] = int(fields[1])
	return lengths, order


# Calc read counts on each bin
def calc_read_count_per_bin(chr_len_db, chr_order, read_on_chr, bin_size):
	"""Bin read pairs into per-chromosome and whole-genome contact matrices.

	chr_len_db:  dict chromosome name -> length
	chr_order:   list of chromosome names; fixes their order in the
	             whole-genome matrix
	read_on_chr: dict read id -> [chr1, pos1, chr2, pos2]
	bin_size:    bin size as a string; a K, M or G suffix (any case) is
	             expanded to 3, 6 or 9 zeros

	Returns (read_count_per_chr, read_count_whole_genome).  The first is a
	dict chromosome -> square matrix, the second one square matrix covering
	all chromosomes.  Cells hold log2 of the pair count, or -inf when the
	count is 0.  Pairs on an unknown chromosome, or at a position beyond
	the given chromosome length, are ignored.
	"""
	long_bin_size = bin_size.upper()
	long_bin_size = long_bin_size.replace('K', '000')
	long_bin_size = long_bin_size.replace('M', '000000')
	long_bin_size = long_bin_size.replace('G', '000000000')
	long_bin_size = int(long_bin_size)

	read_count_per_chr = {}

	# bin_count[k+1] is the number of bins of the k-th chromosome in
	# chr_order; slot 0 stays 0 so the running sum below gives start offsets.
	bin_count = [0 for i in range(0, len(chr_order)+1)]
	total_bin_count = 0

	for chrn in chr_len_db:
		# floor(length/bin)+1 bins, so a position equal to the chromosome
		# length always has a bin.  round(x+0.5) gave one bin too few under
		# Python 3 whenever length/bin was an even whole number.
		bin_count_of_chr = int(chr_len_db[chrn] // long_bin_size) + 1
		total_bin_count += bin_count_of_chr
		bin_count[chr_order.index(chrn)+1] = bin_count_of_chr
		read_count_per_chr[chrn] = [[0 for i in range(0, bin_count_of_chr)] for j in range(0, bin_count_of_chr)]

	# bin_offset[i] = first whole-genome bin of the i-th chromosome.
	bin_offset = []
	running_total = 0
	for count in bin_count:
		running_total += count
		bin_offset.append(running_total)

	read_count_whole_genome = [[0 for i in range(0, total_bin_count)] for j in range(0, total_bin_count)]

	for read in read_on_chr:
		chr1, pos1, chr2, pos2 = read_on_chr[read]
		if chr1 not in chr_len_db or chr2 not in chr_len_db:
			continue
		pos1_index = int(pos1 // long_bin_size)
		pos2_index = int(pos2 // long_bin_size)
		# A position past the declared chromosome length would raise
		# IndexError or be counted in the next chromosome's bins.
		if pos1_index >= len(read_count_per_chr[chr1]) or pos2_index >= len(read_count_per_chr[chr2]):
			continue
		if chr1 == chr2:
			read_count_per_chr[chr1][pos1_index][pos2_index] += 1
			read_count_per_chr[chr1][pos2_index][pos1_index] += 1

		chr1_index = chr_order.index(chr1)
		chr2_index = chr_order.index(chr2)

		whole_pos1 = bin_offset[chr1_index] + pos1_index
		whole_pos2 = bin_offset[chr2_index] + pos2_index
		read_count_whole_genome[whole_pos1][whole_pos2] += 1
		read_count_whole_genome[whole_pos2][whole_pos1] += 1

	# Convert the counts to log2 in place; empty cells become -inf.
	for chrn in read_count_per_chr:
		for row in read_count_per_chr[chrn]:
			for j in range(0, len(row)):
				row[j] = log(row[j], 2) if row[j] != 0 else -float('inf')

	for row in read_count_whole_genome:
		for j in range(0, len(row)):
			row[j] = log(row[j], 2) if row[j] != 0 else -float('inf')

	return read_count_per_chr, read_count_whole_genome


# Draw heatmap of allhic result with matplotlib
def draw_heatmap(data, chrn, bin_size, ext):
	"""Draw one contact matrix as a heatmap and save it to a file.

	data:     square matrix of values to plot
	chrn:     chromosome name, or 'all' for the whole-genome matrix
	bin_size: bin size string, used in the file name, title and axis label
	ext:      picture format; used both as file extension and output format

	The picture is written to "<bin size>_<chrn>.<ext>", or
	"<bin size>_Whole_genome.<ext>" when chrn is 'all'.
	"""
	import copy
	import matplotlib as mpl
	mpl.use('Agg')
	import matplotlib.pyplot as plt

	# Shorten runs of zeros to G/M/K for labels and file names.
	short_bin_size = bin_size.upper()
	short_bin_size = short_bin_size.replace('000000000', 'G')
	short_bin_size = short_bin_size.replace('000000', 'M')
	short_bin_size = short_bin_size.replace('000', 'K')

	ax = plt.gca()

	if chrn != 'all':
		file_prefix = short_bin_size + "_" + chrn
	else:
		file_prefix = short_bin_size + '_Whole_genome'

	print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+' Draw '+file_prefix)

	# Work on a copy so that set_over() does not change the colormap
	# registered under the name 'YlOrRd'.
	cmap = copy.copy(plt.get_cmap('YlOrRd'))
	cmap.set_over('black')
	if chrn != 'all':
		hmap = ax.imshow(data, interpolation='nearest', origin='lower', cmap=cmap, aspect='auto')
	else:
		hmap = ax.imshow(data, interpolation='nearest', cmap=cmap, aspect='auto')

	plt.colorbar(mappable=hmap,cax=None, ax=None, shrink=0.5)
	plt.tick_params(labelsize=6)
	for ticks in ax.get_xticklabels():
		ticks.set_rotation(90)
	for ticks in ax.get_yticklabels():
		ticks.set_rotation(0)

	if chrn != 'all':
		title = chrn+'_'+short_bin_size
	else:
		title = 'Whole_genome_'+short_bin_size

	plt.xlabel("Bins ("+short_bin_size.lower()+"b per bin)", fontsize=8)
	if chrn == 'all':
		plt.xticks([])
		plt.yticks([])
		plt.title(title, y=1.01, fontsize=12)
	else:
		plt.title(title, y=1.1, fontsize=12)

	# savefig selects the output type with `format`; `filetype` is not one
	# of its keywords.
	plt.savefig(file_prefix+'.'+ext, format=ext, bbox_inches='tight', dpi=200)
	plt.close('all')


# Command-line entry point: read the alignments and the chromosome list, then
# draw one heatmap per chromosome plus a whole-genome heatmap for every
# requested bin size.
if __name__ == "__main__":
	if len(sys.argv) < 5:
		print("Notice: This script is using for drawing heatmap of the all-hic reasult")
		print("Usage: python "+sys.argv[0]+" <sam/bam file> <chr_list> <bin_size> <ext>")
		print("\t<sam/bam_file> is the sam or bam file filtered by allhic")
		# The second argument is a file, read line by line as "name length".
		print("\t<chr_list> is a file with one chromosome per line: name and length separated by whitespace")
		print("\t<bin_size> is the bin size of heatmap, it can be a list splited by comma")
		print("\t<ext> is the file type of picture")
		# Missing arguments are an error, so do not exit with status 0.
		sys.exit(1)

	else:
		sam_bam_file = sys.argv[1]
		chr_list = sys.argv[2]
		bin_list = sys.argv[3]
		ext = sys.argv[4]

		print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Step 1: Get read position based on chromosome")
		read_on_chr = get_read_pos_with_sam_bam_file(sam_bam_file)

		print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Step 2: Get chromosome length")
		chr_len_db, chr_order = get_chr_len(chr_list)

		print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Step 3: Calculating and Drawing heatmap")

		bin_size_list = bin_list.split(',')
		for bin_size in bin_size_list:

			print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Calculating")
			read_count_per_chr, read_count_whole_genome = calc_read_count_per_bin(chr_len_db, chr_order, read_on_chr, bin_size)

			print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Drawing heatmap")

			print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Drawing with bin size "+str(bin_size))
			for chrn in read_count_per_chr:
				draw_heatmap(read_count_per_chr[chrn], chrn, bin_size, ext)

			draw_heatmap(read_count_whole_genome, 'all', bin_size, ext)
			# Release the matrices before the next bin size is computed.
			del read_count_per_chr, read_count_whole_genome
			gc.collect()

		del read_on_chr
		gc.collect()
		print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Success")


================================================
FILE: scripts/blastn_parse.pl
================================================
#!/usr/bin/perl -w

###This script was used to parse blast+ result (outfmt 6)
###you can get best hit with parameter -b 1
###or -b 0 to get more results 
###The default coverage and identity are 60%, respectively

use Getopt::Std;
getopts("i:o:b:c:d:q:");

# -i, -o and -q are mandatory; stop with the usage banner otherwise.
unless (defined $opt_i and defined $opt_o and defined $opt_q) {
    my $usage = "************************************************************************
    Usage: perl $0 -i input -o output -q query.fasta -b 0||1  
      -h : help and usage.
      -q : query file, fasta format
      -i : input file is the result of blast+
      -b : (optioanl, default 1)1 means only output best hit; 0 means get more results
      -d : identity (optional, default is 0.6)
      -c : coverage (optional, defalut is 0.6)
      -o : output
************************************************************************\n";
    die $usage;
}

# Input/output files, then the optional settings with their defaults.
($input, $output) = ($opt_i, $opt_o);
$BestHit_model = 1;
$BestHit_model = $opt_b if(defined $opt_b);
$coverage = 0.6;
$coverage = $opt_c if(defined $opt_c);
$identity = 0.6;
$identity = $opt_d if(defined $opt_d);

# Load the query FASTA: %infordb maps each query name (first word of the
# header) to its sequence.
open(my $query_fh, '<', $opt_q) or die"No query file: $opt_q\n";
my $gene;
while(my $line = <$query_fh>){
	chomp $line;
	if($line =~ />/){
		$gene = $line;
		$gene =~ s/>//g;
		$gene =~ s/\s+.*//g;
	}elsif(defined $gene){
		# Line ends and other whitespace are removed, so the stored string
		# length is the real query length used for the coverage value.
		$line =~ s/\s+//g;
		$infordb{$gene} .= $line;
		}
	}
close $query_fh;

# Copy to the output every hit that reaches the identity and coverage
# thresholds.  In best-hit mode only the first line of each query is tested.
open(my $out_fh, '>', $output) or die"No output file: $output\n";
open(my $in_fh, '<', $input) or die"No input file: $input\n";
while(my $hit = <$in_fh>){
	chomp $hit;
	my @data  = split(/\s+/,$hit);
	my $query = $data[0];
	next if(!defined $query);            # blank line
	$countdb{$query} += 1;
	next if($countdb{$query}>1 and $BestHit_model==1);
	my $q_len = defined $infordb{$query} ? length $infordb{$query} : 0;
	# A query missing from the FASTA has no length to compute coverage
	# from; skip it instead of dividing by zero.
	next if($q_len == 0);
	my $blst_i = $data[2]/100;
	# Columns 7 and 8 are the inclusive start and end on the query, so the
	# aligned span is end - start + 1.
	my $blst_c = ($data[7]-$data[6]+1)/$q_len;
	if($blst_i>=$identity and $blst_c>=$coverage){
		print $out_fh "$hit\n";
		}
	}
close $in_fh;
close $out_fh or die "Cannot close $output: $!\n";







================================================
FILE: scripts/classify.pl
================================================
#!/usr/bin/perl -w

use Getopt::Std;
getopts("i:p:r:g:");

# -i, -p, -r and -g are all mandatory; stop with the usage banner otherwise.
unless (defined $opt_i and defined $opt_p and defined $opt_r and defined $opt_g) {
    my $usage = "************************************************************************
    Usage: perl $0 -i blast.out -p polyploid -r ref.gff3 -g target.gff3 
      -h : help and usage.
      -i : blast.out
      -p : number of alleles
      -r : reference.gff3, annotation from close relative species
      -g : target.gff3, annotation from target species

************************************************************************\n";
    die $usage;
}

### Inputs taken from the options, and the two fixed output table names
my ($blast, $polyn, $rGFF, $tGFF) = ($opt_i, $opt_p, $opt_r, $opt_g);
my $geneTable = "Allele.gene.table";
my $ctgTable  = "Allele.ctg.table";

### %infordb: reference gene -> rank -> target gene.  The blast table is
### sorted by reference gene, then by bit score (descending); at most $polyn
### hits are kept per reference gene.
my %infordb;
my $count = 0;
open(IN, "sort -k2,2 -k12,12nr $blast|") or die"";
while(<IN>){
	chomp;
	my ($tgene,$rgene) = (split(/\s+/,$_))[0,1];
	if(exists($infordb{$rgene})){
		$count++;
		next if($count>$polyn);
	}else{
		### first hit of a new reference gene
		$count = 1;
		}
	$infordb{$rgene}->{$count} = $tgene;
	}
close IN;

my %tdb;  ### target gene name -> sequence (first GFF column) it lies on
open(IN, "awk '\$3==\"gene\"' $tGFF | ") or die "Cannot read gene records of $tGFF: $!\n";
while(<IN>){
	chomp;
	my @data     = split(/\s+/,$_);
	### A record without a Name= attribute is skipped; it must not be
	### filed under the name taken from an earlier record.
	next unless(/Name=(\S+)/);
	my $tgene    = $1;
	   $tgene    =~ s/;.*//g;
	$tdb{$tgene} = $data[0];
	}
close IN;


### Gene table: one row per reference gene that has blast hits, in
### reference order (sequence, then start), listing "target gene,contig"
### for each kept hit.
open(OUT, "> $geneTable") or die "Cannot write $geneTable: $!\n";
open(IN, "awk '\$3==\"gene\"' $rGFF |sort -k1,1 -k4,4n |") or die "Cannot read gene records of $rGFF: $!\n";
while(<IN>){
	chomp;
	my @data  = split(/\s+/,$_);
	### Same rule as above for records without a Name= attribute.
	next unless(/Name=(\S+)/);
	my $rgene = $1;
	$rgene    =~ s/;.*//g;
	next if(!exists($infordb{$rgene}));
	print OUT "$rgene\t$data[0]\t$data[3]\t";
	foreach my $i(sort {$a<=>$b} keys %{$infordb{$rgene}}){
		my $tgene = $infordb{$rgene}->{$i};
		my $tctg  = $tdb{$tgene};
		print OUT "$tgene,$tctg\t";     ###print out target gene order and contig name
		}
	print OUT "\n";
	}
close IN;

close OUT;


### %alleledb: gene-table line number -> reference sequence ('chrn'),
### position ('posi') and the distinct contigs of that row ('ctg', each
### followed by a space).
my %alleledb;
my $ln = 0; ###store line number
open(IN, "Allele.gene.table") or die"";
while(<IN>){
	chomp;
	$ln++;
	my @data  = split(/\s+/,$_);
	my %tmpdb = ();
	### columns from the fourth on hold "gene,contig"
	foreach my $pair (@data[3..$#data]){
		my $ctg = (split/,/,$pair)[1];
		$tmpdb{$ctg}++;
		}
	foreach my $ctg (keys %tmpdb){
		$alleledb{$ln}->{'ctg'} .= $ctg." ";
		}
	$alleledb{$ln}->{'chrn'} = $data[1];
	$alleledb{$ln}->{'posi'} = $data[2];
	}
close IN;

### Compare every row with all earlier rows that are still kept and lie on
### the same reference sequence.  A row is marked for removal when compare()
### returns 1 for it; each such match is written to remove.log.
open(OUT, "> remove.log") or die"";
my %removedb = ();
foreach my $i (2..$ln){
	my $chrI = $alleledb{$i}->{'chrn'};
	my $ctgI = $alleledb{$i}->{'ctg'};
	foreach my $j (1..$i-1){
		next if(exists($removedb{$j}));
		my $chrJ = $alleledb{$j}->{'chrn'};
		next if($chrI ne $chrJ);
		my $ctgJ = $alleledb{$j}->{'ctg'};
		my $flag = compare($ctgI,$ctgJ);
		### flag=1, remove
		next unless($flag==1);
		print OUT "$i\t$chrI\t$ctgI\t$j\t$chrJ\t$ctgJ\t$flag\n";
		$removedb{$i}++;
		}
	}
close OUT;


### Contig table: for every gene-table row not marked for removal, the
### reference sequence, the position and the contig of each listed hit.
open(OUT, ">$ctgTable") or die"";
$ln = 0;
open(IN, $geneTable) or die"";
while(<IN>){
	chomp;
	$ln++;
	next if(exists($removedb{$ln}));
	my @data = split(/\s+/,$_);
	print OUT "$data[1]\t$data[2]\t";
	foreach my $pair (@data[3..$#data]){
		my $ctg = (split/,/,$pair)[1];
		print OUT "$ctg\t";
		}
	print OUT "\n";
	}
close IN;
close OUT;



### compare(T, R): T and R are whitespace-separated contig lists.
### Returns 1 when the number of entries of R that also occur in T equals
### the number of entries of T, otherwise 0.
sub compare{
	my ($listT, $listR) = @_;
	my @entriesT = split(/\s+/,$listT);
	my @entriesR = split(/\s+/,$listR);
	my %in_T     = ();
	$in_T{$_}++ foreach @entriesT;
	### entries of R that are also present in T
	my $shared = grep { exists($in_T{$_}) } @entriesR;
	return ($shared == scalar(@entriesT)) ? 1 : 0;
	}






================================================
FILE: scripts/filterBAM_forHiC.pl
================================================
#!/usr/bin/perl -w

die "Usage: perl $0 file.bam out.sam\n" if(!defined($ARGV[0]) or !defined($ARGV[1]));

# Copy to the output only the alignment records that are confidently and
# uniquely mapped: MAPQ >= 30, tagged XT:A:U, no XA tag, and all four of
# NM (<=5), XM (<=3), XO (<=2) and XG (<=2) present and within their limit.
my %max_allowed = (NM => 5, XM => 3, XO => 2, XG => 2);

open(my $out_fh, '>', $ARGV[1]) or die "Cannot write $ARGV[1]: $!\n";
# List-form pipe open: the BAM path is never interpreted by a shell.
open(my $bam_fh, '-|', 'samtools', 'view', $ARGV[0]) or die "Cannot run samtools view on $ARGV[0]: $!\n";
RECORD: while(my $aln = <$bam_fh>){
	chomp $aln;
	my $mapq = (split/\s+/,$aln)[4];
	next RECORD if(!defined $mapq or $mapq<30);
	next RECORD if($aln !~ /XT:A:U/);
	next RECORD if($aln =~ /XA:/);
	foreach my $tag (sort keys %max_allowed){
		# Capture the whole number: with a single-digit capture a value
		# such as 12 was read as 1 and slipped through the filter.
		next RECORD unless($aln =~ /$tag:i:(\d+)/);
		next RECORD if($1 > $max_allowed{$tag});
		}
	print $out_fh "$aln\n";
	}
close $bam_fh;
close $out_fh or die "Cannot close $ARGV[1]: $!\n";


#Tag	Meaning
#NM	Edit distance
#MD	Mismatching positions/bases
#AS	Alignment score
#BC	Barcode sequence
#X0	Number of best hits
#X1	Number of suboptimal hits found by BWA
#XN	Number of ambiguous bases in the reference
#XM	Number of mismatches in the alignment
#XO	Number of gap opens
#XG	Number of gap extensions
#XT	Type: Unique/Repeat/N/Mate-sw
#XA	Alternative hits; format: (chr,pos,CIGAR,NM;)*
#XS	Suboptimal alignment score
#XF	Support from forward/reverse alignment
#XE	Number of supporting seeds


================================================
FILE: scripts/gmap2AlleleTable.pl
================================================
#!/usr/bin/perl -w

# Build Allele.ctg.table: for every gene feature of the reference GFF3 that
# also appears in gmap.gff3 (matched through the Name= attribute), print the
# reference sequence name, the gene start, and the distinct first-column
# values of the matching gmap.gff3 lines.
die "Usage: perl $0 ref.gff3\n" if(!defined ($ARGV[0]));
my $refGFF = $ARGV[0];

# gene name -> tab-separated list of first-column values from gmap.gff3
my %infordb = ();
open(IN, "grep 'gene' gmap.gff3 |") or die "Cannot read gmap.gff3: $!\n";
while(<IN>){
	chomp;
	my @data = split(/\s+/,$_);
	# Skip lines without a Name= attribute. The previous form
	# `my $gene = $1 if (...)` left $gene undefined (or stale) on such lines.
	next unless(/Name=([^;\n]*)/);
	my $gene = $1;
	$infordb{$gene} .= $data[0]."\t";
	}
close IN;

open(OUT, "> Allele.ctg.table") or die "Cannot write Allele.ctg.table: $!\n";
open(IN, "awk '\$3==\"gene\"' $refGFF |") or die "Cannot read $refGFF: $!\n";
while(<IN>){
	chomp;
	my @data = split(/\s+/,$_);
	next unless(/Name=(\S+)/);
	my $gene = $1;
	   $gene =~ s/;.*//g;
	next if(!exists($infordb{$gene}));
	# De-duplicate, and sort so the output does not depend on hash order.
	my %tmpdb = map { $_ => 1 } split(/\s+/,$infordb{$gene});
	print OUT "$data[0]\t$data[3]\t";
	print OUT "$_\t" foreach (sort keys %tmpdb);
	print OUT "\n";
	}
close IN;
close(OUT) or die "Cannot finish writing Allele.ctg.table: $!\n";


================================================
FILE: scripts/gmap2AlleleTableBED.pl
================================================
#!/usr/bin/perl -w

# Build Allele.ctg.table from a reference BED file: for every BED line whose
# 4th column (gene name, anything after ';' removed) appears as a Name=
# attribute in gmap.gff3, print BED columns 1 and 4 followed by the distinct
# first-column values of the matching gmap.gff3 lines.
die "Usage: perl $0 ref.bed\n" if(!defined ($ARGV[0]));
my $refGFF = $ARGV[0];

# gene name -> tab-separated list of first-column values from gmap.gff3
my %infordb = ();
open(IN, "grep 'gene' gmap.gff3 |") or die "Cannot read gmap.gff3: $!\n";
while(<IN>){
	chomp;
	my @data = split(/\s+/,$_);
	# Skip lines without a Name= attribute. The previous form
	# `my $gene = $1 if (...)` left $gene undefined (or stale) on such lines.
	next unless(/Name=([^;\s]+)/);
	my $gene = $1;
	$infordb{$gene} .= $data[0]."\t";
	}
close IN;

open(OUT, "> Allele.ctg.table") or die "Cannot write Allele.ctg.table: $!\n";
open(IN, $refGFF) or die "Cannot open $refGFF: $!\n";
while(<IN>){
	chomp;
	my @data = split(/\s+/,$_);
	# BED lines with fewer than four columns carry no gene name.
	next if(!defined($data[3]));
	my $gene = $data[3];
	   $gene =~ s/;.*//g;
	next if(!exists($infordb{$gene}));
	# De-duplicate, and sort so the output does not depend on hash order.
	my %tmpdb = map { $_ => 1 } split(/\s+/,$infordb{$gene});
	print OUT "$data[0]\t$data[3]\t";
	print OUT "$_\t" foreach (sort keys %tmpdb);
	print OUT "\n";
	}
close IN;
close(OUT) or die "Cannot finish writing Allele.ctg.table: $!\n";


================================================
FILE: scripts/link_superscaffold.pl
================================================
#!/usr/bin/perl -w

### Pick links between super-scaffolds from all.clm, ignoring the pairs listed
### in the table after __DATA__. Writes tmp.txt (one line per retained pair)
### and best_link.txt (the selected links).
my %namedb;
my %removedb;
### Parse the table after __DATA__: column 1 is an ID, column 2 a name, and
### every further column is a name paired with column 2.
while(<DATA>){
	chomp;
	my ($id,$name) = (split/\s+/,$_)[0,1];
	$namedb{$name} = $id;
	my @data = split(/\s+/,$_);
	my $key  = "";
	foreach my $i (2..$#data){
### the two names are put in string-sorted order before building the key
		my ($sa,$sb) = sort ($data[1],$data[$i]);
		$key   = $sa."	".$sb;
		$removedb{$key}++;
		}
	}
	
### For every pair in all.clm keep the largest value seen in column 5.
my %infordb;
### lines containing 'tig' are filtered out by grep
open(IN, "grep -v 'tig' all.clm|") or die"";
### the first remaining line is read and discarded
<IN>;
while(<IN>){
	chomp;
	my @data = split(/\s+/,$_);
	my $scf1 = $data[0];
	my $scf2 = $data[1];
### removes every '+', '|' and '-' character from the two names
	   $scf1 =~ s/[+|-]//g;
	   $scf2 =~ s/[+|-]//g;
#	my ($s1,$s2) = sort ($scf1,$scf2);
	my $key1  = $scf1."	".$scf2;
	my $key2  = $scf2."	".$scf1;
### NOTE(review): %removedb keys are built from a string-sorted pair, while
### $key1 keeps the order found in all.clm, so a listed pair appearing in the
### opposite order is not skipped here -- confirm this is intended.
	next if(exists($removedb{$key1}));
### the maximum is stored under both orderings of the pair
	if(!exists($infordb{$key1})){
		$infordb{$key1} = $data[4];
	}elsif(exists($infordb{$key1}) and $data[4]>$infordb{$key1}){
		$infordb{$key1} = $data[4];
		}
	if(!exists($infordb{$key2})){
		$infordb{$key2} = $data[4];
	}elsif(exists($infordb{$key2}) and $data[4]>$infordb{$key2}){
		$infordb{$key2} = $data[4];
		}	
	}
close IN;

### Dump every retained pair to tmp.txt as: ID_a ID_b name_a name_b value.
### %bestdb is filled here but not read again in this script.
my %bestdb;
open(OUT, "> tmp.txt") or die"";
foreach my $key (keys %infordb){
	my ($sa,$sb) = split(/\s+/,$key);
### IDs come from the __DATA__ table; a name not listed there has no ID
	my $ida      = $namedb{$sa};
	my $idb      = $namedb{$sb};
	print OUT "$ida	$idb	$sa	$sb	$infordb{$key}\n";
	if(!exists($bestdb{$ida}->{$idb})){
		$bestdb{$ida}->{$idb} = $infordb{$key};
	}elsif($infordb{$key}>$bestdb{$ida}->{$idb}){
		$bestdb{$ida}->{$idb} = $infordb{$key};
		}
	if(!exists($bestdb{$idb}->{$ida})){
		$bestdb{$idb}->{$ida} = $infordb{$key};
	}elsif($infordb{$key}>$bestdb{$idb}->{$ida}){
		$bestdb{$idb}->{$ida} = $infordb{$key};
		}	
	}
close OUT;

### Greedy selection: walk tmp.txt sorted by column 5 (numeric, descending)
### then column 1 (numeric, ascending); accept a line only when neither of its
### two IDs has been used by a previously accepted line.
open(OUT, "> best_link.txt") or die"";
my $ln = 0;
my %linkdb;
open(IN, "sort -k5,5nr -k1,1n tmp.txt|") or die"";
while(<IN>){
	chomp;
	$ln++;
  my @data = split(/\s+/,$_);
  my $ida  = $data[0];
  my $idb  = $data[1];
  my $key  = $ida."	".$idb;
### %tmpdb (a package variable, never declared with my) records the used IDs
  if($ln==1){
  	$linkdb{$key} = $_;
  	$tmpdb{$ida}++;
  	$tmpdb{$idb}++;
  }else{
  	next if(exists($tmpdb{$ida}) or exists($tmpdb{$idb}));
   	$linkdb{$key} = $_;
  	$tmpdb{$ida}++;
  	$tmpdb{$idb}++; 	
  	}
	}
close IN;

### accepted links are written in hash order, not in sorted order
foreach my $key (keys %linkdb){
	print OUT "$linkdb{$key}\n";
	}

close OUT;


### The lines below list the allelic super-scaffolds for each target.
#Format:
#ID	target	allelic_superscaffold1	allelic_superscaffold2 ...
__DATA__
1	group1	group2	group4	group6	group8	group9	group11	group14	group15	group16
2	group2	group1	group3	group4	group5	group6	group7	group8	group9	group10	group11	group12	group13	group14	group15	group16
3	group3	group2	group5	group7	group9	group10	group11	group12	group13	group14
4	group4	group1	group3	group6	group8	group9	group11	group14	group15	group16
5	group5	group2	group3	group7	group9	group10	group11	group12	group13	group14
6	group6	group1	group2	group4	group8	group9	group11	group14	group15	group16
7	group7	group2	group3	group5	group9	group10	group11	group12	group13	group14
8	group8	group1	group3	group4	group6	group9	group11	group14	group15	group16
9	group9	group1	group3	group4	group5	group6	group7	group8	group2	group10	group11	group12	group13	group14	group15	group16
10	group10	group2	group3	group5	group7	group9	group11	group12	group13	group14
11	group11	group1	group3	group4	group5	group6	group7	group8	group9	group10	group2	group12	group13	group14	group15	group16
12	group12	group2	group3	group5	group7	group9	group10	group11	group13	group14
13	group13	group12	group2	group3	group5	group7	group9	group10	group11
14	group14	group12	group2	group3	group5	group7	group9	group10	group11	group16
15	group15	group1	group2	group4	group6	group8	group9	group14
16	group16	group1	group2	group4	group6	group8	group9	group11	group14	group15


================================================
FILE: scripts/make_bed_around_RE_site.pl
================================================
#!/usr/bin/perl -w
use strict;


# make_bed_around_RE_site.pl: Make a BED file representing the regions around all occurrences of a restriction site.
#
# For syntax, run with no arguments.
#
# The output BED file is designed for use with bedtools intersect, as follows:
# bedtools intersect -abam [SRR.bam] -b [$BED_out] > [SRR.REduced.bam]
# samtools view -h [SRR.REduced.bam] > [SRR.REduced.sam]
# This restricts a SAM/BAM file to only include reads close to a restriction site, which is a good way to filter Hi-C data, according to Fig. 1b of this paper:
# http://www.nature.com/ng/journal/v43/n11/full/ng.947.html
# Also see PreprocessSAMs.pl, which uses the output file.
#
# Josh Burton
# April 2013




if ( scalar @ARGV != 3 ) {
    
    # Report syntax.
    print "\nmake_bed_around_RE_site.pl\n\n";
    print "Find all occurrences of a motif in a genome.  Make a 'POS' file listing these occurrences, and also a BED file representing the regions around these occurrences.\n\n";
    print "SYNTAX:\tmake_bed_around_RE_site.pl <fasta> <motif> <range>\n";
    print "fasta:\tA fasta file representing a genome (reference or draft assembly.)\n";
    print "motif:\tA motif, typically a restriction site sequence (e.g., HindIII = AAGCTT, NcoI = CCATGG, Dpn1 = GATC).\n";
    print "range:\tA number representing how many bp around the sequence to include.  Recommend 500 based on Yaffe & Tanay, Nat. Genetics 2011.\n\n";
    print "OUTPUT FILES:\n";
    print "<fasta>.near_<motif>.<range>.bed\n";
    print "<fasta>.pos_of_<motif>.txt\n";
    print "\n";
    exit;
}



# Get command-line arguments.
my ( $FASTA_in, $motif_seq, $range ) = @ARGV;

my $verbose = 0;

# Convert the motif from a string into a regex.  Each IUPAC ambiguity code is unrolled into the character class it stands for.
# The motif is upper-cased first because the sequence is upper-cased before matching.
my %iupac_class_for = (
    R => '[AG]',  Y => '[CT]',  S => '[CG]',  W => '[AT]',  K => '[GT]',  M => '[AC]',
    B => '[CGT]', D => '[AGT]', H => '[ACT]', V => '[ACG]', N => '[ACGT]',
);
my $motif_pattern = uc $motif_seq;
$motif_pattern =~ s/([RYSWKMBDHVN])/$iupac_class_for{$1}/g;
my $motif_regex = qr/$motif_pattern/;


# Derive the output filenames.
my $BED_out = "$FASTA_in.near_$motif_seq.$range.bed";
my $POS_out = "$FASTA_in.pos_of_$motif_seq.txt";


# Determine how many letters need to be carried over from previous lines in order to find instances of the motif that bridge lines in the fasta.
my $N_prev_chars = length($motif_seq) - 1;


my $contig_name = '';
my $offset = 0;          # number of sequence characters of the current contig read so far
my $prev_chars = '';     # upper-cased tail of the sequence read so far (at most $N_prev_chars characters)
my @motif_positions;     # 0-indexed motif positions on the current contig, in increasing order
my $N_motifs_found = 0;


# Open the input fasta file and read through it line-by-line.
print localtime() . ": Reading file $FASTA_in...\n";
open my $in_fh,  '<', $FASTA_in or die "Can't find file `$FASTA_in': $!\n";
open my $bed_fh, '>', $BED_out  or die "Can't write file `$BED_out': $!\n";
open my $pos_fh, '>', $POS_out  or die "Can't write file `$POS_out': $!\n";

while ( my $line = <$in_fh> ) {
    chomp $line;
    
    # If this is a header line, we're done with the previous contig/chromosome (if any), and start a new one.
    if ( $line =~ /^\>(\S+)/ ) {
	my $new_contig_name = $1;
	
	# Convert the positions collected on the (now complete) old contig into BED lines.
	write_bed_intervals( $bed_fh, $contig_name, $offset, $range, \@motif_positions );
	
	# Get the new contig's name.
	$contig_name = $new_contig_name;
	print localtime() . ": $contig_name\n" if $verbose;
	print {$pos_fh} ">$contig_name\n";
	
	# Reset other contig-related variables.
	$offset = 0;
	$prev_chars = '';
	@motif_positions = ();
    }
    
    # Otherwise, read through this contig/chromosome.
    else {
	
	# Look for instances of the motif in this line (including the overlap characters carried over from previous lines, tacked on at the beginning.)
	# Both parts are upper-case, so motifs that bridge a line break in soft-masked (lower-case) sequence are found too.
	my $target_str = $prev_chars . uc $line;
	
	while ( $target_str =~ /$motif_regex/g ) {
	    
	    # Every iteration of this loop represents a new match to the motif regex in the target string.
	    my $motif_loc = $-[0];
	    
	    # Adjust the location so it properly describes the 0-indexed motif position in this contig.
	    # Then add it to the list of contig positions at which the motif has been seen.
	    $N_motifs_found++;
	    my $true_motif_loc = $motif_loc + $offset - length($prev_chars);
	    push @motif_positions, $true_motif_loc;
	    
	    print "$contig_name\t$offset\t$prev_chars\t->\t$motif_loc\n" if $verbose;
	    print {$pos_fh} "$true_motif_loc\n";
	}
	
	# Carry over the last few characters seen so far, so that they can be prepended to the next line.
	# The tail is taken from the combined string, so blank or very short lines do not lose context.
	# It is always shorter than the motif, so no match can be reported twice.
	if ( $N_prev_chars <= 0 ) {
	    $prev_chars = '';
	}
	elsif ( length($target_str) > $N_prev_chars ) {
	    $prev_chars = substr( $target_str, -$N_prev_chars );
	}
	else {
	    $prev_chars = $target_str;
	}
	$offset += length $line;
    }
}

# Process the last fasta record (based on https://github.com/shendurelab/LACHESIS/pull/45).
write_bed_intervals( $bed_fh, $contig_name, $offset, $range, \@motif_positions );

close $in_fh;
close $bed_fh or die "Can't finish writing `$BED_out': $!\n";
close $pos_fh or die "Can't finish writing `$POS_out': $!\n";


print localtime() . ": Done!  Found $N_motifs_found total instances of motif $motif_seq.  Created files:\n";
print "$BED_out\n$POS_out\n";


# write_bed_intervals( $fh, $contig, $contig_len, $range, \@positions )
# Merge the sorted motif positions of one contig into intervals and print one BED line per interval to $fh.
# A new interval is started whenever a position lies more than 2*$range past the previous one.
# Each interval is widened by $range on both sides and clipped to [0, $contig_len].
# Nothing is printed for an empty position list.
sub write_bed_intervals {
    my ( $fh, $contig, $contig_len, $range, $positions ) = @_;
    return unless @$positions;
    
    # Print one interval, clipped so it does not run past either end of the contig/chromosome.
    my $emit = sub {
	my ( $start, $end ) = @_;
	$start =               $range if $start < $range;
	$end = $contig_len - $range if $end > $contig_len - $range;
	print {$fh} "$contig\t", $start - $range, "\t", $end + $range, "\n";
    };
    
    my ( $prev_start, $prev_end ) = ( $positions->[0], $positions->[0] );
    foreach my $pos ( @$positions ) {
	if ( $prev_end + 2*$range < $pos ) {
	    $emit->( $prev_start, $prev_end );
	    $prev_start = $pos;
	}
	$prev_end = $pos;
    }
    
    # Print the final BED line for this contig/chromosome.
    $emit->( $prev_start, $prev_end );
    return;
}


================================================
FILE: scripts/mc_bam.pl
================================================
#!/usr/bin/perl -w

use Getopt::Std;
getopts "b:a:r:";


if ((!defined $opt_b)|| (!defined $opt_a) ||(!defined $opt_r)) {
    die "************************************************************************
    Usage: mc_bam.pl -b mapping.bam -r groups.asm.fasta -a agp
      -h : help and usage.
           This script is used for modification the coordinates 
           in bam based on agp file
      -b : mapping.bam
      -r : reference genome, fasta format
      -a : agp file
************************************************************************\n";

}

### Build a per-base lookup from the AGP file:
### $posidb{component}->{component position} = "object,object position"
my %posidb = ();
open(IN, $opt_a) or die "Cannot open AGP file $opt_a: $!\n";
while(<IN>){
	chomp;
	my @data = split(/\s+/,$_);
	next if(@data < 9);
	next if($data[-1] eq "map");
	my $ga   = $data[1];
	my $gb   = $data[2];
	my $ta   = $data[6];
	my $tb   = $data[7];
	my $ctg  = $data[5];
### '+' components count object positions upwards from the object start,
### '-' components count downwards from the object end; other lines are ignored
	my $step = ($data[8] eq "+") ? 1 : ($data[8] eq "-") ? -1 : 0;
	next if($step == 0);
	my $gi = ($step > 0) ? $ga : $gb;
	foreach my $ti ($ta..$tb){
		$posidb{$ctg}->{$ti} = $data[0].",".$gi;
		$gi += $step;
		}
	}
close IN;

### Rewrite reference names and positions of every inter-contig pair
my $outsam = "mc.sam";
my $outbam = "mc.bam";
open(OUT, ">", $outsam) or die "Cannot write $outsam: $!\n";
open(IN, "-|", "samtools", "view", $opt_b) or die "Cannot run 'samtools view $opt_b': $!\n";
while(<IN>){
	chomp;
### SAM columns are tab-separated
	my @data = split(/\t/,$_);
	next if(@data < 11);
### pairs whose mate is on the same contig are not written
	next if($data[6] eq "=");
	my $ctgA = $data[2];
	my $tiA  = $data[3];
	my $ctgB = $data[6];
	my $tiB  = $data[7];
	next if(!exists($posidb{$ctgA}->{$tiA}));
	next if(!exists($posidb{$ctgB}->{$tiB}));
	my ($gidA,$giA) = split(/,/,$posidb{$ctgA}->{$tiA});
	my ($gidB,$giB) = split(/,/,$posidb{$ctgB}->{$tiB});
	$data[2]       = $gidA;
	$data[3]       = $giA;
	$data[6]       = ($gidB eq $gidA)?"=":$gidB;
	$data[7]       = $giB;
### join avoids the trailing tab (an empty extra field) on every record
	print OUT join("\t",@data),"\n";
	}
close(IN) or warn "samtools view $opt_b did not finish cleanly (status $?)\n";
close(OUT) or die "Cannot finish writing $outsam: $!\n";

### Convert the rewritten SAM to BAM, taking the header from the .fai index
system("samtools","faidx",$opt_r) == 0 or die "samtools faidx $opt_r failed (status $?)\n";
my $fai = $opt_r.".fai";
system("samtools","view","-bt",$fai,"-o",$outbam,$outsam) == 0 or die "samtools view failed to create $outbam (status $?)\n";


================================================
FILE: scripts/odering2tour.pl
================================================
#!/usr/bin/perl -w

### For every *_orderings.txt file in the current directory, write a .tour file
### with the same prefix: the first two columns of each input line are joined
### without a separator and followed by a tab, all on a single output line.
foreach my $ordering_file (glob "*_orderings.txt"){
	(my $tour_file = $ordering_file) =~ s/_orderings.txt//g;
	$tour_file .= ".tour";
	open(my $out, "> $tour_file") or die"";
	open(my $fh, $ordering_file) or die"";
	while(my $record = <$fh>){
		chomp $record;
		my @fields = split(/\s+/,$record);
		print $out $fields[0].$fields[1]."\t";
		}
	close $fh;
	close $out;
	}



================================================
FILE: scripts/partition.pl
================================================
#!/usr/bin/perl -w

use Getopt::Std;
getopts "g:d:b:r:";


if ((!defined $opt_g)|| (!defined $opt_r)) {
    die "************************************************************************
    Usage: perl $0 -g Allele.gene.table -r draft.asm.fasta
      -h : help and usage.
      -g : Allele.gene.table 
      -b : optional,default prunning.bam
      -r : reference ctg assembly
      -d : optional, default wrk_dir
************************************************************************\n";
}

my $bam    = (defined $opt_b)?$opt_b:"prunning.bam";
my $table  = $opt_g;
my $wrkd   = (defined $opt_d)?$opt_d:"wrk_dir";
my $refSeq = $opt_r;

### Read reference ctg fasta
my %refdb = ();
my $ctgn;
open(IN, $refSeq) or die "Cannot open $refSeq: $!\n";
while(<IN>){
	chomp;
	if(/>/){
		$ctgn = $_;
		$ctgn =~ s/>//g;
### keep only the ID in front of the first whitespace, so the name matches the
### names used in the BAM file (same rule as partition_gmap.pl)
		$ctgn =~ s/\s+.*//g;
	}else{
		next if(!defined $ctgn);
		$refdb{$ctgn} .= $_;
		}
	}
close IN;

foreach $ctgn (keys %refdb){
	$refdb{$ctgn} =~ s/\s+//g;
	}

### Read prunning BAM file; records are kept in input order
my @bamdb = ();
my %rdb;
open(IN, "samtools view $bam |") or die "Cannot run samtools view $bam: $!\n";
while(<IN>){
	chomp;
	my $rname = (split/\s+/,$_)[0];
	next if(exists($rdb{$rname}));    ### only the first record of each read name is kept
	$rdb{$rname}++;
	push @bamdb, $_;
	}
close IN;

### Assign ctgs to pre-defined clusters
### $ctgdb{ctg}->{chrn} = number of table lines that place ctg on chrn
my %ctgdb;
open(IN, $table) or die "Cannot open $table: $!\n";
while(<IN>){
	chomp;
	my @data = split(/\s+/,$_);
	my $chrn = $data[1];
	next if(!defined $chrn);
	foreach my $i(3..$#data){
		my $ctg = (split/,/,$data[$i])[1];
		next if(!defined $ctg);          ### entry without a "gene,ctg" pair
		$ctgdb{$ctg}->{$chrn}++;
		}
	}
close IN;

### pre-defined cluster based on chromosomes of closely related species:
### every ctg goes to the chromosome it was counted on most often
### (ties are broken by chromosome name so the result is reproducible)
my %chrdb;
foreach my $ctg (sort keys %ctgdb){
	my $cnt = $ctgdb{$ctg};
	my ($best) = sort { $cnt->{$b} <=> $cnt->{$a} or $a cmp $b } keys %{$cnt};
	push @{$chrdb{$best}}, $ctg;
	}

system("rm -rf $wrkd");
system("mkdir $wrkd") == 0 or die "Cannot create directory $wrkd\n";
foreach my $chrn (sort keys %chrdb){
	next if($chrn=~/tig/);
	next if($chrn=~/ctg/);
	my $dir = "$wrkd/$chrn";
	system("rm -rf $dir");
	system("mkdir $dir") == 0 or die "Cannot create directory $dir\n";
	my @ctgs  = @{$chrdb{$chrn}};
	my %tmpdb = map { $_ => 1 } @ctgs;
	$tmpdb{'='}++; ### need retain intra-contig links
### output ctg list to each cluster
	open(my $out, ">", "$dir/ctg.list") or die "Cannot write $dir/ctg.list: $!\n";
	print $out "$_\n" foreach @ctgs;
	close $out;
### output ctg sequence to each cluster
	open(my $faout, ">", "$dir/seq.fasta") or die "Cannot write $dir/seq.fasta: $!\n";
	foreach my $ctg (@ctgs){
		print $faout ">$ctg\n$refdb{$ctg}\n" if(exists($refdb{$ctg}));
		}
	close $faout;
### output bam file to each cluster: both ends must lie on ctgs of this cluster
	open(my $bamout, ">", "$dir/sample.clean.sam") or die "Cannot write $dir/sample.clean.sam: $!\n";
	foreach my $rec (@bamdb){
		my ($c1,$c2) = (split/\s+/,$rec)[2,6];
		next if(!defined $c1 or !defined $c2);
		next if(!exists($tmpdb{$c1}) or !exists($tmpdb{$c2}));
		print $bamout "$rec\n";
		}
	close $bamout;
	system("samtools faidx $dir/seq.fasta");
	system("samtools view -bt $dir/seq.fasta.fai $dir/sample.clean.sam > $dir/sample.clean.bam");
	system("rm $dir/sample.clean.sam");

	}







================================================
FILE: scripts/partition_gmap.pl
================================================
#!/usr/bin/perl -w

### Split the contig assembly and the BAM file into one sub-directory per
### chromosome, using the chromosome assignments of the allele table (-g).
use Getopt::Std;
getopts "g:d:b:r:l:";


if ((!defined $opt_g)|| (!defined $opt_r)) {
    die "************************************************************************
    Usage: perl $0 -g Allele.ctg.table -r draft.asm.fasta
      -h : help and usage.
      -g : Allele.ctg.table 
      -b : optional,default prunning.bam
      -r : reference ctg assembly
      -d : optional, default wrk_dir
      -l : chrn.list
************************************************************************\n";
}

my $bam    = (defined $opt_b)?$opt_b:"prunning.bam";
my $table  = $opt_g;
my $wrkd   = (defined $opt_d)?$opt_d:"wrk_dir";
my $refSeq = $opt_r;

### without -l, chrn.list is generated in the current directory from the
### distinct values of the first column of the allele table
if(!defined $opt_l){
  system("cut -f1 $table |sort -u > chrn.list");
  $opt_l = "chrn.list";
  }
### chromosome names to process: first column of the list file
my %chrnListdb;
open(IN, $opt_l) or die"";
while(<IN>){
  chomp;
  my $chrn = (split/\s+/,$_)[0];
  $chrnListdb{$chrn}++;
  }
close IN;

### Read referece ctg fasta
my %refdb = ();
my $ctgn;
open(IN, $refSeq) or die"";
while(<IN>){
	chomp;
	if(/>/){
		$ctgn = $_;
		$ctgn =~ s/>//g;
### drop everything from the first whitespace on, keeping only the ID
		$ctgn =~ s/\s+.*//g;
	}else{
		$refdb{$ctgn} .= $_;
		}
	}
close IN;

foreach $ctgn (keys %refdb){
	$refdb{$ctgn} =~ s/\s+//g;
	}

### Read prunning BAM file
### %bamdb maps a running counter to the SAM line; %rdb holds read names seen
my %bamdb = ();
my $count = 1;
my %rdb;
open(IN, "samtools view $bam |") or die"";
while(<IN>){
	chomp;
	my $rname = (split/\s+/,$_)[0];
	next if(exists($rdb{$rname}));    ### only retain single-end reads
	$rdb{$rname}++;        
	$bamdb{$count++} = $_;
	}
close IN;
### Assign ctgs to pre-defined clusters

### $ctgdb{ctg}->{chrn} = number of table lines that list ctg for chrn
my %ctgdb;
open(IN, $table) or die"";
while(<IN>){
	chomp;
	my @data = split(/\s+/,$_);
	my $chrn = $data[0];
### columns 3 and later are contig names
	foreach my $i(2..$#data){
		#my $ctg = (split/,/,$data[$i])[1];
		my $ctg  = $data[$i];
                $ctgdb{$ctg}->{$chrn}++;
		}
	}
close IN;

my %chrdb; ### pre-defined cluster based on chromosomes of close-releative species
foreach my $ctg (keys %ctgdb){
	my $count = 0;
### only the chromosome with the highest count is used for each contig
	foreach my $chrn (sort {$ctgdb{$ctg}->{$b}<=>$ctgdb{$ctg}->{$a}} keys %{$ctgdb{$ctg}}){
		$count++;
		next if($count>1);
#		print "$ctg	$chrn	$ctgdb{$ctg}->{$chrn}\n";
		$chrdb{$chrn} .= $ctg.",";
		}
	}

### the working directory is deleted and recreated on every run
system("rm -rf $wrkd");
system("mkdir $wrkd");
foreach my $chrn (keys %chrdb){
	next if(!exists($chrnListdb{$chrn}));
	print "Process $chrn ...\n";
	system("rm -rf $wrkd/$chrn");
	system("mkdir $wrkd/$chrn");
	my @ctgdb  = split(/,/,$chrdb{$chrn});
	my %tmpdb = (); $tmpdb{'='}++; ### need retain intra-contig links
### output ctg list	to each cluster
	open(my $out, ">$wrkd/$chrn/ctg.list") or die"";
	map {print $out "$_\n";$tmpdb{$_}++} @ctgdb;
	close $out;
### output ctg sequence to each cluster
	open(my $faout, ">$wrkd/$chrn/seq.fasta") or die"";
	map {chomp;print $faout ">$_\n$refdb{$_}\n" if(exists($refdb{$_}))} @ctgdb;
	close $faout;
### output bam file to each cluster
	open(my $bamout, "> $wrkd/$chrn/prunning.sub.sam") or die"";
	foreach my $i(keys %bamdb){
### columns 3 and 7 of the SAM line must both be in %tmpdb ('=' included)
		my ($c1,$c2) = (split/\s+/,$bamdb{$i})[2,6];
		next if(!exists($tmpdb{$c1}) or !exists($tmpdb{$c2}));
		print $bamout "$bamdb{$i}\n";
		}
	close $bamout;
### convert the SAM subset to BAM using the .fai index, then delete the SAM
	system("samtools faidx $wrkd/$chrn/seq.fasta");
	system("samtools view -bt $wrkd/$chrn/seq.fasta.fai $wrkd/$chrn/prunning.sub.sam > $wrkd/$chrn/prunning.sub.bam");
	system("rm $wrkd/$chrn/prunning.sub.sam");
	
	}





================================================
FILE: scripts/partition_gmap.py
================================================
#!/usr/bin/env python
import sys
import os
import argparse
import multiprocessing
import pysam


def get_opt():
	"""Parse the command line and return the argparse namespace.

	Attributes of the result: ref and alleletable (both required), bam
	(default 'prunning.bam'), workdir (default 'wrk_dir'), thread (int, default 10).
	"""
	parser = argparse.ArgumentParser()
	option_specs = [
		(('-r', '--ref'), dict(help='reference contig level assembly', required=True)),
		(('-g', '--alleletable'), dict(help='Allele.ctg.table', required=True)),
		(('-b', '--bam'), dict(help='bam file, default: prunning.bam', default='prunning.bam')),
		(('-d', '--workdir'), dict(help='work directory, default: wrk_dir', default='wrk_dir')),
		(('-t', '--thread'), dict(help='threads, default: 10', type=int, default=10)),
	]
	for flags, settings in option_specs:
		parser.add_argument(*flags, **settings)
	return parser.parse_args()


def read_fasta(in_fa):
	"""Read a FASTA file into a dict mapping sequence ID to sequence string.

	The ID is the first whitespace-delimited word of the header line without
	the leading '>'. Sequence lines are stripped and concatenated. A header
	that repeats an earlier ID starts that sequence over.
	"""
	chunks = {}
	with open(in_fa, 'r') as handle:
		for raw in handle:
			if raw.startswith('>'):
				name = raw.strip().split()[0][1:]
				chunks[name] = []
			else:
				chunks[name].append(raw.strip())

	return {name: ''.join(parts) for name, parts in chunks.items()}


def load_allele(allele_table):
	"""Assign every contig of the allele table to one chromosome.

	Each non-blank line is: chromosome name, one further column, then contig
	names. Lines whose chromosome name starts with 'tig', 'scaffold', 'utg' or
	'ctg' are ignored. A contig goes to the chromosome on which it is listed
	most often; on a tie the chromosome seen first wins.

	Returns (ctg_on_chr, chr_contain_ctg):
	  ctg_on_chr      -- dict: contig -> chromosome
	  chr_contain_ctg -- dict: chromosome -> dict whose keys are its contigs
	"""
	unplaced_prefixes = ('tig', 'scaffold', 'utg', 'ctg')
	counts_by_ctg = {}
	with open(allele_table, 'r') as fin:
		for line in fin:
			data = line.strip().split()
			# Blank lines have no chromosome column; skip them instead of
			# failing with IndexError.
			if not data:
				continue
			chrn = data[0]
			if chrn.startswith(unplaced_prefixes):
				continue
			for ctg in data[2:]:
				counts = counts_by_ctg.setdefault(ctg, {})
				counts[chrn] = counts.get(chrn, 0) + 1

	ctg_on_chr = {}
	chr_contain_ctg = {}
	for ctg, counts in counts_by_ctg.items():
		# max() returns the first key with the highest count.
		max_chr = max(counts, key=counts.get)
		ctg_on_chr[ctg] = max_chr
		chr_contain_ctg.setdefault(max_chr, {})[ctg] = 1
	return ctg_on_chr, chr_contain_ctg


def split_files(chrn, allele_table, ref, bam_file, wrkdir):
	"""Write the FASTA and BAM subset of one chromosome group.

	Creates <wrkdir>/<chrn>/<chrn>.fa with the contigs assigned to chrn and
	<wrkdir>/<chrn>/<chrn>.bam with the reads fetched from those contigs whose
	mate reference is also assigned to chrn. The allele table and the
	reference FASTA are re-read on every call.

	Returns None. Nothing is written when no contig is assigned to chrn.
	"""
	ctg_on_chr, chr_contain_ctg = load_allele(allele_table)
	# Names filtered out by load_allele (tig/scaffold/utg/ctg prefixes), or
	# groups that ended up with no contig, have no entry here. Skip them
	# instead of failing with KeyError further down.
	if chrn not in chr_contain_ctg:
		print("\tSkipping %s: no contigs assigned"%chrn)
		return

	wrk_dir = os.path.join(wrkdir, chrn)
	if not os.path.exists(wrk_dir):
		os.mkdir(wrk_dir)
	
	print("\tDealing %s"%chrn)
	fa_db = read_fasta(ref)

	sub_bam = os.path.join(wrk_dir, chrn+'.bam')
	sub_fa = os.path.join(wrk_dir, chrn+'.fa')
	with open(sub_fa, 'w') as fout:
		for ctg in chr_contain_ctg[chrn]:
			fout.write(">%s\n%s\n"%(ctg, fa_db[ctg]))

	# The sub-BAM reuses the header of the input BAM (template=fin).
	with pysam.AlignmentFile(bam_file, 'rb') as fin:
		with pysam.AlignmentFile(sub_bam, 'wb', template=fin) as fout:
			for ctg in chr_contain_ctg[chrn]:
				for line in fin.fetch(contig=ctg):
					mate_ref = line.next_reference_name
					if mate_ref and ctg_on_chr.get(mate_ref) == chrn:
						fout.write(line)
	

def partition_gmap(ref, allele_table, bam, wrkdir, threads):
	"""Split the assembly and the BAM file into one sub-directory per group.

	Groups are the distinct first-column values of the allele table. The BAM
	is indexed with `samtools index` when <bam>.bai is missing, then
	split_files() is run for every group in a multiprocessing pool of at most
	`threads` workers.

	Exits with status -1 when indexing fails or the table lists no group.
	Raises Exception after all groups were tried if any of them failed.
	"""
	if not os.path.exists(wrkdir):
		os.mkdir(wrkdir)
	
	print("Getting groups")
	chrn_db = {}
	with open(allele_table, 'r') as fin:
		for line in fin:
			data = line.strip().split()
			# Blank lines carry no group name.
			if not data:
				continue
			chrn_db[data[0]] = 1

	# An empty table would otherwise ask for a pool with zero processes.
	if not chrn_db:
		print("Fatal: no groups found in %s"%allele_table)
		sys.exit(-1)

	bai = bam+'.bai'
	if not os.path.exists(bai):
		print("BAI file not found, starting index...")
		ret = os.system('samtools index %s'%bam)
		if ret==0:
			print("Index success")
		else:
			print("Fatal: bam file must be sorted")
			sys.exit(-1)

	print("Splitting files")
	if len(chrn_db) < threads:
		threads = len(chrn_db)
	pool = multiprocessing.Pool(processes=threads)
	result_list = list()
	for chrn in chrn_db:
		result_list.append([chrn, pool.apply_async(split_files, (chrn, allele_table, ref, bam, wrkdir,))])
	pool.close()
	pool.join()

	# Collect worker failures so every group is attempted before giving up.
	error_list = list()
	for chrn, result in result_list:
		try:
			result.get()
		except Exception as e:
			print('Exception raised when dealing with {}: {}'.format(chrn, e))
			error_list.append(chrn)

	if error_list:
		raise Exception("{} exception(s) detected in : {}".format(len(error_list), ', '.join(error_list)))

	print("Notice: If you got errors of \"Length mismatch\" during allhic extract, it is normal because we split bam with the same header, it will not effect the result")
	print("Finished")


# Script entry point: parse the command line and run the partitioning.
if __name__ == '__main__':
	opts = get_opt()
	# Unpack the parsed options into module-level names.
	ref = opts.ref
	allele_table = opts.alleletable
	bam = opts.bam
	wrkdir = opts.workdir
	threads = opts.thread
	partition_gmap(ref, allele_table, bam, wrkdir, threads)



================================================
FILE: scripts/prune.pl
================================================
#!/usr/bin/perl -w


use Getopt::Std;
getopts "i:b:r:";


if ((!defined $opt_i)|| (!defined $opt_b)|| (!defined $opt_r)) {
    die "************************************************************************
    Usage: perl $0 -i Allele.ctg.table -b bam.list -r draft.asm.fasta
      -h : help and usage.
      -i : Allele.ctg.table 
      -b : bam.list, a file contains input bam files
      -r : draft.asm.fasta
************************************************************************\n";
}

my $bamfile = $opt_b;
my $table   = $opt_i;
my $refSeq  = $opt_r;

### Read bam files
### $pairdb{ctgA}->{ctgB} (names in string-sorted order) = comma-terminated
### list of read names linking the two contigs; %ctgdb = contigs seen
my %pairdb = ();
my %ctgdb  = ();
open(IN, $bamfile) or die "Cannot open bam list $bamfile: $!\n";
while(my $bam = <IN>){
	chomp $bam;
	$bam =~ s/\s+//g;
### only entries containing a literal ".bam" are used
	next if(!($bam =~ /\.bam/));
	open(my $fh, "-|", "samtools", "view", $bam) or die "Cannot run 'samtools view $bam': $!\n";
	while(my $rec = <$fh>){
		chomp $rec;
		my @data = split(/\s+/,$rec);
		next if(@data < 7);
		my $ctg1 = $data[2];
		my $ctg2 = $data[6];
		next if($ctg2 eq "=");
		my ($sa,$sb) = sort ($ctg1,$ctg2);
		$pairdb{$sa}->{$sb}  .= $data[0].",";
		$ctgdb{$ctg1}++; $ctgdb{$ctg2}++;
		}
	close $fh;
	}
close IN;

### Read allele information
### Remove signal between alleles
open(OUT1, ">", "removedb_Allele.txt") or die "Cannot write removedb_Allele.txt: $!\n";
open(LOG, ">", "log.txt") or die "Cannot write log.txt: $!\n";
open(IN, $table) or die "Cannot open $table: $!\n";
while(my $row = <IN>){
	chomp $row;
	my @data     = split(/\s+/,$row);
	next if(@data<=3);
	my %tmpdb    = (); ### Record allele contig pairs of this table line
	my $n        = $#data;
### every pair of contigs listed on the same line is allelic
	for(my $i=2;$i<$n;$i++){
		my $ctg1 = $data[$i];
		for(my $j=$i+1;$j<=$n;$j++){
			my $ctg2 = $data[$j];
			my ($sa,$sb) = sort ($ctg1,$ctg2);
			my $key      = $sa.",".$sb;
			$tmpdb{$key}++;
			print OUT1 "$sa\t$sb\t$pairdb{$sa}->{$sb}\n" if(exists($pairdb{$sa}->{$sb}));
			}
		}
### log.txt: one '>' record per table line, followed by one line for every
### (other contig, listed contig) pair that shares reads
	print LOG ">$row\n";
	foreach my $i(2..$#data){
		my $ctg1    = $data[$i];
		foreach my $ctg2 (keys %ctgdb){
			my ($sa,$sb) = sort ($ctg1,$ctg2);
			my $key      = $sa.",".$sb;
			next if(exists($tmpdb{$key}));
			next if(!exists($pairdb{$sa}->{$sb}));
			my @rnamedb = split(/,/,$pairdb{$sa}->{$sb});
			my $num_r   = @rnamedb;
			print LOG "$ctg2\t$ctg1\t$num_r\t$pairdb{$sa}->{$sb}\n";
			}
		}
	}
close IN;
close OUT1;
close(LOG) or die "Cannot finish writing log.txt: $!\n";

### Remove signal which are not best match with listed alleles (ctgs)
open(OUT2, ">", "removedb_nonBest.txt") or die "Cannot write removedb_nonBest.txt: $!\n";
open(IN, "log.txt") or die "Cannot open log.txt: $!\n";
{
### read log.txt one '>' record at a time; the separator is restored when
### this block ends
	local $/ = '>';
	<IN>;
	while(my $record = <IN>){
		chomp $record;
		my %hashdb = ();
		my ($name,$info) = split(/\n/,$record,2);
		next if(!defined $info);
		my @linedb   = split(/\n/,$info);
### first pass: for each contig of column 1 remember the column-2 contig with
### the highest read count (the first one wins on ties)
		foreach my $line (@linedb){
			my @data   = split(/\s+/,$line);
			if(!exists($hashdb{$data[0]}) or $data[2]>$hashdb{$data[0]}->{'num'}){
				$hashdb{$data[0]}->{'retain'} = $data[1];
				$hashdb{$data[0]}->{'num'}    = $data[2];
				}
			}
### second pass: every other partner is reported for removal
		foreach my $line (@linedb){
			my @data = split(/\s+/,$line);
			next if($hashdb{$data[0]}->{'retain'} eq $data[1]);
			print OUT2 "$data[0]\t$data[1]\t$data[2]\tremove\t$data[3]\n";
			}
		}
}
close IN;
close(OUT2) or die "Cannot finish writing removedb_nonBest.txt: $!\n";

### The filtering of the reads itself is delegated to remove_reads.pl (found
### through PATH); the step below expects it to have created prunning.sam.
system("remove_reads.pl") == 0 or warn "remove_reads.pl did not finish cleanly (status $?)\n";
die "prunning.sam was not created by remove_reads.pl\n" if(!(-e "prunning.sam"));

system("samtools faidx $refSeq") == 0 or die "samtools faidx $refSeq failed (status $?)\n";
my $fai  = $refSeq.".fai";
system("samtools view -bt $fai prunning.sam > prunning.bam") == 0 or die "samtools view failed to create prunning.bam (status $?)\n";






================================================
FILE: scripts/ragoo2ALLHiC.pl
================================================
#!/usr/bin/perl -w
# ragoo2ALLHiC.pl -- drive an ALLHiC scaffolding run from RaGOO ordering files.
#
# Steps, in order:
#   1. link the draft assembly (-r) and the BAM (-b) to the fixed names
#      draft.asm.fasta / sample.clean.bam in the working directory;
#   2. read every ordering file named in the -l list and write clusters.txt,
#      one group per ordering file (group id = file name without the
#      "_orderings.txt" suffix);
#   3. run "allhic extract", "ALLHiC_rescue", "allhic optimize" for each
#      group, and finally "ALLHiC_build".

use strict;
use warnings;
use Getopt::Std;

# Getopt::Std fills these package variables.
our ($opt_l, $opt_r, $opt_b, $opt_e);
getopts "l:r:b:e:";


if ((!defined $opt_l)|| (!defined $opt_r) ||(!defined $opt_b)) {
    die "************************************************************************
    Usage: perl ragoo2ALLHiC -l orderings.list -r draft.asm.fasta -b sample.clean.bam 
      -h : help and usage.
      -l : ordering.list contains a list of output files from ragoo
      -r : draft contig assembly
      -b : sample.clean.bam
      -e : restriction sites, optional, default GATC
           MboI: GATC; HindIII: AAGCTT
************************************************************************\n";
}else{
  print "************************************************************************\n";
  print "Version demo\n";
  print "Copyright to Tanger\n";
  print "RUNNING...\n";
  print "************************************************************************\n";
}

$opt_e = "GATC" unless defined $opt_e;

link_input($opt_r, "draft.asm.fasta");
link_input($opt_b, "sample.clean.bam");

# %cntdb: group id -> { contig name -> number of times it was listed }.
my %cntdb = ();
open(my $list_fh, '<', $opt_l) or die "Cannot open ordering list $opt_l: $!\n";
while (my $file = <$list_fh>) {
    chomp $file;
    $file =~ s/^\s+//;
    $file =~ s/\s+$//;
    next if $file eq "";    # blank lines must not count as a group

    my $gid = (split /\//, $file)[-1];
    $gid =~ s/_orderings\.txt//g;

    open(my $fh, '<', $file) or die "Cannot open ordering file $file: $!\n";
    while (my $row = <$fh>) {
        # The contig name is the first whitespace-separated column.
        my ($ctg) = split ' ', $row;
        next unless defined $ctg;    # skip empty rows
        $cntdb{$gid}->{$ctg}++;
    }
    close $fh;
}
close $list_fh;

# Only groups that received at least one contig are written to clusters.txt.
# NOTE(review): the optimize loop below assumes ALLHiC_rescue writes
# group1.txt .. groupN.txt, one per cluster -- confirm against ALLHiC_rescue.
my $num_g = keys %cntdb;

open(my $out, '>', "clusters.txt") or die "Cannot write clusters.txt: $!\n";
print {$out} "#Group\tnContigs\tContigs\n";
foreach my $g (sort keys %cntdb) {
    my @contigs = sort keys %{ $cntdb{$g} };
    my $num     = @contigs;
    # Every contig name is followed by a tab (trailing tab kept on purpose
    # so the file layout is unchanged).
    print {$out} "$g\t$num\t", join("", map { "$_\t" } @contigs), "\n";
}
close $out or die "Cannot close clusters.txt: $!\n";

print "#### Counting restriction sites from draft assembly\n";
print "allhic extract sample.clean.bam draft.asm.fasta --RE $opt_e\n...\n\n";
run_step(1, "allhic", "extract", "sample.clean.bam", "draft.asm.fasta", "--RE", $opt_e);

print "### Rescue unanchored contigs\n";
my $countRE = "sample.clean.counts_".$opt_e.".txt";
print "ALLHiC_rescue -r draft.asm.fasta -b sample.clean.bam -c clusters.txt -i $countRE -m 1\n...\n\n";
run_step(1, "ALLHiC_rescue", "-r", "draft.asm.fasta", "-b", "sample.clean.bam",
            "-c", "clusters.txt", "-i", $countRE, "-m", "1");

foreach my $i (1..$num_g){
    my $gn = "group".$i.".txt";
    print "### Scaffolding $gn\n";
    print "allhic optimize $gn sample.clean.clm\n...\n\n";
    # A failure in one group is reported but does not stop the other groups.
    run_step(0, "allhic", "optimize", $gn, "sample.clean.clm");
}

print "### Build ALLHiC assembly\n";
run_step(1, "ALLHiC_build", "draft.asm.fasta");

print "Done ...\n";

# Create ./$name as a symbolic link to $source unless $name already exists.
sub link_input {
    my ($source, $name) = @_;
    if (-e $name) {
        print "check $name file, exist\n";
        return;
    }
    symlink($source, "./$name")
        or die "Cannot link $source to ./$name: $!\n";
    return;
}

# Run an external command without going through the shell.
# $fatal true  : die when the command fails.
# $fatal false : warn and return 0 when the command fails.
sub run_step {
    my ($fatal, @cmd) = @_;
    return 1 if system(@cmd) == 0;
    my $why = ($? == -1) ? "could not be started: $!"
                         : "exited with status " . ($? >> 8);
    my $msg = "Command '@cmd' $why\n";
    die $msg if $fatal;
    warn $msg;
    return 0;
}



================================================
FILE: scripts/release3DDNA.pl
================================================
#!/usr/bin/perl -w
# release3DDNA.pl -- convert a scaffold-level FASTA into ALLHiC build inputs.
#
# Usage: perl release3DDNA.pl No_of_chr seq.FINAL.fasta
#
# Sequences are ranked by length (longest first). The first No_of_chr are
# renamed Chr1..ChrK, the rest scaffold<rank>. Outputs:
#   chr.fasta               renamed sequences, one line per sequence
#   tig.HiCcorrected.fasta  every sequence cut at each 'N' into tigNNNNNNN
#   ChrN.tour               contig order ("tigNNNNNNN+ ...") for each ChrN
# Finally "ALLHiC_build tig.HiCcorrected.fasta" is run.

use strict;
use warnings;

die "Usage: perl $0 No_of_chr seq.FINAL.fasta\n" if(!defined ($ARGV[0]) or !defined($ARGV[1]));
my $Kchr = $ARGV[0];
die "Usage: perl $0 No_of_chr seq.FINAL.fasta\nNo_of_chr must be a non-negative integer\n"
    unless $Kchr =~ /^\d+$/;

# %infordb: original sequence name -> { seq => sequence, len => its length }.
my %infordb;
open(my $in, '<', $ARGV[1]) or die "Cannot open $ARGV[1]: $!\n";
{
    # Read one FASTA record per iteration; the change is confined to this block.
    local $/ = '>';
    my $leader = <$in>;    # discard everything up to the first '>'
    while (my $record = <$in>) {
        chomp $record;
        my ($name, $seq) = split(/\n/, $record, 2);
        next unless defined $name;      # empty record, e.g. ">>"
        $seq = '' unless defined $seq;  # header without sequence
        $seq =~ s/\s+//g;
        $infordb{$name}->{'seq'} = $seq;
        $infordb{$name}->{'len'} = length $seq;
    }
}
close $in;

open(my $chr_out, '>', "chr.fasta") or die "Cannot write chr.fasta: $!\n";
open(my $tig_out, '>', "tig.HiCcorrected.fasta")
    or die "Cannot write tig.HiCcorrected.fasta: $!\n";

my $count = 0;    # rank of the current sequence
my $ctgn  = 0;    # running contig number across all sequences
foreach my $scaf (sort { $infordb{$b}->{'len'} <=> $infordb{$a}->{'len'} or $a cmp $b }
                  keys %infordb) {
    $count++;
    my $chrname = ($count <= $Kchr) ? 'Chr'.$count : 'scaffold'.$count;
    my $seq     = $infordb{$scaf}->{'seq'};
    print {$chr_out} ">$chrname\n$seq\n";

    print "Process $chrname\n";
    my $tour = "";
    # Every 'N' is a break point; runs of N produce empty pieces that are skipped.
    foreach my $piece (split /N/, $seq) {
        next if $piece eq "";
        $ctgn++;
        my $ctgname = "tig" . sprintf("%07d", $ctgn);
        $tour      .= $ctgname . "+ ";
        print {$tig_out} ">$ctgname\n$piece\n";
    }

    # Tour files are written for chromosomes only.
    next if $chrname =~ /scaffold/;
    my $otour = $chrname . ".tour";
    open(my $tour_out, '>', $otour) or die "Cannot write $otour: $!\n";
    print {$tour_out} ">$chrname\n$tour\n";
    close $tour_out or die "Cannot close $otour: $!\n";
}
close $chr_out or die "Cannot close chr.fasta: $!\n";
close $tig_out or die "Cannot close tig.HiCcorrected.fasta: $!\n";

system("ALLHiC_build", "tig.HiCcorrected.fasta") == 0
    or die "ALLHiC_build tig.HiCcorrected.fasta failed\n";


================================================
FILE: scripts/remove_reads.pl
================================================
#!/usr/bin/perl -w
# remove_reads.pl -- write prunning.sam with the reads selected for removal
# filtered out.
#
# Takes no arguments; all files are in the working directory:
#   bam.list              BAM files to filter, one per line
#   removedb_Allele.txt   read names to drop: comma-separated list in column 3
#   removedb_nonBest.txt  read names to drop: comma-separated list in column 5
#   prunning.sam          output (headerless, as printed by "samtools view")
# Alignments whose mate reference (SAM column 7) is "*" are dropped as well.

use strict;
use warnings;

my %bamdb = ();
open(my $list_fh, '<', "bam.list") or die "Cannot open bam.list: $!\n";
while (my $bam = <$list_fh>) {
    chomp $bam;
    $bam =~ s/\s+//g;
    next unless $bam =~ /\.bam/;    # literal ".bam", not "any character + bam"
    $bamdb{$bam}++;
}
close $list_fh;

my %removedb = ();
load_removed_reads("removedb_Allele.txt",  2, \%removedb);
load_removed_reads("removedb_nonBest.txt", 4, \%removedb);

my $num_of_remove_reads = keys %removedb;
print "Removing $num_of_remove_reads reads\n";

open(my $out, '>', "prunning.sam") or die "Cannot write prunning.sam: $!\n";
foreach my $bam (sort keys %bamdb) {
    # List-form pipe open: the file name never passes through a shell.
    open(my $fh, '-|', 'samtools', 'view', $bam)
        or die "Cannot run samtools view $bam: $!\n";
    while (my $line = <$fh>) {
        chomp $line;
        my ($rname, $ctg2) = (split /\s+/, $line)[0, 6];
        next if defined $ctg2 and $ctg2 eq "*";
        next if defined $rname and exists $removedb{$rname};
        print {$out} "$line\n";
    }
    # A non-zero exit means the output would be silently incomplete.
    close $fh or die "samtools view $bam failed (exit status " . ($? >> 8) . ")\n";
}
close $out or die "Cannot close prunning.sam: $!\n";

# Add every read name found in column $col (0-based, comma-separated list)
# of the whitespace-delimited file $file to the hash referenced by $db.
sub load_removed_reads {
    my ($file, $col, $db) = @_;
    open(my $fh, '<', $file) or die "Cannot open $file: $!\n";
    while (my $line = <$fh>) {
        chomp $line;
        my $info = (split /\s+/, $line)[$col];
        next unless defined $info;    # short or empty line
        $db->{$_}++ for split /,/, $info;
    }
    close $fh;
    return;
}





================================================
FILE: scripts/remove_small_contigs.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Author: Xiaofei Zeng
# Email: xiaofei_zeng@whu.edu.cn
# Created Time: 2021-04-16 18:16

import argparse


def assembly_to_groups(assembly, len_cutoff):
    """Parse an assembly file into a contig lookup and a list of groups.

    Lines starting with '>' are read as ``>name index length``; every other
    non-blank line is one group given as whitespace-separated indices, each
    optionally wrapped in '-' characters.

    Args:
        assembly: path of the assembly file.
        len_cutoff: contigs whose length is below this value are left out of
            the groups (they stay in the returned lookup).

    Returns:
        (ctg_dict, cluster_list): ``ctg_dict`` maps index (str) to contig
        name; ``cluster_list`` holds one list of indices per group line, in
        file order.
    """
    index_to_name = {}
    groups = []
    too_short = set()
    with open(assembly) as handle:
        for raw in handle:
            fields = raw.split()
            if not fields:
                # blank or whitespace-only line
                continue
            if raw.startswith('>'):
                index = fields[1]
                index_to_name[index] = fields[0][1:]
                if int(fields[2]) < len_cutoff:
                    too_short.add(index)
                continue
            kept = []
            for token in fields:
                index = token.strip('-')
                if index not in too_short:
                    kept.append(index)
            groups.append(kept)
    return index_to_name, groups


def output_clusters(ctg_dict, cluster_list):
    """Write the groups to ``prunning.clusters.txt`` in the working directory.

    Each row is ``<ngroup>g<n>`` (total number of groups, then the 1-based
    group number), the number of contigs, and the space-separated contig
    names looked up in ``ctg_dict``.
    """
    total = len(cluster_list)
    row_template = '{0}g{1}\t{2}\t{3}\n'
    with open('prunning.clusters.txt', 'w') as out:
        out.write('#Group\tnContigs\tContigs\n')
        group_no = 0
        for members in cluster_list:
            group_no += 1
            names = [ctg_dict[index] for index in members]
            out.write(row_template.format(total, group_no, len(members), ' '.join(names)))


def output_counts(ctg_dict, counts):
    """Copy the rows of a counts file whose contig is present in ``ctg_dict``.

    Args:
        ctg_dict: mapping whose values are the contig names to keep.
        counts: path of the input counts file. Lines starting with '#' are
            copied unchanged; for other lines the first column is the
            contig name.

    The result is written to ``sub.<file name>`` in the working directory,
    so an input given with a directory part still yields a valid path.
    Blank lines are skipped.
    """
    import os

    keep = set(ctg_dict.values())  # O(1) membership test per row
    out_path = 'sub.' + os.path.basename(counts)
    with open(counts) as fin, open(out_path, 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue
            cols = line.split()
            if cols and cols[0] in keep:
                fout.write(line)


def output_fasta(ctg_dict, fasta):
    """Copy the FASTA records whose name is present in ``ctg_dict``.

    Args:
        ctg_dict: mapping whose values are the contig names to keep.
        fasta: path of the input FASTA file. A record's name is the first
            whitespace-delimited word of its header, without the '>'.

    The result is written to ``sub.<file name>`` in the working directory,
    so an input given with a directory part still yields a valid path.
    """
    import os

    keep = set(ctg_dict.values())  # O(1) membership test per header
    out_path = 'sub.' + os.path.basename(fasta)
    output = False  # True while inside a record that is being copied
    with open(fasta) as fin, open(out_path, 'w') as fout:
        for line in fin:
            if line.startswith('>'):
                output = line.split()[0][1:] in keep
            if output:
                fout.write(line)


def main():
    """Command-line entry point.

    Always writes prunning.clusters.txt from the assembly file; additionally
    filters the FASTA and/or counts file when --fasta / --counts are given.
    --len_cutoff is given in Kbp and converted to bp before use.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('assembly', help='*.review.assembly (output file of juicebox manual grouping), used to generate new prunning.clusters.txt')
    cli.add_argument('--fasta', default=None, help='input fasta file of contigs, this parameter will remove contigs not in .review.assembly, optional')
    cli.add_argument('--counts', default=None, help='input prunning.counts_RE.txt, this parameter will remove contigs not in .review.assembly, optional')
    cli.add_argument('--len_cutoff', default=100, type=float, help='length cutoff, default: %(default)s Kbp')
    opts = cli.parse_args()

    cutoff_bp = opts.len_cutoff*1000
    ctg_dict, cluster_list = assembly_to_groups(opts.assembly, cutoff_bp)
    output_clusters(ctg_dict, cluster_list)

    # Optional outputs, in the same order as before: FASTA first, then counts.
    for path, writer in ((opts.fasta, output_fasta), (opts.counts, output_counts)):
        if path:
            writer(ctg_dict, path)


if __name__ == '__main__':
    main()



================================================
FILE: scripts/simuCTG.pl
================================================
#!/usr/bin/perl -w
# simuCTG.pl -- cut a chromosome-level assembly into simulated contigs whose
# lengths are drawn from a normal distribution (via R's rnorm).
#
# Outputs in the working directory:
#   new_genome.posi.bed      1-based start/end of every simulated contig
#   chrUn.fasta              sequences whose header matches chrUn/ChrUn,
#                            copied uncut
#   ctg.n<N50>_m<ave>.fasta  simulated contigs followed by the chrUn sequences
# Requires "R" on the PATH. Rscript.txt, Rscript.* and x.txt are scratch
# files that are overwritten and removed.

use strict;
use warnings;
use Getopt::Std;

# Getopt::Std fills these package variables.
our ($opt_i, $opt_m, $opt_s);
getopts "i:m:s:";

if ((!defined $opt_i)|| (!defined $opt_m)  || (!defined $opt_s)) {
    die "************************************************************************
    Usage: perl $0 -i input.fasta -m mean -s SD
      -h : help and usage.
      -i : input.fasta, chromosome assembly
      -m : mean length
      -s : sd
************************************************************************\n";
}else{
  print "************************************************************************\n";
  print "Version 1.1\n";
  print "Copyright to Tanger\n";
  print "RUNNING...\n";
  print "************************************************************************\n";
}

my $mean = parse_size($opt_m, "-m");
my $sd   = parse_size($opt_s, "-s");

print "1. generate a contig assembly with Average length = $mean bp ...\n";

# Lengths of everything written to ctg.fasta; used only when faSize.pl
# is unavailable (see step 2).
my @ctg_lengths;

open(my $un_fh,  '>', "chrUn.fasta")         or die "Cannot write chrUn.fasta: $!\n";
open(my $bed_fh, '>', "new_genome.posi.bed") or die "Cannot write new_genome.posi.bed: $!\n";
open(my $ctg_fh, '>', "ctg.fasta")           or die "Cannot write ctg.fasta: $!\n";
open(my $in_fh,  '<', $opt_i)                or die "Cannot open $opt_i: $!\n";
{
    # One FASTA record per read; the change is confined to this block.
    local $/ = '>';
    my $leader = <$in_fh>;    # discard everything up to the first '>'
    while (my $record = <$in_fh>) {
        chomp $record;
        my ($header, $seq) = split(/\n/, $record, 2);
        next unless defined $header;
        $seq = '' unless defined $seq;
        $seq =~ s/\s+//g;

        if ($header =~ /[Cc]hrUn/) {
            print {$un_fh} ">$header\n$seq\n";
            push @ctg_lengths, length $seq;
            next;
        }

        # Use the first word of the header as the sequence id so that BED
        # rows and contig names stay well-formed when the header carries a
        # description.
        my ($chrn) = split ' ', $header;
        next unless defined $chrn;
        my $total_len = length $seq;

        # Ask for enough lengths to cover the chromosome with a margin.
        # Dividing by the smaller of mean and SD never requests fewer
        # values than dividing by SD alone.
        my $unit    = ($mean < $sd) ? $mean : $sd;
        my $num_seq = int($total_len / $unit) + 500;
        my @lengths = simulate_lengths($num_seq, $mean, $sd);

        my $end   = 0;    # end coordinate of the previous contig
        my $count = 0;    # contig number within this chromosome
        foreach my $len (@lengths) {
            next if $len == 0;                # would give an empty contig
            my $start = $end + 1;
            last if $start > $total_len;      # chromosome fully covered
            $end = $start + $len - 1;
            $end = $total_len if $end > $total_len;   # clamp the last contig
            print {$bed_fh} "$chrn\t$start\t$end\n";

            $count++;
            my $piece = substr($seq, $start - 1, $end - $start + 1);
            print {$ctg_fh} ">" . $chrn . ".ctg" . $count . "\n" . $piece . "\n";
            push @ctg_lengths, length $piece;
        }
        warn "Simulated lengths ran out before the end of $chrn ($end of $total_len bp covered)\n"
            if $end < $total_len;
    }
}
close $in_fh;
close $bed_fh or die "Cannot close new_genome.posi.bed: $!\n";
close $un_fh  or die "Cannot close chrUn.fasta: $!\n";

# ctg.fasta = simulated contigs followed by the untouched chrUn sequences.
open(my $un_in, '<', "chrUn.fasta") or die "Cannot read chrUn.fasta: $!\n";
while (my $line = <$un_in>) {
    print {$ctg_fh} $line;
}
close $un_in;
close $ctg_fh or die "Cannot close ctg.fasta: $!\n";

print "2. get statistics for the contig assembly ...\n";
# faSize.pl is run once; its report is shown and then parsed.
my $content = `perl ~/software/script/faSize.pl ctg.fasta`;
$content = '' unless defined $content;
print $content;

my ($N50) = $content =~ /N50:\s+(\d+)/;
my ($ave) = $content =~ /Average\s+length:\s+(\d+)/;
if (!defined $N50 or !defined $ave) {
    # NOTE(review): fallback values are computed here; faSize.pl's own
    # definitions of N50 / average may differ slightly -- confirm.
    warn "Could not read N50/average from faSize.pl output; computing them directly\n";
    my ($n50_calc, $ave_calc) = length_stats(@ctg_lengths);
    $N50 = $n50_calc unless defined $N50;
    $ave = $ave_calc unless defined $ave;
}
my $ctgname = "ctg."."n".$N50."_m".$ave.".fasta";
rename("ctg.fasta", "./$ctgname") or die "Cannot rename ctg.fasta to $ctgname: $!\n";

# Convert a size such as "500k", "1.5m" or "20000" to a number of bases.
# Dies when the value is not a positive number.
sub parse_size {
    my ($text, $flag) = @_;
    my $value  = lc $text;
    my $factor = 1;
    if ($value =~ /m/) {
        $value =~ s/m//g;
        $factor = 1000000;
    }
    elsif ($value =~ /k/) {
        $value =~ s/k//g;
        $factor = 1000;
    }
    die "Invalid value '$text' for $flag: expected a positive number, optionally followed by k or m\n"
        unless $value =~ /^(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/ and $value > 0;
    return $value * $factor;
}

# Draw $n values from N($mean, $sd) with R and return them as non-negative
# integers (fraction truncated, sign dropped).
sub simulate_lengths {
    my ($n, $mu, $sigma) = @_;
    local $/ = "\n";    # callers may be reading FASTA records with $/ = '>'

    open(my $rs, '>', "Rscript.txt") or die "Cannot write Rscript.txt: $!\n";
    print {$rs} "data<-rnorm($n,mean=$mu,sd=$sigma)\n";
    print {$rs} "write.table(data,file='x.txt') \n";
    close $rs or die "Cannot close Rscript.txt: $!\n";

    system("R", "CMD", "BATCH", "--no-save", "./Rscript.txt") == 0
        or die "R CMD BATCH ./Rscript.txt failed\n";

    open(my $xfh, '<', "x.txt") or die "Cannot read x.txt: $!\n";
    my $header = <$xfh>;    # first line is the column header, not a value
    my @lengths;
    while (my $row = <$xfh>) {
        chomp $row;
        # Each row is: row label, value.
        my $value = (split /\s+/, $row)[1];
        next unless defined $value;
        push @lengths, abs(int $value);
    }
    close $xfh;

    unlink "x.txt", glob("Rscript.*");
    return @lengths;
}

# Return (N50, integer average) of a list of lengths; (0, 0) for no input.
sub length_stats {
    my @sorted = sort { $b <=> $a } @_;
    return (0, 0) unless @sorted;
    my $total = 0;
    $total += $_ for @sorted;
    my ($running, $n50) = (0, 0);
    foreach my $len (@sorted) {
        $running += $len;
        if ($running * 2 >= $total) {
            $n50 = $len;
            last;
        }
    }
    return ($n50, int($total / @sorted));
}


================================================
FILE: scripts/statAGP.pl
================================================
#!/usr/bin/perl -w
# statAGP.pl -- summarise an AGP file: contigs and length per chromosome,
# totals, and the anchoring rate.
#
# Usage: perl statAGP.pl chr.agp
#
# A component is counted as unanchored when its object name (column 1)
# equals its component name (column 6). Column 8 is used as the component
# length and column 3 as the running end coordinate of the object.

use strict;
use warnings;

die "Usage: perl $0 chr.agp\n" if(!defined $ARGV[0]);
my $agp = $ARGV[0];
my %uctgdb;    # unanchored component name -> "Unanchor"
my %actgdb;    # anchored component name   -> "Anchor"
my %chrdb;     # object name -> { ctg => component count, len => largest end }
my $sumL = 0;  # total component length
my $sumC = 0;  # number of component lines
my $sumU = 0;  # total length of unanchored components

# The file is read directly instead of through "grep -v contig": that
# filter also discarded every component whose name contains "contig".
open(my $in, '<', $agp) or die "Cannot open $agp: $!\n";
while (my $line = <$in>) {
    chomp $line;
    next if $line =~ /^#/;            # comment / header line
    my @data = split(/\s+/, $line);
    next if @data < 8;                # blank or truncated line
    # Gap lines: component type N or U, or gap type "contig" in column 7.
    next if $data[4] =~ /^[NU]$/ or $data[6] eq 'contig';

    $sumC++;
    $sumL   += $data[7];
    if($data[0] eq $data[5]){
        $uctgdb{$data[5]} = "Unanchor";
        $sumU += $data[7];
    }else{
        $actgdb{$data[5]} = "Anchor";
        $chrdb{$data[0]}->{'ctg'}++;
        my $seen = $chrdb{$data[0]}->{'len'};
        $chrdb{$data[0]}->{'len'} = $data[2]
            if !defined $seen or $data[2] > $seen;
    }
}
close $in;

my $numU = keys %uctgdb;
my $numA = keys %actgdb;
my $sumA = 0;
print "ChrID	Anchored_ctg	Length\n";
foreach my $chrn (sort { $chrdb{$b}->{'len'} <=> $chrdb{$a}->{'len'} or $a cmp $b } keys %chrdb){
    $sumA += $chrdb{$chrn}->{'len'};
    print "$chrn	$chrdb{$chrn}->{'ctg'}	$chrdb{$chrn}->{'len'}\n";
}

# Output labels are kept exactly as before so existing parsers keep working.
print "Total number of contigs (bp): $sumC\n";
print "Total length of contigs (bp): $sumL\n";
print "Total number of anchored contgis: $numA\n";
print "Total length of chromosome level assembly (bp): $sumA\n";
print "Number of unanchored contigs: $numU\n";
print "Length of unanchored contigs: $sumU\n";
# Guard against an AGP without component lines (would divide by zero).
my $arate = ($sumL > 0) ? (1-$sumU/$sumL)*100 : 0;
$arate = sprintf("%.2f",$arate);
print "Anchor rate (%): $arate\n";

Download .txt

gitextract_i521fsdf/

├── .gitmodules
├── README.md
├── allhic.v0.9.8
├── bin/
│   ├── ALLHiC_build
│   ├── ALLHiC_corrector
│   ├── ALLHiC_partition
│   ├── ALLHiC_pip.sh
│   ├── ALLHiC_plot
│   ├── ALLHiC_prune
│   ├── ALLHiC_rescue
│   └── allhic
└── scripts/
    ├── ALLHiC2ALLMAPS.pl
    ├── PreprocessSAMs.pl
    ├── agp2tour.pl
    ├── bam2CLM.pl
    ├── bam2CLM_simple.pl
    ├── bam2net.pl
    ├── bam_HiCplotter.py
    ├── blastn_parse.pl
    ├── classify.pl
    ├── filterBAM_forHiC.pl
    ├── gmap2AlleleTable.pl
    ├── gmap2AlleleTableBED.pl
    ├── link_superscaffold.pl
    ├── make_bed_around_RE_site.pl
    ├── mc_bam.pl
    ├── odering2tour.pl
    ├── partition.pl
    ├── partition_gmap.pl
    ├── partition_gmap.py
    ├── prune.pl
    ├── ragoo2ALLHiC.pl
    ├── release3DDNA.pl
    ├── remove_reads.pl
    ├── remove_small_contigs.py
    ├── simuCTG.pl
    └── statAGP.pl

Download .txt

SYMBOL INDEX (14 symbols across 3 files)

FILE: scripts/bam_HiCplotter.py
  function get_read_pos_with_sam_bam_file (line 10) | def get_read_pos_with_sam_bam_file(sam_bam_file):
  function get_chr_len (line 37) | def get_chr_len(chr_list):
  function calc_read_count_per_bin (line 51) | def calc_read_count_per_bin(chr_len_db, chr_order, read_on_chr, bin_size):
  function draw_heatmap (line 115) | def draw_heatmap(data, chrn, bin_size, ext):

FILE: scripts/partition_gmap.py
  function get_opt (line 9) | def get_opt():
  function read_fasta (line 19) | def read_fasta(in_fa):
  function load_allele (line 34) | def load_allele(allele_table):
  function split_files (line 63) | def split_files(chrn, allele_table, ref, bam_file, wrkdir):
  function partition_gmap (line 86) | def partition_gmap(ref, allele_table, bam, wrkdir, threads):

FILE: scripts/remove_small_contigs.py
  function assembly_to_groups (line 11) | def assembly_to_groups(assembly, len_cutoff):
  function output_clusters (line 29) | def output_clusters(ctg_dict, cluster_list):
  function output_counts (line 37) | def output_counts(ctg_dict, counts):
  function output_fasta (line 48) | def output_fasta(ctg_dict, fasta):
  function main (line 62) | def main():

Download .json

Condensed preview — 37 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (116K chars).

[
  {
    "path": ".gitmodules",
    "chars": 74,
    "preview": "[submodule \"src\"]\n\tpath = src\n\turl = https://github.com/tanghaibao/allhic\n"
  },
  {
    "path": "README.md",
    "chars": 501,
    "preview": "# ALLHiC\nALLHiC: phasing and scaffolding polyploid genomes based on Hi-C data  \nSee wiki for details (https://github.com"
  },
  {
    "path": "bin/ALLHiC_build",
    "chars": 1578,
    "preview": "#!/usr/bin/perl -w\n\n\ndie \"Usage: perl $0 refSeq.fasta\\n\" if(!(defined $ARGV[0]));\n\nprint \"1. tour format to agp ...\\n\";\n"
  },
  {
    "path": "bin/ALLHiC_corrector",
    "chars": 8997,
    "preview": "#!/usr/bin/env python\nimport sys\nimport multiprocessing\nimport math\nimport numpy as np\nimport pysam\nimport time\nimport a"
  },
  {
    "path": "bin/ALLHiC_partition",
    "chars": 1676,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:r:e:k:m:\";\n\n\nif ( (!defined $opt_r)|| (!defined $opt_e)|| (!defined $opt"
  },
  {
    "path": "bin/ALLHiC_pip.sh",
    "chars": 2796,
    "preview": "#!/bin/bash\n\nusage()\n{\n\techo \"    Usage: `basename $0` -r reference -1 R1.fq -2 R2.fq -k group_count [-e enzyme] [-t thr"
  },
  {
    "path": "bin/ALLHiC_plot",
    "chars": 10040,
    "preview": "#!/usr/bin/env python\nimport argparse\nimport numpy as np\nimport matplotlib as mpl\nmpl.use(\"Agg\")\nimport matplotlib.pyplo"
  },
  {
    "path": "bin/ALLHiC_rescue",
    "chars": 5091,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:r:c:i:m:\";\n\n\n\nif ( (!defined $opt_b)|| (!defined $opt_r)|| (!defined $op"
  },
  {
    "path": "scripts/ALLHiC2ALLMAPS.pl",
    "chars": 653,
    "preview": "#!/usr/bin/perl -w\n### Convert ALLHiC output AGP file to ALLMAPS input csv file\nprint \"Convert ALLHiC output AGP file to"
  },
  {
    "path": "scripts/PreprocessSAMs.pl",
    "chars": 8482,
    "preview": "#!/usr/bin/perl -w\nuse strict;\n\n\n\n# PreprocessSAMs.pl\n#\n# Syntax: PreprocessSAMs.pl <sam or bam filename> <draft assembl"
  },
  {
    "path": "scripts/agp2tour.pl",
    "chars": 665,
    "preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 chr.agp\\n\" if(!defined $ARGV[0]);\nmy %infordb;\nmy $cnt = 0;\nopen(IN, \"grep -v co"
  },
  {
    "path": "scripts/bam2CLM.pl",
    "chars": 5176,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:r:d:\";\n\n\nif ((!defined $opt_b)|| (!defined $opt_r) || (!defined $opt_d) "
  },
  {
    "path": "scripts/bam2CLM_simple.pl",
    "chars": 3007,
    "preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 mapping.bam refSeq.fasta\\n\" if((!defined $ARGV[0]) or (!defined $ARGV[1]));\n\nmy "
  },
  {
    "path": "scripts/bam2net.pl",
    "chars": 1420,
    "preview": "#!/usr/bin/perl -w\nuse Getopt::Std;\ngetopts \"c:b:o:\";\n\n\nif ((!defined $opt_c)|| (!defined $opt_b)||(!defined $opt_o) ) {"
  },
  {
    "path": "scripts/bam_HiCplotter.py",
    "chars": 6809,
    "preview": "#!/usr/bin/env python\nimport os\nimport sys\nimport gc\nfrom math import log\nimport time\n\n\n# Get position of read based on "
  },
  {
    "path": "scripts/blastn_parse.pl",
    "chars": 1721,
    "preview": "#!/usr/bin/perl -w\n\n###This script was used to parse blast+ result (outfmt 6)\n###you can get best hit with parameter -b "
  },
  {
    "path": "scripts/classify.pl",
    "chars": 3698,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"i:p:r:g:\";\n\n\nif ((!defined $opt_i)|| (!defined $opt_p) || (!defined $opt_r"
  },
  {
    "path": "scripts/filterBAM_forHiC.pl",
    "chars": 1156,
    "preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 file.bam out.sam\\n\" if(!defined($ARGV[0]) or !defined($ARGV[1]));\nopen(OUT, \"> $"
  },
  {
    "path": "scripts/gmap2AlleleTable.pl",
    "chars": 711,
    "preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 ref.gff3\\n\" if(!defined ($ARGV[0]));\nmy $refGFF = $ARGV[0];\nopen(IN, \"grep 'gene"
  },
  {
    "path": "scripts/gmap2AlleleTableBED.pl",
    "chars": 675,
    "preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 ref.bed\\n\" if(!defined ($ARGV[0]));\nmy $refGFF = $ARGV[0];\nopen(IN, \"grep 'gene'"
  },
  {
    "path": "scripts/link_superscaffold.pl",
    "chars": 3611,
    "preview": "#!/usr/bin/perl -w\n\nmy %namedb;\nmy %removedb;\nwhile(<DATA>){\n\tchomp;\n\tmy ($id,$name) = (split/\\s+/,$_)[0,1];\n\t$namedb{$n"
  },
  {
    "path": "scripts/make_bed_around_RE_site.pl",
    "chars": 7717,
    "preview": "#!/usr/bin/perl -w\nuse strict;\n\n\n# make_bed_around_restriction_site.pl: Make a BED file representing the regions around "
  },
  {
    "path": "scripts/mc_bam.pl",
    "chars": 2123,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:a:r:\";\n\n\nif ((!defined $opt_b)|| (!defined $opt_a) ||(!defined $opt_r)) "
  },
  {
    "path": "scripts/odering2tour.pl",
    "chars": 350,
    "preview": "#!/usr/bin/perl -w\n\nwhile(my $file=glob \"*_orderings.txt\"){\n\tmy $name = $file; \n\t\t $name =~ s/_orderings.txt//g;\n\t\t $nam"
  },
  {
    "path": "scripts/partition.pl",
    "chars": 2932,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"g:d:b:r:\";\n\n\nif ((!defined $opt_g)|| (!defined $opt_r)) {\n    die \"*******"
  },
  {
    "path": "scripts/partition_gmap.pl",
    "chars": 3248,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"g:d:b:r:l:\";\n\n\nif ((!defined $opt_g)|| (!defined $opt_r)) {\n    die \"*****"
  },
  {
    "path": "scripts/partition_gmap.py",
    "chars": 4071,
    "preview": "#!/usr/bin/env python\nimport sys\nimport os\nimport argparse\nimport multiprocessing\nimport pysam\n\n\ndef get_opt():\n\tgroup ="
  },
  {
    "path": "scripts/prune.pl",
    "chars": 4175,
    "preview": "#!/usr/bin/perl -w\n\n\nuse Getopt::Std;\ngetopts \"i:b:r:\";\n\n\nif ((!defined $opt_i)|| (!defined $opt_b)|| (!defined $opt_r))"
  },
  {
    "path": "scripts/ragoo2ALLHiC.pl",
    "chars": 2615,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"l:r:b:e:\";\n\n\nif ((!defined $opt_l)|| (!defined $opt_r) ||(!defined $opt_b)"
  },
  {
    "path": "scripts/release3DDNA.pl",
    "chars": 1410,
    "preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 No_of_chr seq.FINAL.fasta\\n\" if(!defined ($ARGV[0]) or !defined($ARGV[1]));\nmy $"
  },
  {
    "path": "scripts/remove_reads.pl",
    "chars": 1131,
    "preview": "#!/usr/bin/perl -w\n\nmy %bamdb  = ();\nopen(IN, \"bam.list\") or die\"\";\nwhile(<IN>){\n        chomp;\n        my $bam = $_;\n  "
  },
  {
    "path": "scripts/remove_small_contigs.py",
    "chars": 2813,
    "preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\n# Author: Xiaofei Zeng\n# Email: xiaofei_zeng@whu.edu.cn\n# Created Time: 2"
  },
  {
    "path": "scripts/simuCTG.pl",
    "chars": 3294,
    "preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"i:m:s:\";\n\nif ((!defined $opt_i)|| (!defined $opt_m)  || (!defined $opt_s))"
  },
  {
    "path": "scripts/statAGP.pl",
    "chars": 1215,
    "preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 chr.agp\\n\" if(!defined $ARGV[0]);\nmy $agp = $ARGV[0];\nmy %uctgdb;\nmy %actgdb;\nmy"
  }
]

// ... and 3 more files (download for full content)

About this extraction

This page contains the full source code of the tangerzhang/ALLHiC GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 37 files (10.5 MB), approximately 35.9k tokens, and a symbol index with 14 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo