Repository: tangerzhang/ALLHiC
Branch: master
Commit: 4710d96b2872
Files: 37
Total size: 10.5 MB
Directory structure:
gitextract_i521fsdf/
├── .gitmodules
├── README.md
├── allhic.v0.9.8
├── bin/
│ ├── ALLHiC_build
│ ├── ALLHiC_corrector
│ ├── ALLHiC_partition
│ ├── ALLHiC_pip.sh
│ ├── ALLHiC_plot
│ ├── ALLHiC_prune
│ ├── ALLHiC_rescue
│ └── allhic
└── scripts/
├── ALLHiC2ALLMAPS.pl
├── PreprocessSAMs.pl
├── agp2tour.pl
├── bam2CLM.pl
├── bam2CLM_simple.pl
├── bam2net.pl
├── bam_HiCplotter.py
├── blastn_parse.pl
├── classify.pl
├── filterBAM_forHiC.pl
├── gmap2AlleleTable.pl
├── gmap2AlleleTableBED.pl
├── link_superscaffold.pl
├── make_bed_around_RE_site.pl
├── mc_bam.pl
├── odering2tour.pl
├── partition.pl
├── partition_gmap.pl
├── partition_gmap.py
├── prune.pl
├── ragoo2ALLHiC.pl
├── release3DDNA.pl
├── remove_reads.pl
├── remove_small_contigs.py
├── simuCTG.pl
└── statAGP.pl
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitmodules
================================================
[submodule "src"]
path = src
url = https://github.com/tanghaibao/allhic
================================================
FILE: README.md
================================================
# ALLHiC
ALLHiC: phasing and scaffolding polyploid genomes based on Hi-C data
See wiki for details (https://github.com/tangerzhang/ALLHiC/wiki).
# Note
Please be aware that ALLHiC is no longer maintained. We recommend using two recently released algorithm packages developed by our team, which are reference-free and much faster:
- **C-Phasing**: [C-Phasing GitHub Repository](https://github.com/wangyibin/CPhasing)
- **HapHiC**: [HapHiC GitHub Repository](https://github.com/zengxiaofei/HapHiC)
================================================
FILE: bin/ALLHiC_build
================================================
#!/usr/bin/perl -w
# ALLHiC_build: concatenate contigs into scaffolds following the order and
# orientation stored in every *.tour file of the current directory.
#
# Usage: perl ALLHiC_build refSeq.fasta
#
# Outputs (written to the current directory):
#   groups.agp        - one component line per placed contig plus one gap
#                       line between neighbouring contigs; contigs that are
#                       not placed in any tour get a single-component record
#   groups.asm.fasta  - scaffold sequences (contigs joined by runs of N)
#                       followed by the unplaced contigs
use strict;

die "Usage: perl $0 refSeq.fasta\n" if(!(defined $ARGV[0]));
print "1. tour format to agp ...\n";
my $refSeq = $ARGV[0];
my $gapLen = 100;               # length of the gap placed between contigs
my $Nseq   = "N" x $gapLen;
my %anchordb;                   # contig name => times it was placed in a tour
my %seqdb;                      # contig name => sequence (whitespace removed)

# Read the FASTA file one record at a time. The record separator is changed
# only inside this block so that later line-based reads are unaffected.
{
    local $/ = '>';
    open(my $in, '<', $refSeq) or die "Error: cannot open $refSeq: $!\n";
    <$in>;                      # discard everything before the first '>'
    while(my $record = <$in>){
        chomp $record;
        my ($ctg,$seq) = split(/\n/,$record,2);
        next if(!defined $ctg);
        $seq = "" if(!defined $seq);    # header without sequence lines
        $ctg =~ s/\s+.*//g;             # keep the first word of the header
        $seq =~ s/\s+//g;
        $seqdb{$ctg} = $seq;
    }
    close $in;
}

open(my $agp_out, '>', "groups.agp") or die "Error: $!";
open(my $seq_out, '>', "groups.asm.fasta") or die "Error: $!";
foreach my $tour (glob "*.tour"){
    print "Processing $tour ...\n";
    # Group name = file name without the trailing ".tour" (literal dot,
    # anchored at the end so other parts of the name are left alone).
    (my $gid = $tour) =~ s/\.tour$//;

    # The final ordering is the last non-blank line of the tour file; read
    # it directly instead of passing the file name through a shell.
    my $last_line = "";
    open(my $tin, '<', $tour) or die "Error: cannot open $tour: $!\n";
    while(my $line = <$tin>){
        $last_line = $line if($line =~ /\S/);
    }
    close $tin;

    # Keep only well-formed "<contig><+|->" tokens whose contig is known, so
    # the coordinates below never depend on undefined values.
    my @placed;
    foreach my $item (split(' ',$last_line)){
        my ($ctg,$dir) = $item =~ /^(.+)([+-])$/;
        if(!defined $dir){
            warn "WARNING: cannot parse '$item' in $tour, skipped\n";
            next;
        }
        if(!exists($seqdb{$ctg})){
            warn "WARNING: $ctg from $tour not found in $refSeq, skipped\n";
            next;
        }
        push @placed, [$ctg,$dir];
    }

    my $start   = 0;
    my $end     = 0;
    my $count   = 0;            # AGP part number within this scaffold
    my $fullSeq = "";
    foreach my $i (0..$#placed){
        my ($ctg,$dir) = @{$placed[$i]};
        my $len = length $seqdb{$ctg};
        $anchordb{$ctg}++;
        $start = $end + 1;
        $end   = $start + $len - 1;
        $count++;
        print $agp_out "$gid $start $end $count W $ctg 1 $len $dir\n";
        my $seq = uc $seqdb{$ctg};
        if($dir eq "-"){
            # reverse complement
            $seq = scalar reverse $seq;
            $seq =~ tr/ATGC/TACG/;
        }
        $fullSeq .= $seq;
        next if($i == $#placed);        # no gap after the last contig
        $start = $end + 1;
        $end   = $start + $gapLen - 1;
        $count++;
        print $agp_out "$gid $start $end $count U $gapLen contig yes map\n";
        $fullSeq .= $Nseq;
    }
    print $seq_out ">$gid\n$fullSeq\n";
}

# Contigs that were not placed in any tour are emitted unchanged.
foreach my $ctg (keys %seqdb){
    next if(exists($anchordb{$ctg}));
    my $len = length $seqdb{$ctg};
    print $agp_out "$ctg 1 $len 1 W $ctg 1 $len +\n";
    print $seq_out ">$ctg\n$seqdb{$ctg}\n";
}
close $agp_out or die "Error: cannot finish writing groups.agp: $!\n";
close $seq_out or die "Error: cannot finish writing groups.asm.fasta: $!\n";
================================================
FILE: bin/ALLHiC_corrector
================================================
#!/usr/bin/env python
import sys
import multiprocessing
import math
import numpy as np
import pysam
import time
import argparse
def time_print(str):
    """Print a message prefixed with the current local time (HH:MM:SS) in green."""
    stamp = time.strftime('[%H:%M:%S]', time.localtime(time.time()))
    print("\033[32m%s\033[0m %s" % (stamp, str))
def get_opt():
    """Parse the ALLHiC_corrector command line.

    Returns the argparse namespace with attributes mapping, reference,
    output, percent, sensitive, mapq, wide, narrow, depletion and threads.
    """
    # (flags, add_argument keyword arguments), in the order they are shown
    # in the help output.
    option_table = (
        (("-m", "--mapping"), dict(help="Input mapping file", required=True)),
        (("-r", "--reference"), dict(help="Contig fasta file", required=True)),
        (("-o", "--output"), dict(help="Corrected fasta file", required=True)),
        (("-p", "--percent"), dict(type=float, help="Percent of the map to saturate, default is 0.95", default=0.95)),
        (("-s", "--sensitive"), dict(type=float, help="sensitivity to depletion score, default is 0.5", default=0.5)),
        (("-q", "--mapq"), dict(type=int, help="MAPQ of mapping lower bound, default is 1", default=1)),
        (("-w", "--wide"), dict(type=int, help="Resolution for first pass search of mismatches, default is 25000 bp", default=25000)),
        (("-n", "--narrow"), dict(type=int, help="Resolution for the precise mismatch localizaton, n<w default is 1000 bp", default=1000)),
        (("-d", "--depletion"), dict(type=int, help="The size of the region to aggregate the depletion score in the wide path, d >= 2*w, default is 100000 bp", default=100000)),
        (("-t", "--threads"), dict(type=int, help="Threads, default is 1", default=1)),
    )
    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
def get_ctg_len(bam):
    """Return {sequence name: length} built from the SQ records of the header of `bam`."""
    return {record['SN']: record['LN'] for record in map(dict, bam.header["SQ"])}
def get_pos_list(bam_fetch, min_mapq):
    """Collect [read start, mate start] for the usable intra-contig pairs.

    A record is kept only when both start positions are set (not -1), the
    read and its mate are on the same reference, and the read's mapq is not
    below `min_mapq`.
    """
    return [
        [aln.reference_start, aln.next_reference_start]
        for aln in bam_fetch
        if aln.reference_start != -1
        and aln.next_reference_start != -1
        and aln.reference_name == aln.next_reference_name
        and not (aln.mapq < min_mapq)
    ]
def get_hic_list(pos_list, bin_size):
    """Count read pairs per (bin, bin) cell.

    Parameters:
        pos_list: list of [pos1, pos2] integer pairs.
        bin_size: bin width; every position is floored to a multiple of it.

    Returns:
        A list of [bin_start1, bin_start2, count], one entry per distinct
        cell, in order of first appearance. An empty `pos_list` yields [].
    """
    # np.matrix([]) has one (empty) row, so the empty case must be handled
    # before any indexing takes place.
    if len(pos_list) == 0:
        return []
    # A plain ndarray replaces the deprecated np.matrix; tolist() turns the
    # binned coordinates into Python ints that can be used as dict keys.
    binned = (np.asarray(pos_list) // bin_size * bin_size).tolist()
    counts = {}
    for bin1, bin2 in binned:
        cell = (bin1, bin2)
        counts[cell] = counts.get(cell, 0) + 1
    return [[bin1, bin2, cnt] for (bin1, bin2), cnt in counts.items()]
def calc_sat_level(hic_list, pct):
    """Return the `pct` quantile of the off-diagonal counts in `hic_list`.

    Entries whose two bin coordinates are equal are ignored. NaN counts are
    sorted as zeros and then written back as NaN at the front of the sorted
    list. Returns -1 when there is no off-diagonal entry. The quantile is
    linearly interpolated at rank pct*(n+1) and clamped to the first/last
    value.
    """
    off_diag = [row[2] for row in hic_list if row[0] != row[1]]
    if off_diag == []:
        return -1

    nan_cnt = sum(1 for value in off_diag if math.isnan(value))
    ranked = sorted(0 if math.isnan(value) else value for value in off_diag)
    ranked[:nan_cnt] = [float('nan')] * nan_cnt

    size = len(ranked)
    if size == 1:
        return ranked[0]
    pos = pct*(size+1)
    if pos < 1:
        return ranked[0]
    if pos >= size:
        return ranked[-1]
    lower = int(pos)
    d = pos-lower
    return ranked[lower-1]+d*(ranked[lower]-ranked[lower-1])
def precompute_dep_score(hic_list, bin_size, dep_size, sat_level):
    """Accumulate, per bin position, the capped counts of the cells spanning it.

    Cells with a NaN count or with e-s larger than `dep_size` are ignored.
    Each remaining count is capped at `sat_level` and added to every position
    s+bin_size, s+2*bin_size, ... that is smaller than e.

    Returns (score_db, smallest position, largest position); the two
    positions are 0 when no position received a score.
    """
    score_db = {}
    for s, e, val in hic_list:
        if math.isnan(val) or e-s > dep_size:
            continue
        capped = sat_level if val >= sat_level else val
        for position in range(s+bin_size, e, bin_size):
            score_db[position] = score_db.get(position, 0) + capped
    if len(score_db) == 0:
        return score_db, 0, 0
    return score_db, min(score_db), max(score_db)
def get_sub_score_db(score_db, min_pos, max_pos, bin_size, dep_size):
    """Return the scores on a regular grid, with 0 for positions absent from `score_db`.

    The grid runs from min_pos+dep_size-2*bin_size (inclusive) to
    max_pos-dep_size+3*bin_size (exclusive) in steps of `bin_size`.
    """
    first = min_pos+dep_size-2*bin_size
    stop = max_pos-dep_size+3*bin_size
    return {position: score_db.get(position, 0) for position in range(first, stop, bin_size)}
def get_wide_mismatch(score_db, thr, bin_size):
    """Return [start, end] runs of consecutive positions whose score is below `thr`.

    Positions are visited in ascending order. A run starts at the first
    position below the threshold and ends at the next position that is not
    below it; a run still open after the last position is closed at that
    last position plus `bin_size`.
    """
    regions = []
    run_start = None        # None while no run is open
    position = None
    for position in sorted(score_db):
        is_low = score_db[position] < thr
        if is_low and run_start is None:
            run_start = position
        elif not is_low and run_start is not None:
            regions.append([run_start, position])
            run_start = None
    if run_start is not None:
        regions.append([run_start, position+bin_size])
    return regions
def get_mismatch(hic_list, bin_size, dep_size, pct, sens, is_wide):
    """Compute depletion scores and, in wide mode, the low-score regions.

    Returns [] when calc_sat_level reports -1. Otherwise returns the list
    from get_wide_mismatch when `is_wide` is true (or [] if no score was
    collected), and the score dictionary itself when `is_wide` is false.
    """
    sat_level = round(calc_sat_level(hic_list, pct), 5)
    if sat_level == -1:
        return []

    # Threshold below which a position counts as depleted.
    thr = sens*sat_level*0.5*dep_size/bin_size*(dep_size/bin_size-1)

    score_db, min_pos, max_pos = precompute_dep_score(hic_list, bin_size, dep_size, sat_level)
    if score_db:
        score_db = get_sub_score_db(score_db, min_pos, max_pos, bin_size, dep_size)

    if not is_wide:
        return score_db
    if not score_db:
        return []
    return get_wide_mismatch(score_db, thr, bin_size)
def merge_region(wide_list, narrow_score, bin_size):
    """Narrow each wide region down to its lowest-scoring narrow bins.

    `wide_list` holds [start, end] regions and `narrow_score` maps narrow-bin
    positions to scores. While walking the positions in ascending order a
    running minimum is kept; once a position reaches the end of the current
    wide region, every narrow bin of that region whose score equals the
    minimum is selected. Adjacent selected bins are merged into [start, end]
    intervals. `wide_list` is returned unchanged when `narrow_score` is an
    empty dict or when no bin is selected.
    """
    if narrow_score == {}:
        return wide_list

    def bins_at_minimum(region, lowest):
        # Narrow bins of `region` whose score equals the running minimum.
        return [[p, p+bin_size]
                for p in range(region[0], region[1], bin_size)
                if p in narrow_score and narrow_score[p] == lowest]

    selected = []
    region_idx = 0
    lowest = 0
    for pos in sorted(narrow_score):
        if region_idx >= len(wide_list):
            break
        region = wide_list[region_idx]
        score = narrow_score[pos]
        # The minimum restarts at every position up to the region start and
        # only decreases afterwards.
        if pos <= region[0] or score < lowest:
            lowest = score
        if pos+bin_size <= region[0]:
            continue
        if pos >= region[1]:
            selected.extend(bins_at_minimum(region, lowest))
            region_idx += 1
    if region_idx < len(wide_list):
        selected.extend(bins_at_minimum(wide_list[region_idx], lowest))
    if selected == []:
        return wide_list

    merged = []
    prev_end = 0            # 0 means that no interval has been opened yet
    for start, end in selected:
        if prev_end == 0:
            merged.append([start])
        elif start != prev_end:
            merged[-1].append(prev_end)
            merged.append([start])
        prev_end = end
    merged[-1].append(prev_end)
    return merged
def pipeline(in_bam, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, ctg):
    """Locate mismatch regions on one contig.

    A wide pass (bins of `bin_size`, depletion window `dep_size`) finds
    candidate regions; a narrow pass (bins of `narrow_bin_size`, depletion
    window `bin_size`) scores them, and merge_region combines both.

    Returns a list of [start, end] regions, or [] when the contig has no
    usable read pairs or no wide mismatch.
    """
    time_print("\tContig: %s Getting mapping list"%ctg)
    with pysam.AlignmentFile(in_bam, 'rb') as bam:
        mapping_list = get_pos_list(bam.fetch(contig=ctg), mapq)
    if not mapping_list:
        time_print("\tContig: %s Could not found mapping list"%ctg)
        return []

    time_print("\tContig: %s Getting hic list with bin size: %d"%(ctg, bin_size))
    wide_hic = get_hic_list(mapping_list, bin_size)
    time_print("\tContig: %s Getting wide mismatch"%ctg)
    wide_mismatch = get_mismatch(wide_hic, bin_size, dep_size, percent, sensitive, True)
    if wide_mismatch == []:
        time_print("\tContig: %s Could not found mismatch"%ctg)
        return []

    # Narrow pass: the wide bin size becomes the depletion window.
    narrow_dep, narrow_bin = bin_size, narrow_bin_size
    time_print("\tContig: %s Getting narrow score with bin size: %d"%(ctg, narrow_bin))
    narrow_hic = get_hic_list(mapping_list, narrow_bin)
    narrow_score = get_mismatch(narrow_hic, narrow_bin, narrow_dep, percent, sensitive, False)
    time_print("\tContig: %s Getting narrow mismatch"%ctg)
    narrow_mismatch = merge_region(wide_mismatch, narrow_score, narrow_bin)
    if narrow_mismatch == wide_mismatch:
        time_print("\tContig: %s Wide mismatch without update"%ctg)
    return narrow_mismatch
def _split_contig(ctg, seq, regions):
    """Cut `seq` at the mismatch `regions` and return [(name, sequence), ...].

    Piece names are "<ctg>_<first>_<last>" with 1-based inclusive
    coordinates of the piece on the original contig. Region boundaries are
    shifted by -1 as in the original implementation, then clamped to the
    contig so that a name always matches the sequence it labels; empty
    pieces are dropped.
    """
    pieces = []
    total = len(seq)
    base = 0
    for s, e in regions:
        s = min(max(s-1, base), total)
        e = min(max(e-1, s), total)
        for lo, hi in ((base, s), (s, e)):
            if hi > lo:
                pieces.append(("%s_%d_%d" % (ctg, lo+1, hi), seq[lo:hi]))
        base = e
    if base < total:
        # The tail starts at 1-based position base+1, like every other piece.
        pieces.append(("%s_%d_%d" % (ctg, base+1, total), seq[base:]))
    return pieces


def ALLHiC_correct(in_bam, in_fa, out_fa, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, thread):
    """Split contigs of `in_fa` at Hi-C mismatch regions and write `out_fa`.

    Parameters:
        in_bam: alignment file opened with pysam in 'rb' mode; its header
            supplies the contig list.
        in_fa / out_fa: input and output FASTA paths.
        mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive:
            forwarded unchanged to pipeline() for every contig.
        thread: number of worker processes.

    Contigs without mismatches are copied unchanged; the others are written
    as several records named "<contig>_<first>_<last>".
    """
    time_print("Reading mapping")
    with pysam.AlignmentFile(in_bam, 'rb') as bam:
        ctg_len = get_ctg_len(bam)

    time_print("Running pipeline")
    pool = multiprocessing.Pool(processes=thread)
    res = []
    for ctg in ctg_len:
        r = pool.apply_async(pipeline, (in_bam, mapq, dep_size, bin_size, narrow_bin_size, percent, sensitive, ctg,))
        res.append([ctg, r])
    pool.close()
    pool.join()

    narrow_mismatch = {}
    for ctg, r in res:
        sub_mismatch = r.get()      # re-raises an exception from the worker
        if sub_mismatch != []:
            narrow_mismatch[ctg] = sub_mismatch
    time_print("Found all mismatches")

    time_print("Reading origin fasta")
    fa_db = {}
    current = None
    with open(in_fa, 'r') as fin:
        for line in fin:
            if line.startswith('>'):
                current = line.strip().split()[0][1:]
                fa_db[current] = []
            elif current is not None:
                # Text before the first header belongs to no record.
                fa_db[current].append(line.strip())
    for name in fa_db:
        fa_db[name] = ''.join(fa_db[name])

    time_print("Writing result")
    with open(out_fa, 'w') as fout:
        for ctg in sorted(fa_db):
            if ctg in narrow_mismatch:
                for name, sub_seq in _split_contig(ctg, fa_db[ctg], narrow_mismatch[ctg]):
                    fout.write(">%s\n%s\n" % (name, sub_seq))
            else:
                fout.write(">%s\n%s\n" % (ctg, fa_db[ctg]))
    time_print("Finished")
if __name__ == "__main__":
    # Map the parsed command-line options onto the ALLHiC_correct parameters.
    opts = get_opt()
    ALLHiC_correct(
        in_bam=opts.mapping,
        in_fa=opts.reference,
        out_fa=opts.output,
        mapq=opts.mapq,
        dep_size=opts.depletion,
        bin_size=opts.wide,
        narrow_bin_size=opts.narrow,
        percent=opts.percent,
        sensitive=opts.sensitive,
        thread=opts.threads,
    )
================================================
FILE: bin/ALLHiC_partition
================================================
#!/usr/bin/perl -w
# ALLHiC_partition: run "allhic extract" on a BAM file and then
# "allhic partition" on the counts/pairs files derived from its name.
use strict;
use Getopt::Std;
our ($opt_b, $opt_r, $opt_e, $opt_k, $opt_m);
getopts "b:r:e:k:m:";

if ( (!defined $opt_r)|| (!defined $opt_e)|| (!defined $opt_k)) {
    die "************************************************************************
Usage: ALLHiC_partition -r draft.asm.fasta -e enzyme_sites -k Num of groups
-h : help and usage.
-b : prunned bam (optional, default prunning.bam)
-r : draft.sam.fasta
-e : enzyme_sites (HindIII: AAGCTT; MboI: GATC, Arima)
-k : number of groups (user defined K value)
-m : minimum number of restriction sites (default, 25)
************************************************************************\n";
}

# Print a command, run it without a shell (so file names need no quoting)
# and stop when it fails instead of continuing with missing files.
sub run_cmd {
    my @cmd = @_;
    print "CMD: @cmd\n";
    system(@cmd) == 0
        or die "Error: command failed (status $?): @cmd\n";
    return;
}

my $bam    = (defined $opt_b)?$opt_b:"prunning.bam";
my $refSeq = $opt_r;
my $esites = uc $opt_e;
$esites = "AAGCTT" if($esites eq "HINDIII");
$esites = "GATC"   if($esites eq "MBOI");
my $K      = $opt_k;
my $minRes = (defined $opt_m)?$opt_m:25;

print "Extract function: calculate an empirical distribution of Hi-C link size based on intra-contig links\n";
my $re_arg = $esites;
if ($esites eq "ARIMA") {
    # Arima uses several recognition sites; the counts file name joins
    # them with underscores.
    $re_arg = "GATCGATC,GANTGATC,GANTANTC,GATCANTC";
    $esites = "GATCGATC_GANTGATC_GANTANTC_GATCANTC";
}
run_cmd("allhic", "extract", $bam, $refSeq, "--RE", $re_arg);

print "Partition contigs based on prunning bam file\n";
# Only the trailing ".bam" is stripped: the dot is literal and the match is
# anchored, so other parts of the path are left untouched.
(my $prefix = $bam) =~ s/\.bam$//;
my $counts_file = $prefix.".counts_".$esites.".txt";
my $pairs_file  = $prefix.".pairs.txt";
run_cmd("allhic", "partition", $counts_file, $pairs_file, $K, "--minREs", $minRes);
================================================
FILE: bin/ALLHiC_pip.sh
================================================
#!/bin/bash
# Print the command-line help and terminate the script with status 0.
usage()
{
cat <<EOF
 Usage: $(basename $0) -r reference -1 R1.fq -2 R2.fq -k group_count [-e enzyme] [-t threads] [-b bin_size]
 -r: reference genome
 -1: Lib_R1.fq.gz
 -2: Lib_R2.fq.gz
 -k: group_count
 -e: enzyme_sites (HindIII: AAGCTT; MboI: GATC), default: HindIII
 -t: threads, default: 10
 -b: bin_size for hic heatmap, can be divided with comma, default: 500k
EOF
exit 0
}
### get options
while getopts ':r:1:2:k:e:t:b:' OPT; do
    case $OPT in
        r) ref="$OPTARG";;
        1) R1="$OPTARG";;
        2) R2="$OPTARG";;
        e) enzyme="$OPTARG";;
        k) group_count="$OPTARG";;
        t) threads="$OPTARG";;
        b) bin_size="$OPTARG";;
        # unknown option or missing option argument
        ?) usage;;
    esac
done

bwa="bwa"

### check required variants (quoted so that empty values and values with
### spaces are tested correctly)
if [ -z "$ref" ] || [ -z "$R1" ] || [ -z "$R2" ] || [ -z "$group_count" ]; then
    usage
fi

### set default values while optional variants were not set
if [ -z "$threads" ]; then
    threads=10
fi

if [ -z "$bin_size" ]; then
    bin_size=500k
fi

if [ -z "$enzyme" ]; then
    enzyme=AAGCTT
fi

enzyme=`echo "$enzyme" | tr '[a-z]' '[A-Z]'`

if [ "$enzyme" = HINDIII ]; then
    enzyme=AAGCTT
fi

if [ "$enzyme" = MBOI ]; then
    enzyme=GATC
fi

### link required files
ln -s "${ref}" ./seq.fasta
ln -s "${R1}" ./Lib_R1.fastq.gz
ln -s "${R2}" ./Lib_R2.fastq.gz

### index reference genome
bwa index seq.fasta
samtools faidx seq.fasta

### 1st round of mapping
bwa mem -SP5M -t "$threads" seq.fasta Lib_R1.fastq.gz Lib_R2.fastq.gz \
    | samtools view -hF 256 - \
    | samtools sort -@ "$threads" -o sorted.bam -T tmp.ali
samtools index sorted.bam

### correct contig
ALLHiC_corrector -m sorted.bam -r seq.fasta -o seq.HiCcorrected.fasta -t "$threads"

### 2nd round of mapping
bwa index seq.HiCcorrected.fasta
samtools faidx seq.HiCcorrected.fasta
bwa mem -SP5M -t "$threads" seq.HiCcorrected.fasta Lib_R1.fastq.gz Lib_R2.fastq.gz \
    | samtools view -hF 256 - \
    | samtools sort -@ "$threads" -o sample.bwa_mem.bam -T tmp.ali

### filter bam ("-" makes the second samtools read the piped stream explicitly)
samtools view -bq 40 sample.bwa_mem.bam | samtools view -bt seq.HiCcorrected.fasta.fai - > sample.unique.bam
PreprocessSAMs.pl sample.unique.bam seq.HiCcorrected.fasta "$enzyme"

### partition
ALLHiC_partition -r seq.HiCcorrected.fasta -e "$enzyme" -k "$group_count" -b sample.unique.REduced.paired_only.bam

### optimize
# -f: the list does not exist on a first run
rm -f cmd.list
for((K=1;K<=$group_count;K++));do echo "allhic optimize sample.unique.REduced.paired_only.counts_${enzyme}.${group_count}g${K}.txt sample.unique.REduced.paired_only.clm" >> cmd.list;done
ParaFly -c cmd.list -CPU "$threads"

### build
ALLHiC_build seq.HiCcorrected.fasta

### plot
samtools faidx groups.asm.fasta
cut -f1,2 groups.asm.fasta.fai|grep sample > chrn.list
# ALLHiC_plot takes its inputs as named options (-b/-a/-l are required, -s is
# the bin size list); heatmaps are written as PDF into its default output
# directory.
ALLHiC_plot -b sample.bwa_mem.bam -a groups.agp -l chrn.list -s "$bin_size"
================================================
FILE: bin/ALLHiC_plot
================================================
#!/usr/bin/env python
import argparse
import numpy as np
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt
import pysam
import time
import os
def time_print(info):
    """Print `info` prefixed with the current local time (HH:MM:SS) in green."""
    stamp = time.strftime('[%H:%M:%S]', time.localtime(time.time()))
    print("\033[32m%s\033[0m %s" % (stamp, info))
def get_opts():
    """Parse the ALLHiC_plot command line.

    Returns the argparse namespace with attributes bam, agp, list, npz,
    min_size, size and outdir.
    """
    # (flags, add_argument keyword arguments), in help-output order.
    option_table = (
        (('-b', '--bam'), dict(help='Input bam file', required=True)),
        (('-a', '--agp'), dict(help='Input AGP file', required=True)),
        (('-l', '--list'), dict(help='Chromosome list, contain: ID\tLength', required=True)),
        (('-n', '--npz'), dict(help="npz file of hic signal, optional, if not exist, it will be generate after reading hic signals, or it will be loaded for drawing other resolution of heatmap", default="")),
        (('-m', '--min_size'), dict(help="Minium bin size of heatmap, default=50k", default="50k")),
        (('-s', '--size'), dict(help="Bin size of heatmap, can be a list separated by comma, default=500k, notice: it must be n times of min_size (n is integer) or we will ajust it to nearest one", default="500k")),
        (('-o', '--outdir'), dict(help='Output directory, default=workdir', default='workdir')),
    )
    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
# Read the chromosome list
def get_chr_len(chr_list):
    """Read a whitespace-separated "ID length" table.

    Blank lines are skipped; columns after the second are ignored.

    Returns (chr_len_db, chr_order): a dict from ID to integer length and
    the list of IDs in file order.
    """
    chr_len_db = {}
    chr_order = []
    with open(chr_list, 'r') as f_in:
        for fields in (row.strip().split() for row in f_in):
            if not fields:
                continue
            name = fields[0]
            chr_order.append(name)
            chr_len_db[name] = int(fields[1])
    return chr_len_db, chr_order
# Calc read counts on each bin
def calc_read_count_per_min_size(chr_len_db, chr_order, bam, agp, min_size):
    """Build the whole-genome contact matrix at `min_size` resolution.

    Parameters:
        chr_len_db: dict from chromosome ID to length.
        chr_order: chromosome IDs in plotting order.
        bam: path of the alignment file (opened with pysam in 'rb' mode).
        agp: path of the AGP file that places contigs on chromosomes.
        min_size: bin width in bp.

    Returns:
        (bin_offset, read_count): `bin_offset[i]` is the index of the first
        bin of the i-th chromosome (with one trailing entry holding the total
        number of bins); `read_count` is a symmetric square numpy array of
        pair counts between bins.
    """
    long_bin_size = min_size

    # Position of every chromosome in chr_order (first occurrence), looked
    # up once here instead of searching the list for every read.
    chr_index = {}
    for idx, chrn in enumerate(chr_order):
        chr_index.setdefault(chrn, idx)

    bin_count = [0 for i in range(0, len(chr_order)+1)]
    total_bin_count = 0
    for chrn in chr_len_db:
        bin_count_of_chr = int(round((chr_len_db[chrn]*1.0/long_bin_size+0.51)))
        total_bin_count += bin_count_of_chr
        bin_count[chr_index[chrn]+1] = bin_count_of_chr
    # Cumulative bin counts give the first bin of each chromosome.
    bin_offset = [0 for i in range(0, len(chr_order)+1)]
    for i in range(1, len(bin_count)):
        bin_offset[i] = bin_count[i]+bin_offset[i-1]

    # A numpy array is allocated directly instead of a nested Python list
    # that is converted at the end.
    read_count_whole_genome = np.zeros((total_bin_count, total_bin_count), dtype=int)

    # contig -> [chromosome, start, end, orientation] from the AGP file
    ctg_on_chr = {}
    with open(agp, 'r') as f_in:
        for line in f_in:
            if line.strip() == '' or line.strip().startswith('#'):
                continue
            data = line.strip().split()
            if data[4] in ('U', 'N'):
                # gap rows do not describe a contig
                continue
            chrn = data[0]
            start_pos = int(data[1])
            end_pos = int(data[2])
            ctg = data[5].replace('_pilon', '')
            direct = data[-1]
            ctg_on_chr[ctg] = [chrn, start_pos, end_pos, direct]

    with pysam.AlignmentFile(bam, 'rb') as fin:
        for line in fin:
            if line.is_unmapped or line.mate_is_unmapped:
                continue
            ctg1 = line.reference_name
            ctg2 = line.next_reference_name
            read_pos1 = line.reference_start+1
            read_pos2 = line.next_reference_start+1
            if ctg1 not in ctg_on_chr or ctg2 not in ctg_on_chr:
                continue
            chrn1, ctg_start_pos1, ctg_end_pos1, ctg_direct1 = ctg_on_chr[ctg1]
            chrn2, ctg_start_pos2, ctg_end_pos2, ctg_direct2 = ctg_on_chr[ctg2]
            if chrn1 not in chr_len_db or chrn2 not in chr_len_db:
                continue

            # Convert contig coordinates to chromosome coordinates; contigs
            # placed in '+' orientation count from their start, all others
            # from their end.
            if ctg_direct1 == '+':
                converted_pos1 = ctg_start_pos1 + read_pos1 - 1
            else:
                converted_pos1 = ctg_end_pos1 - read_pos1 + 1
            if ctg_direct2 == '+':
                converted_pos2 = ctg_start_pos2 + read_pos2 - 1
            else:
                converted_pos2 = ctg_end_pos2 - read_pos2 + 1

            whole_pos1 = bin_offset[chr_index[chrn1]] + int(converted_pos1/long_bin_size)
            whole_pos2 = bin_offset[chr_index[chrn2]] + int(converted_pos2/long_bin_size)
            try:
                read_count_whole_genome[whole_pos1][whole_pos2] += 1
                read_count_whole_genome[whole_pos2][whole_pos1] += 1
            except IndexError:
                time_print("Index error on whole genome: index1: %d, index2: %d, bin counts: %d"%(whole_pos1, whole_pos2, total_bin_count))
    return np.array(bin_offset), read_count_whole_genome
def _short_bin_label(bin_size):
    """Abbreviate a decimal string ending in 9/6/3 zeros with G/M/K; other strings are returned unchanged."""
    for zeros, unit in (('000000000', 'G'), ('000000', 'M'), ('000', 'K')):
        if bin_size.endswith(zeros):
            return bin_size[:-len(zeros)]+unit
    return bin_size


def _rebin_matrix(matrix, ratio):
    """Sum `ratio` x `ratio` blocks of a square matrix.

    The matrix is zero-padded up to a multiple of `ratio` before summing and
    only the floor(len(matrix)/ratio) leading bins are returned.
    """
    total_cnt = len(matrix)
    ratio_cnt = int(round(total_cnt*1.0/ratio+0.51, 0))
    plt_cnt = int(total_cnt*1.0/ratio)
    pad = ratio_cnt*ratio-total_cnt
    binned = np.pad(matrix, ((0, pad), (0, pad)), 'constant', constant_values=0)
    binned = binned.reshape(-1, ratio_cnt, ratio).sum(axis=2)         # merge columns
    binned = binned.reshape(ratio_cnt, -1, ratio_cnt).sum(axis=1)     # merge rows
    return binned[: plt_cnt, : plt_cnt]


def draw_heatmap(read_count_whole_genome_min_size, bin_offset_min_size, ratio, chr_order, min_size):
    """Draw the whole-genome heatmap and the per-chromosome heatmaps.

    Parameters:
        read_count_whole_genome_min_size: square count matrix at `min_size`
            resolution.
        bin_offset_min_size: first bin of every chromosome in that matrix,
            plus one trailing entry.
        ratio: number of `min_size` bins merged into one plotted bin.
        chr_order: chromosome IDs, in the order of `bin_offset_min_size`.
        min_size: bin width of the input matrix in bp.

    Writes "<size>_Whole_genome.pdf" and "<size>_all_chrs.pdf" into the
    current directory, where <size> is the abbreviated plotted bin size.
    """
    bin_size = str(int(ratio*min_size))
    # Sizes that are not a multiple of 1000 keep their plain number instead
    # of leaving the label undefined.
    short_bin_size = _short_bin_label(bin_size)

    # Whole-genome heatmap
    data = _rebin_matrix(read_count_whole_genome_min_size, ratio)
    fn = "%s_Whole_genome.pdf"%short_bin_size
    cmap = plt.get_cmap("YlOrRd")
    cmap.set_over('black')
    ax = plt.gca()
    with np.errstate(divide='ignore'):
        # log2(0) is -inf for empty cells; the warning is suppressed
        hmap = ax.imshow(np.log2(data), interpolation='nearest', origin='lower', cmap=cmap, aspect='equal')
    plt.colorbar(mappable=hmap, cax=None, ax=None, shrink=0.5)
    plt.tick_params(labelsize=6)
    for ticks in ax.get_xticklabels():
        ticks.set_rotation(90)
    for ticks in ax.get_yticklabels():
        ticks.set_rotation(0)
    title = 'Whole_genome_'+short_bin_size
    plt.xlabel("Bins ("+short_bin_size.lower()+"b per bin)", fontsize=8)
    plt.xticks([])
    plt.yticks([])
    plt.title(title, y=1.01, fontsize=12)
    plt.savefig(fn, bbox_inches='tight', dpi=200)
    plt.close('all')

    # One panel per chromosome on a near-square grid
    chr_cnt = len(chr_order)
    row_cnt = int(round(np.sqrt(chr_cnt)+0.51))
    col_cnt = int(round(chr_cnt*1.0/row_cnt+0.51))
    all_fn = '%s_all_chrs.pdf'%short_bin_size
    plt.figure(figsize=(col_cnt*2, row_cnt*2))
    idx = 1
    for chrn in chr_order:
        sr = bin_offset_min_size[idx-1]
        er = bin_offset_min_size[idx]
        sub_data = _rebin_matrix(read_count_whole_genome_min_size[sr: er, sr: er], ratio)
        plt.subplot(row_cnt, col_cnt, idx)
        ax = plt.gca()
        cmap = plt.get_cmap('YlOrRd')
        cmap.set_over('black')
        with np.errstate(divide='ignore'):
            hmap = ax.imshow(np.log2(sub_data), interpolation='nearest', origin='lower', cmap=cmap, aspect='equal')
        plt.colorbar(mappable=hmap, cax=None, ax=None, shrink=0.5)
        plt.tick_params(labelsize=5)
        plt.title(chrn)
        idx += 1
    plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)
    plt.savefig(all_fn, bbox_inches='tight', dpi=200)
    plt.close('all')
def _parse_bin_size(text):
    """Convert a size such as "500k", "1M" or "50000" to an integer number of bp."""
    size = text.upper()
    for unit, zeros in (('K', '000'), ('M', '000000'), ('G', '000000000')):
        size = size.replace(unit, zeros)
    return int(size)


def ALLHiC_plot(bam, agp, chrlist, npzfile, minsize, binsize, outdir):
    """Draw Hi-C heatmaps for every requested bin size.

    Parameters:
        bam, agp, chrlist: input paths (resolved before the working
            directory changes).
        npzfile: optional cache of the signal matrix; "" disables it. An
            existing file is loaded, otherwise it is written after the
            matrix has been computed.
        minsize: resolution of the signal matrix, e.g. "50k".
        binsize: comma-separated list of plotted bin sizes, e.g. "500k,1m".
        outdir: directory receiving the PDF files; created when missing.

    The process works inside `outdir` while drawing and always returns to
    the directory it started from.
    """
    bam = os.path.abspath(bam)
    agp = os.path.abspath(agp)
    chrlist = os.path.abspath(chrlist)
    if npzfile != "":
        npzfile = os.path.abspath(npzfile)

    start_dir = os.getcwd()
    if not os.path.exists(outdir):
        # makedirs also creates missing parent directories
        os.makedirs(outdir)
    os.chdir(outdir)
    try:
        min_size = _parse_bin_size(minsize)

        bin_list = binsize.split(',')
        bin_ratio = []
        for bin_size in bin_list:
            long_bin_size = _parse_bin_size(bin_size)
            # Nearest multiple of min_size, never fewer than one bin (a
            # ratio of 0 would divide by zero while re-binning).
            bin_ratio.append(max(1, int(round(long_bin_size*1.0/min_size+0.01, 0))))

        time_print("Step1: Get chromosome length")
        chr_len_db, chr_order = get_chr_len(chrlist)

        time_print("Step2: Get signal matrix")
        if npzfile != "" and os.path.exists(npzfile):
            npzdata = np.load(npzfile)
            bin_offset_min_size = npzdata['bin_offset_min_size']
            read_count_whole_genome_min_size = npzdata['read_count_whole_genome_min_size']
        else:
            bin_offset_min_size, read_count_whole_genome_min_size = calc_read_count_per_min_size(chr_len_db, chr_order, bam, agp, min_size)
            if npzfile != "":
                np.savez(npzfile.replace('.npz', ''), bin_offset_min_size=bin_offset_min_size, read_count_whole_genome_min_size=read_count_whole_genome_min_size)

        time_print("Step3: Draw heatmap")
        for i in range(0, len(bin_ratio)):
            ratio = bin_ratio[i]
            time_print("Drawing with bin size %s"%bin_list[i])
            draw_heatmap(read_count_whole_genome_min_size, bin_offset_min_size, ratio, chr_order, min_size)
    finally:
        # '..' is only correct for a single-level relative outdir; go back
        # to the recorded starting directory instead.
        os.chdir(start_dir)
    time_print("Success")
if __name__ == "__main__":
    # Map the parsed command-line options onto the ALLHiC_plot parameters.
    opts = get_opts()
    ALLHiC_plot(
        bam=opts.bam,
        agp=opts.agp,
        chrlist=opts.list,
        npzfile=opts.npz,
        minsize=opts.min_size,
        binsize=opts.size,
        outdir=opts.outdir,
    )
================================================
FILE: bin/ALLHiC_rescue
================================================
#!/usr/bin/perl -w
# ALLHiC_rescue: assign contigs that are absent from the clusters file to
# the group they share the highest Hi-C link density with.
#
# Outputs (written to the current directory):
#   signals.txt          - link density between each ungrouped contig and
#                          each grouped contig it is linked to
#   unanchor.signal.txt  - per ungrouped contig: summed density per group,
#                          the best group and its density ('#' marks contigs
#                          at or below the -m threshold)
#   group<N>.txt         - original plus rescued contigs of every cluster
use strict;
use Getopt::Std;
our ($opt_b, $opt_r, $opt_c, $opt_i, $opt_m);
getopts "b:r:c:i:m:";

if ( (!defined $opt_b)|| (!defined $opt_r)|| (!defined $opt_c)|| (!defined $opt_i)) {
    die "**************************************************************************************
Usage: ALLHiC_rescue -r draft.asm.fasta -b sample.clean.bam -c clusters.txt -i counts.file
-h : help and usage.
-b : sample.clean.bam (unpruned bam)
-r : draft.sam.fasta
-c : prunning.clusters.txt
-i : prunning.counts_AAGCTT.txt
-m : minimum single density for rescuing contigs (optional, default 0.01)
**************************************************************************************\n";
}

my $clusters    = $opt_c;
my $counts_file = $opt_i;
my $minSig      = (defined $opt_m)?$opt_m:0.01;

print "Starting rescue ungrouped contigs\n";
print "Reading contig length\n";
my %ctgdb;      # contig => { RECounts => ..., length => ... }
open(my $counts_in, '<', $counts_file) or die "Error: cannot open $counts_file: $!\n";
while(my $line = <$counts_in>){
    chomp $line;
    next if($line =~ /#/);
    my ($ctg, $RECounts, $len) = split(/\s+/,$line);
    next if(!defined $len);             # blank or incomplete line
    $ctgdb{$ctg}->{'RECounts'} = $RECounts;
    $ctgdb{$ctg}->{'length'}   = $len;
}
close $counts_in;

print "Reading link signals ...\n";
my %signaldb;   # "ctgA,ctgB" (names sorted) => number of alignment records
foreach my $bam (split(/,/,$opt_b)){
    print "Reading $bam\n";
    # List-form pipe: the file name is passed to samtools without a shell.
    open(my $in, '-|', 'samtools', 'view', $bam)
        or die "Error: cannot run samtools view $bam: $!\n";
    while(my $line = <$in>){
        chomp $line;
        my ($ctga,$ctgb) = (split(/\s+/,$line))[2,6];
        next if(!defined $ctgb);
        next if($ctgb eq "=");          # mate on the same contig
        next if($ctgb eq "*");          # mate not aligned
        my @pair = sort { $a cmp $b } ($ctga,$ctgb);
        $signaldb{join(",",@pair)}++;
    }
    close($in) or die "Error: samtools view $bam failed\n";
}

print "find ungrouped contigs ...\n";
my %GROUPDB;    # "group<N>" => { origin => "ctg ctg ...", rescued => "..." }
my %anchordb;   # contig => cluster number
my $gid = 0;
open(my $clusters_in, '<', $clusters) or die "Error: cannot open $clusters: $!\n";
while(my $line = <$clusters_in>){
    chomp $line;
    next if($line =~ /#/);
    $gid++;
    my $g = "group".$gid;
    my @data = split(/\s+/,$line);
    # contig names start in the third column
    foreach my $i (2..$#data){
        $anchordb{$data[$i]} = $gid;
        $GROUPDB{$g}->{'origin'} .= $data[$i]." ";
    }
}
close $clusters_in;

print "output HiC link signals ...\n";
open(my $sig_out, '>', "signals.txt") or die "Error: cannot write signals.txt: $!\n";
print $sig_out "#GID unclustered_ctg Linked_reads Anchored_ctgs\n";
# Sorted keys make the file (and the sums computed from it) reproducible.
foreach my $key (sort keys %signaldb){
    my ($ctg1,$ctg2) = split(/,/,$key);
    my $anchored1 = exists($anchordb{$ctg1});
    my $anchored2 = exists($anchordb{$ctg2});
    # exactly one of the two contigs must belong to a cluster
    next if($anchored1 and $anchored2);
    next if(!$anchored1 and !$anchored2);
    my ($ga,$ub) = $anchored1 ? ($ctg1,$ctg2) : ($ctg2,$ctg1);

    my ($missing) = grep { !(exists($ctgdb{$_}) and exists($ctgdb{$_}->{'length'})) } ($ctg1,$ctg2);
    if(defined $missing){
        print "WARNING: $missing not found in $counts_file, PASS\n";
        next;
    }
    my $lenA = $ctgdb{$ga}->{'length'};
    my $lenB = $ctgdb{$ub}->{'length'};
    # links per kb of combined contig length
    my $sigD = ($signaldb{$key}*1000)/($lenA+$lenB);
    print $sig_out "group$anchordb{$ga} $ub $sigD $ga\n";
}
close $sig_out or die "Error: cannot finish writing signals.txt: $!\n";

my %infordb;    # ungrouped contig => { group => summed density }
my %groupdb;    # groups that received at least one signal
open(my $sig_in, '<', "signals.txt") or die "Error: cannot open signals.txt: $!\n";
while(my $line = <$sig_in>){
    chomp $line;
    next if($line =~ /#/);
    my ($group,$ctg,$value) = (split(/\s+/,$line))[0,1,2];
    $infordb{$ctg}->{$group} += $value;
    $groupdb{$group}++;
}
close $sig_in;

open(my $un_out, '>', "unanchor.signal.txt") or die "Error: cannot write unanchor.signal.txt: $!\n";
print $un_out "unanchored_contig ";
foreach my $group (sort keys %groupdb){
    print $un_out "$group ";
}
print $un_out "best_group best_ctg1 sigD best_ctg2\n";
foreach my $ctg (sort keys %infordb){
    my $maxv = 0;
    print $un_out "$ctg ";
    foreach my $group (sort keys %groupdb){
        my $v = exists($infordb{$ctg}->{$group}) ? $infordb{$ctg}->{$group} : 0;
        $maxv = $v if($v>$maxv);
        print $un_out "$v ";
    }
    # Highest density first; equal densities are ordered by group name so
    # the choice does not depend on hash order.
    my ($best_g) = sort {
        $infordb{$ctg}->{$b} <=> $infordb{$ctg}->{$a} or $a cmp $b
    } keys %{$infordb{$ctg}};
    if($maxv>$minSig){
        print $un_out "$best_g $maxv \n";
        $GROUPDB{$best_g}->{'rescued'} .= $ctg." ";
    }else{
        print $un_out "$best_g $maxv #\n";
    }
}
close $un_out or die "Error: cannot finish writing unanchor.signal.txt: $!\n";

print "Output refined clusters \n";
foreach my $group (sort keys %GROUPDB){
    $GROUPDB{$group}->{'origin'}  = "" if(!defined($GROUPDB{$group}->{'origin'}));
    $GROUPDB{$group}->{'rescued'} = "" if(!defined($GROUPDB{$group}->{'rescued'}));
    my @odb = split(/\s+/,$GROUPDB{$group}->{'origin'});
    my @rdb = split(/\s+/,$GROUPDB{$group}->{'rescued'});
    my $no = @odb;
    my $nr = @rdb;
    print "Number of original contigs in $group: $no\n";
    print "Number of rescued contigs in $group: $nr\n";
    my $outfile = $group.".txt";
    open(my $out, '>', $outfile) or die "Error: cannot write $outfile: $!\n";
    print $out "#Contig RECounts Length\n";
    foreach my $ctg (@odb, @rdb){
        if(!(exists($ctgdb{$ctg}) and defined($ctgdb{$ctg}->{'length'}))){
            # a row without counts cannot be written in full
            print "WARNING: $ctg not found in $counts_file, PASS\n";
            next;
        }
        print $out "$ctg $ctgdb{$ctg}->{'RECounts'} $ctgdb{$ctg}->{'length'}\n";
    }
    close $out or die "Error: cannot finish writing $outfile: $!\n";
}
================================================
FILE: bin/allhic
================================================
[File too large to display: 10.4 MB]
================================================
FILE: scripts/ALLHiC2ALLMAPS.pl
================================================
#!/usr/bin/perl -w
### Convert ALLHiC output AGP file to ALLMAPS input csv file
#
# Usage: perl ALLHiC2ALLMAPS.pl groups.agp
#
# For every component (non-gap) row of the AGP file two rows are written to
# hic.csv: one pairing the first object coordinate with the matching
# component coordinate and one pairing the last object coordinate with the
# other component coordinate (swapped for '-' orientation).
use strict;

print "Convert ALLHiC output AGP file to ALLMAPS input csv file\n";
die "Usage: perl $0 groups.agp\n" if(!defined $ARGV[0]);
my $agp = $ARGV[0];

open(my $out, '>', "hic.csv") or die "Error: cannot write hic.csv: $!\n";
print $out "Scafffold ID,scaffold position,LG,genetic position\n";
open(my $in, '<', $agp) or die "Error: cannot open $agp: $!\n";
while(my $line = <$in>){
    chomp $line;
    next if($line =~ /^\s*(?:#|$)/);    # comment or blank line
    my @data = split(/\s+/,$line);
    next if(@data < 9);
    # Gap rows are recognised by their component type (column 5), not by
    # searching the whole line for the word "contig", which also removed
    # every component whose name contains that word.
    next if($data[4] eq "U" or $data[4] eq "N");
    # Any orientation other than '-' is read in forward direction, so no
    # value is carried over from a previous row.
    my ($first,$last) = ($data[8] eq "-")
                      ? ($data[7],$data[6])
                      : ($data[6],$data[7]);
    print $out "$data[5],$first,$data[0],$data[1]\n";
    print $out "$data[5],$last,$data[0],$data[2]\n";
}
close $in;
close $out or die "Error: cannot finish writing hic.csv: $!\n";
================================================
FILE: scripts/PreprocessSAMs.pl
================================================
#!/usr/bin/perl -w
use strict;
# PreprocessSAMs.pl
#
# Syntax: PreprocessSAMs.pl <sam or bam filename> <draft assembly fasta> <enzyme: HINDIII | MBOI | Arima>
#
# This Perl script prepares a SAM/BAM file for use with Lachesis.
# Specifically, it pre-processes the file with bedtools, samtools, picard to remove redundant, chimeric, and/or uninformative read pairs.
# This creates a dataset of Hi-C links with as strong a signal as possible, and it's also as small as possible, so as to reduce I/O runtime in Lachesis.
# (NOTE: As of August 24, 2013, I'm no longer removing PCR duplicates. Picard's MarkDuplicates is extremely slow and resource-intensive - far more so than
# the runtime benefit in Lachesis of having fewer reads. I don't think it's removing PCR duplicates properly, nor do I think PCR duplicate removal is even
# necessary - http://seqanswers.com/forums/showthread.php?t=6854).
#
# This script will determine whether the file is a SAM or a BAM file, and then run the following commands:
#
# COMMAND OUTPUT FILENAME WHAT THE COMMAND DOES
# make_bed_around_RE_site.pl <fasta>.near_<RE>.<range>.bed Prepare the bed file for bedtools intersect (next command)
# bedtools intersect <head>.REduced.bam Remove all reads that aren't within 500 bp of a restriction site
### picard SortSam.jar <head>.REduced.sort_coord.bam Sort the file in coordinate order so PCR duplicates can be removed
### picard MarkDuplicates.jar <head>.REduced.sort_coord.nodups.bam Remove PCR duplicates
### picard SortSam.jar <head>.REduced.nodups.bam Sort the file in query-name order so Lachesis can read it
# samtools view -F12 <head>.REduced.paired_only.bam Filter out all pairs in which both reads are not aligned
# samtools flagstat <head>.REduced.paired_only.flagstat Make a flagstat file that describes the contents of the BAM file
#
#
# The final output file will be <head>.REduced.paired_only.bam. This is what should be entered into the Lachesis INI file under the key "SAM_FILES".
#
# To pre-process several SAM/BAM files in parallel, use the script PreprocessSAMs.sh, which can be submitted to a cluster via qsub.
#
# Josh Burton
# July 2013
################################
# #
# USER-DEFINED PARAMETERS #
# #
################################
# Set to a true value to only print each command that run_cmd() receives.
my $dry_run = 0; # if true, just print the commands to be run - don't actually run them
# The restriction site is no longer hard-coded here; it is derived from the
# third command-line argument further down.
#my $RE_site = 'AAGCTT'; # the restriction enzyme site at which the DNA was cut for the Hi-C experiment
# Paths to the necessary scripts and software packages.
# These are bare command names, so they are resolved through the shell's PATH.
my $make_bed_around_RE_site_pl = 'make_bed_around_RE_site.pl';
my $bedtools = 'bedtools';
my $samtools = 'samtools';
# Settings for the picard steps, which are commented out below.
#my $mem = "16G";
#my $picard_head = "java -d64 -Xmx$mem -jar /net/shendure/vol10/jnburton/extern/picard-tools-1.50/";
################################
# #
# SUBROUTINES #
# #
################################
# Print and then run a command through the shell (unless $dry_run, in which case just print it.)
# First argument: the command to run.
# Second argument (optional): the file to redirect stdout to.
# Returns the status reported by system() (0 on success); returns nothing on a dry run.
# A failing command is reported on STDERR but does not stop the script, so the
# remaining steps still run as they did before.
sub run_cmd(@) {
    my ($cmd,$redirect) = @_;
    print localtime() . ": PreprocessSAMs.pl: $cmd\n";
    return if $dry_run;

    # Only the bare command is logged above; the redirect is appended here.
    my $shell_cmd = $redirect ? "$cmd > $redirect" : $cmd;
    my $status    = system ( $shell_cmd );
    if ( $status != 0 ) {
        print STDERR "PreprocessSAMs.pl: WARNING: command returned status $status: $shell_cmd\n";
    }
    return $status;
}
################################
# #
# CONTROL STARTS HERE #
# #
################################
# Get the command-line arguments, or check syntax.
# Every error path below exits with status 1 so that a calling shell script
# can tell a failed invocation from a successful one.
if ( @ARGV != 3 ) {
    print STDERR "\nPreprocessSAMs.pl: A script to prepare SAM or BAM files for use with Lachesis.\n\nSyntax: $0 <sam-or-bam-filename> <draft-assembly-fasta> enzyme(HINDIII/MBOI/Arima)\n\n";
    exit 1;
}

# Get the input filenames, and check that they actually exist.
# (Only the first two arguments are taken here; the enzyme is handled below.)
my ( $SAM, $fasta) = @ARGV;
unless ( -e $SAM ) {
    print STDERR "$0: Can't find input SAM/BAM file `$SAM`\n";
    exit 1;
}
unless ( -e $fasta) {
    print STDERR "$0: Can't find draft assembly file `$fasta`\n";
    exit 1;
}

# Translate the enzyme argument (case-insensitive name or recognition
# sequence) into the restriction site used to build the BED file.
$ARGV[2] = uc $ARGV[2];
my $RE_site;
if($ARGV[2] eq "HINDIII" or $ARGV[2] eq "AAGCTT"){
    $RE_site = 'AAGCTT';
}elsif($ARGV[2] eq "MBOI" or $ARGV[2] eq "GATC"){
    $RE_site = 'GATC';
}elsif($ARGV[2] eq "ARIMA"){
    $RE_site = 'arima';
}else{
    # Without this branch $RE_site stayed undefined and the script went on to
    # build commands and file names from an empty restriction site.
    print STDERR "$0: Unknown enzyme `$ARGV[2]`.\nExpected HINDIII (or AAGCTT), MBOI (or GATC), or Arima.\n";
    exit 1;
}

# Find the input file's "head" and extension.
my ($head,$extension) = $SAM =~ /^(.*)\.(.*)$/;
# A filename without any dot leaves both captures undefined; use an empty
# extension so the check below reports the problem without warnings.
$extension = '' unless defined $extension;

# Examine the extension to determine whether this is a SAM or a BAM file. If it's a SAM, convert it to BAM. If it doesn't seem to be either, throw an error.
if ( uc($extension) eq 'SAM' ) { run_cmd( "$samtools view -bS $SAM -o $head.bam" ); }
elsif ( uc($extension) eq 'BAM' ) {}
else {
    print STDERR "$0: Can't determine file type for input file `$SAM`.\nFilename should end in '.SAM' or '.BAM' (not case-sensitive.)\n";
    exit 1;
}

print "$0 @ARGV\n\n";
# COMMAND OUTPUT FILENAME WHAT THE COMMAND DOES
# make_bed_around_RE_site.pl <fasta>.near_<RE>.<range>.bed Prepare the bed file for bedtools intersect (next command)
#
# Build the BED file of regions around the restriction sites of the draft
# assembly. A per-site BED file that already exists is reused.
my $BED_RE_file;
if ($RE_site eq "arima") {
    # The "arima" setting is handled as five separate sites: one BED file per
    # site, then a sorted, de-duplicated concatenation of all five. The
    # concatenation is redone on every run.
    $BED_RE_file = "$fasta.near_arima.500.bed";
    my @site_beds;
    foreach my $site ( 'GATC', 'GAAT', 'GACT', 'GAGT', 'GATT' ) {
        my $site_bed = "$fasta.near_$site.500.bed";
        push @site_beds, $site_bed;
        next if -e $site_bed;
        run_cmd( "$make_bed_around_RE_site_pl $fasta $site 500" );
    }
    my $bed_list = join( ' ', @site_beds );
    run_cmd( "cat $bed_list | sort -k1,1 -k2,2b -u > $BED_RE_file" );
}
else {
    $BED_RE_file = "$fasta.near_$RE_site.500.bed";
    if ( ! -e $BED_RE_file ) {
        run_cmd( "$make_bed_around_RE_site_pl $fasta $RE_site 500" );
    }
}
# Run the pre-processing commands on the BAM file.
#
# COMMAND OUTPUT FILENAME WHAT THE COMMAND DOES
# bedtools intersect <head>.REduced.bam Remove all reads that aren't within 500 bp of a restriction site
### picard SortSam.jar <head>.REduced.sort_coord.bam Sort the file in coordinate order so PCR duplicates can be removed
### picard MarkDuplicates.jar <head>.REduced.sort_coord.nodups.bam Remove PCR duplicates
### picard SortSam.jar <head>.REduced.nodups.bam Sort the file in query-name order so Lachesis can read it
# samtools view -F12 <head>.REduced.paired_only.bam Filter out all pairs in which both reads are not aligned
# samtools flagstat <head>.REduced.paired_only.flagstat Make a flagstat file that describes the contents of the BAM file
my $opts = "VALIDATION_STRINGENCY=SILENT";
my $nodups = ""; # or ".nodups", if removing PCR duplicates

# Name the intermediate and final files once, then build the commands from them.
my $reduced_bam  = "$head.REduced.bam";
my $filtered_in  = "$head.REduced$nodups.bam";
my $paired_stem  = "$head.REduced$nodups.paired_only";

run_cmd( "$bedtools intersect -abam $head.bam -b $BED_RE_file > $reduced_bam" );
#run_cmd( "${picard_head}SortSam.jar $opts I=$head.REduced.bam O=$head.REduced.sort_coord.bam SO=coordinate" );
#run_cmd( "${picard_head}MarkDuplicates.jar $opts I=$head.REduced.sort_coord.bam O=$head.REduced.sort_coord.nodups.bam M=$head.REduced.sort_coord.dup_metrics AS=true REMOVE_DUPLICATES=true" );
#run_cmd( "${picard_head}SortSam.jar $opts I=$head.REduced.sort_coord.nodups.bam O=$head.REduced.nodups.bam SO=queryname" );
run_cmd( "$samtools view -F12 $filtered_in -b -o $paired_stem.bam" );
run_cmd( "$samtools flagstat $paired_stem.bam > $paired_stem.flagstat" );
================================================
FILE: scripts/agp2tour.pl
================================================
#!/usr/bin/perl -w
use strict;
# agp2tour.pl: write one "<name>.tour" file per AGP object whose name contains
# "Chr" or "group". Each tour lists the object's components in file order as
# "<component><orientation>" tokens (AGP columns 6 and 9), every token
# followed by a single space.
#
# Usage: perl agp2tour.pl chr.agp
die "Usage: perl $0 chr.agp\n" if(!defined $ARGV[0]);

# Object name => ordered list of tokens. A list per object keeps the order
# correct even if rows of different objects are interleaved; the previous
# single shared counter glued two contigs into one token in that case.
my %tour_of;
open(my $in, '<', $ARGV[0]) or die "Cannot read $ARGV[0]: $!\n";
while(my $line = <$in>){
    chomp $line;
    # Same filter the former `grep -v contig` pipe applied, without a shell.
    next if($line =~ /contig/);
    my @data = split(/\s+/,$line);
    next if(@data < 9);        # blank or truncated row
    my $chrn = $data[0];
    next if(!($chrn=~/Chr/) and !($chrn=~/group/));
    push @{$tour_of{$chrn}}, $data[5].$data[8];
}
close $in;

foreach my $c (sort keys %tour_of){
    my $outfile = $c.".tour";
    open(my $out, '>', $outfile) or die "Cannot write $outfile: $!\n";
    foreach my $token (@{$tour_of{$c}}){
        print $out "$token ";
    }
    close $out or die "Cannot finish writing $outfile: $!\n";
}
================================================
FILE: scripts/bam2CLM.pl
================================================
#!/usr/bin/perl -w
use strict;
use Getopt::Std;

# bam2CLM.pl
#
# a. Assigns contigs to groups, using the group*ordering files found in the
#    directory given with -d plus unanchor.signal.txt in the current
#    directory, and writes one group<N>.ids file per group.
# b. Reads the read pairs of the BAM file whose mates map to two different
#    contigs of the same group and writes all.clm and one group<N>.clm per
#    group. For each contig pair and each of the four orientation
#    combinations a line lists the number of links and the link distances.
#
# "_pilon" is removed from contig names everywhere before they are compared.
our ($opt_b, $opt_r, $opt_d);
getopts "b:r:d:";
if ((!defined $opt_b)|| (!defined $opt_r) || (!defined $opt_d) ) {
    die "************************************************************************
Usage: perl $0 -b mapping.bam -r refSeq.fasta -d main_results/
-h : help and usage.
-b : mapping.bam
-r : reference genome, fasta format
-d : LACHESIS main_results/
************************************************************************\n";
}

# Only contig lengths are needed, so the sequences themselves are not kept.
my %ctg_len;
my $ctg;
open(my $ref_fh, '<', $opt_r) or die "Cannot read reference $opt_r: $!\n";
while(my $line = <$ref_fh>){
    chomp $line;
    if($line =~ />/){
        $ctg = $line;
        $ctg =~ s/>//g;
        $ctg =~ s/\s+.*//g;
        $ctg =~ s/_pilon//g;
    }elsif(defined $ctg){
        $line =~ s/\s+//g;
        $ctg_len{$ctg} += length $line;
    }
}
close $ref_fh;

######GET GROUP IDS######
print "a. Getting group ids ...\n";
print "Reading anchored contigs ...\n";
my %anchordb;   # contig => { gid => group id, stat => "An" | "Un" }
my %gidb;       # group id => { contig => "An" | "Un" }
foreach my $file (glob "$opt_d/group*ordering"){
    # `my $gid = $1 if (...)` has unspecified behaviour when the match fails,
    # so the match is tested first.
    next unless($file =~ /group(\d+).ordering/);
    my $gid = $1;
    open(my $in, '<', $file) or die "Cannot read $file: $!\n";
    while(my $line = <$in>){
        chomp $line;
        next if($line =~ /#/);
        my $actg = (split/\s+/,$line)[1];
        next if(!defined $actg);
        $actg =~ s/_pilon//g;
        $anchordb{$actg}->{'gid'}  = $gid;
        $anchordb{$actg}->{'stat'} = "An";
        $gidb{$gid}->{$actg} = "An";
    }
    close $in;
}

my $ufile = "unanchor.signal.txt";
if(!(-e $ufile)){
    # Create an empty file (what `touch` did) so the read below succeeds.
    open(my $touch, '>>', $ufile) or die "Cannot create $ufile: $!\n";
    close $touch;
}
my $num_of_group = keys %gidb;
print "Number of groups: $num_of_group\n";
print "Reading unanchored contigs ...\n";
open(my $un_fh, '<', $ufile) or die "Cannot read $ufile: $!\n";
my $header = <$un_fh>;                 # first line is skipped
my $gid_col = $num_of_group + 1;       # column holding the assigned group
while(my $line = <$un_fh>){
    chomp $line;
    my ($uctg,$gid) = (split/\s+/,$line)[0,$gid_col];
    next if(!defined $uctg or !defined $gid);   # short or blank row
    $uctg =~ s/_pilon//g;
    $gid  =~ s/group//g;
    $anchordb{$uctg}->{'gid'}  = $gid;
    $anchordb{$uctg}->{'stat'} = "Un";
    $gidb{$gid}->{$uctg} = "Un";
}
close $un_fh;

print "Output group ids ...\n";
foreach my $gid (sort keys %gidb){
    my $outid = "group".$gid.".ids";
    open(my $out, '>', $outid) or die "Cannot write $outid: $!\n";
    foreach my $member (keys %{$gidb{$gid}}){
        # A contig missing from the reference is reported with length 0.
        my $len = exists $ctg_len{$member} ? $ctg_len{$member} : 0;
        print $out "$member $len\n" if($gidb{$gid}->{$member} eq "An");
        print $out "$member $len recover\n" if($gidb{$gid}->{$member} eq "Un");
    }
    close $out or die "Cannot finish writing $outid: $!\n";
}

print "b. Getting CLM files ...\n";
print "Reading and filtering $opt_b file ...\n";
my %tmprdb = ();  ###store reads name
my %infordb;      ###store contig pairs with directions: e.g. A+B+,A+B-,A-B+,A-B-
my $count = 0;    ###used for sorting
# List-form pipe open: the BAM name never passes through a shell. The former
# awk filter (mate reference neither "*" nor "=") is applied in the loop.
open(my $bam_fh, '-|', 'samtools', 'view', $opt_b) or die "Cannot run samtools view on $opt_b: $!\n";
while(my $rec = <$bam_fh>){
    chomp $rec;
    $rec =~ s/_pilon//g;
    my @data = split(/\s+/,$rec);
    next if(@data < 8);
    next if($data[6] eq "*" or $data[6] eq "=");
    # Only the first record seen for a read name is used.
    next if(exists($tmprdb{$data[0]}));
    $tmprdb{$data[0]}++;
    # Contig A is the alphabetically smaller name; positions follow their contig.
    my ($ctgA,$RAP,$ctgB,$RBP) = ($data[2] le $data[6])
        ? ($data[2],$data[3],$data[6],$data[7])
        : ($data[6],$data[7],$data[2],$data[3]);
    ###determine gid for the contig pairs
    next if(!exists($anchordb{$ctgA}) or !exists($anchordb{$ctgA}->{'gid'}));
    next if(!exists($anchordb{$ctgB}) or !exists($anchordb{$ctgB}->{'gid'}));
    my $ctgAgid = $anchordb{$ctgA}->{'gid'};
    my $ctgBgid = $anchordb{$ctgB}->{'gid'};
    next if($ctgAgid ne $ctgBgid);
    my $ctgAL = exists $ctg_len{$ctgA} ? $ctg_len{$ctgA} : 0;
    my $ctgBL = exists $ctg_len{$ctgB} ? $ctg_len{$ctgB} : 0;
    # Distance of each read from the start (1) and from the end (2) of its contig.
    my $A1 = $RAP;
    my $A2 = $ctgAL - $RAP;
    my $B1 = $RBP;
    my $B2 = $ctgBL - $RBP;
    ###calculate distance for contig pairs
    my %distance_of = (
        $ctgA."+ ".$ctgB."+" => $A2 + $B1,
        $ctgA."+ ".$ctgB."-" => $A2 + $B2,
        $ctgA."- ".$ctgB."+" => $A1 + $B1,
        $ctgA."- ".$ctgB."-" => $A1 + $B2,
    );
    # Fixed key order so the running counter is assigned as before.
    foreach my $pair ($ctgA."+ ".$ctgB."+", $ctgA."+ ".$ctgB."-",
                      $ctgA."- ".$ctgB."+", $ctgA."- ".$ctgB."-"){
        $infordb{$pair}->{'d'} .= $distance_of{$pair}." ";  #d means distance
        $infordb{$pair}->{'g'}  = $ctgAgid;                 #g means group id
        $infordb{$pair}->{'c'}  = $count++;                 #c means count
    }
}
close $bam_fh;

###Get CLM FILES####
print "Output CLM files ...\n";
# Sort once and bucket the finished lines per group, instead of re-sorting
# every pair for every group.
my %lines_of_group;
open(my $all_clm, '>', "all.clm") or die "Cannot write all.clm: $!\n";
foreach my $key (sort {$infordb{$a}->{'c'}<=>$infordb{$b}->{'c'}} keys %infordb){
    my @t = split(/\s+/,$infordb{$key}->{'d'});
    my $num_of_link = @t;
    my $clm_line = "$key $num_of_link $infordb{$key}->{'d'}\n";
    print $all_clm $clm_line;
    push @{$lines_of_group{$infordb{$key}->{'g'}}}, $clm_line;
}
close $all_clm or die "Cannot finish writing all.clm: $!\n";

foreach my $gid (keys %gidb){
    my $outfile = "group".$gid.".clm";
    # A group without any link still gets an (empty) file.
    open(my $out, '>', $outfile) or die "Cannot write $outfile: $!\n";
    print $out @{$lines_of_group{$gid}} if(exists $lines_of_group{$gid});
    close $out or die "Cannot finish writing $outfile: $!\n";
}
================================================
FILE: scripts/bam2CLM_simple.pl
================================================
#!/usr/bin/perl -w
use strict;
# bam2CLM_simple.pl
#
# For every pair of contigs linked by a read pair in the BAM file, and for
# each of the four orientation combinations, write to all.clm: the number of
# links, the average link distance, a signal density (links per 1000 bp of
# combined contig length) and the list of distances.
#
# "_pilon" is removed from contig names before they are compared.
die "Usage: perl $0 mapping.bam refSeq.fasta\n" if((!defined $ARGV[0]) or (!defined $ARGV[1]));

# Only contig lengths are needed, so the sequences themselves are not kept.
my %ctg_len;
my $ctg;
open(my $ref_fh, '<', $ARGV[1]) or die "Cannot read reference $ARGV[1]: $!\n";
while(my $line = <$ref_fh>){
    chomp $line;
    if($line =~ />/){
        $ctg = $line;
        $ctg =~ s/>//g;
        $ctg =~ s/\s+.*//g;
        $ctg =~ s/_pilon//g;
    }elsif(defined $ctg){
        $line =~ s/\s+//g;
        $ctg_len{$ctg} += length $line;
    }
}
close $ref_fh;

print "Reading and filtering $ARGV[0] file ...\n";
my %tmprdb = ();  ###store reads name
my %infordb;      ###store contig pairs with directions: e.g. A+B+,A+B-,A-B+,A-B-
my $count = 0;    ###used for sorting
# List-form pipe open: the BAM name never passes through a shell. The former
# awk filter (mate reference neither "*" nor "=") is applied in the loop.
open(my $bam_fh, '-|', 'samtools', 'view', $ARGV[0]) or die "Cannot run samtools view on $ARGV[0]: $!\n";
while(my $rec = <$bam_fh>){
    chomp $rec;
    $rec =~ s/_pilon//g;
    my @data = split(/\s+/,$rec);
    next if(@data < 8);
    next if($data[6] eq "*" or $data[6] eq "=");
    # Only the first record seen for a read name is used.
    next if(exists($tmprdb{$data[0]}));
    $tmprdb{$data[0]}++;
    # Contig A is the alphabetically smaller name; positions follow their contig.
    my ($ctgA,$RAP,$ctgB,$RBP) = ($data[2] le $data[6])
        ? ($data[2],$data[3],$data[6],$data[7])
        : ($data[6],$data[7],$data[2],$data[3]);
    my $ctgAL = exists $ctg_len{$ctgA} ? $ctg_len{$ctgA} : 0;
    my $ctgBL = exists $ctg_len{$ctgB} ? $ctg_len{$ctgB} : 0;
    # Distance of each read from the start (1) and from the end (2) of its contig.
    my $A1 = $RAP;
    my $A2 = $ctgAL - $RAP;
    my $B1 = $RBP;
    my $B2 = $ctgBL - $RBP;
    ###calculate distance for contig pairs
    my %distance_of = (
        $ctgA."+ ".$ctgB."+" => $A2 + $B1,
        $ctgA."+ ".$ctgB."-" => $A2 + $B2,
        $ctgA."- ".$ctgB."+" => $A1 + $B1,
        $ctgA."- ".$ctgB."-" => $A1 + $B2,
    );
    # Fixed key order so the running counter is assigned as before. The former
    # 'g' entries were filled from an undeclared, never-set variable and were
    # never read, so they are gone.
    foreach my $pair ($ctgA."+ ".$ctgB."+", $ctgA."+ ".$ctgB."-",
                      $ctgA."- ".$ctgB."+", $ctgA."- ".$ctgB."-"){
        $infordb{$pair}->{'d'} .= $distance_of{$pair}." ";  #d means distance
        $infordb{$pair}->{'c'}  = $count++;                 #c means count
    }
}
close $bam_fh;

###Get CLM FILES####
print "Output CLM files ...\n";
open(my $clm, '>', "all.clm") or die "Cannot write all.clm: $!\n";
print $clm "groupA groupB num_of_link Average_distance signalDensity distance_list\n";
foreach my $key (sort {$infordb{$a}->{'c'}<=>$infordb{$b}->{'c'}} keys %infordb){
    my @t = split(/\s+/,$infordb{$key}->{'d'});
    my $num_of_link = @t;
    my $sum = 0;
    $sum += $_ for @t;
    my $ave = sprintf("%.2f",$sum/$num_of_link);
    my ($g1,$g2) = split(/\s+/,$key);
    # Remove only the trailing orientation sign. The former s/[+|-]//g also
    # deleted '+', '-' and '|' inside the contig name, so the length lookup
    # failed for such names.
    $g1 =~ s/[+-]$//;
    $g2 =~ s/[+-]$//;
    my $l1  = exists $ctg_len{$g1} ? $ctg_len{$g1} : 0;
    my $l2  = exists $ctg_len{$g2} ? $ctg_len{$g2} : 0;
    my $len = $l1 + $l2;
    # Neither contig present in the reference: report density 0 instead of
    # dying with "Illegal division by zero".
    my $signalD = ($len > 0) ? $num_of_link/$len * 1000 : 0;
    print $clm "$key $num_of_link $ave $signalD $infordb{$key}->{'d'}\n";
}
close $clm or die "Cannot finish writing all.clm: $!\n";
================================================
FILE: scripts/bam2net.pl
================================================
#!/usr/bin/perl -w
use strict;
use Getopt::Std;

# bam2net.pl
#
# Counts, for every pair of contigs, the BAM records whose read and mate map
# to those two contigs, and writes one line per pair with both contig sizes
# and the count normalised per 100 kb of combined contig length.
our ($opt_c, $opt_b, $opt_o);
getopts "c:b:o:";
if ((!defined $opt_c)|| (!defined $opt_b)||(!defined $opt_o) ) {
    die "************************************************************************
Usage: bam2net.pl -c draft.asm.fasta -b file.bam -o out.net
-h : help and usage.
-c : draft.asm.fasta
-b : mapping.bam
-o : output
************************************************************************\n";
}
my $bam    = $opt_b;
my $refSeq = $opt_c;

# Contig name => length. Only lengths are needed, so sequences are not kept.
my %ref_len;
my $name;
open(my $ref_fh, '<', $refSeq) or die "Cannot read $refSeq: $!\n";
while(my $line = <$ref_fh>){
    chomp $line;
    if($line =~ />/){
        $name = $line;
        $name =~ s/>//g;
        # Drop the description after the first whitespace: BAM reference
        # names carry only the first word of the FASTA header, so keeping
        # the description made every length lookup fail.
        $name =~ s/\s+.*//;
    }elsif(defined $name){
        $line =~ s/\s+//g;
        $ref_len{$name} += length $line;
    }
}
close $ref_fh;

my %infordb;   # contig1 => { contig2 => number of records }, contig1 le contig2
# List-form pipe open: the BAM name never passes through a shell.
open(my $bam_fh, '-|', 'samtools', 'view', $bam) or die "Cannot run samtools view on $bam: $!\n";
while(my $rec = <$bam_fh>){
    chomp $rec;
    my @data = split(/\s+/,$rec);
    next if(@data < 7);
    next if($data[6] eq "=");
    next if($data[6] eq "*");
    my ($ctg1,$ctg2) = sort ($data[2],$data[6]);
    $infordb{$ctg1}->{$ctg2}++;
}
close $bam_fh;

open(my $out, '>', $opt_o) or die "Cannot write $opt_o: $!\n";
print $out "ctg1 ctg1_size ctg2 ctg2_size signalDensity\n";
foreach my $ctg1 (keys %infordb){
    my $len1 = exists $ref_len{$ctg1} ? $ref_len{$ctg1} : 0;
    foreach my $ctg2 (keys %{$infordb{$ctg1}}){
        my $len2  = exists $ref_len{$ctg2} ? $ref_len{$ctg2} : 0;
        my $normL = ($len1 + $len2)/100000;
        if($normL == 0){
            # Neither contig is in the FASTA: no density can be computed.
            warn "Skipping $ctg1 / $ctg2: neither contig found in $refSeq\n";
            next;
        }
        my $sigD = sprintf("%.2f",$infordb{$ctg1}->{$ctg2}/$normL);
        print $out "$ctg1 $len1 $ctg2 $len2 $sigD\n";
    }
}
close $out or die "Cannot finish writing $opt_o: $!\n";
================================================
FILE: scripts/bam_HiCplotter.py
================================================
#!/usr/bin/env python
import os
import sys
import gc
from math import log
import time
# Collect, per read id, where the read and its mate are located
def get_read_pos_with_sam_bam_file(sam_bam_file):
    """Map each read id to [contig, position, mate contig, mate position].

    A file whose name ends in "bam" is read through ``samtools view``; any
    other file is opened as text. Blank lines, lines starting with '@' and
    records whose contig or mate contig is '*' are ignored. '_pilon' is
    removed from contig names, and a mate contig of '=' is replaced by the
    read's own contig. A later record with the same read id replaces the
    earlier one.
    """
    if sam_bam_file.endswith("bam"):
        source = os.popen("samtools view "+sam_bam_file, 'r')
    else:
        source = open(sam_bam_file, 'r')

    positions = {}
    for record in source:
        fields = record.strip().split()
        # Nothing but whitespace, or a header line.
        if not fields or record[0] == '@':
            continue
        if fields[2] == '*' or fields[6] == '*':
            continue
        contig = fields[2].replace('_pilon', '')
        contig_pos = int(fields[3])
        mate_contig = contig if fields[6] == '=' else fields[6].replace('_pilon', '')
        mate_pos = int(fields[7])
        positions[fields[0]] = [contig, contig_pos, mate_contig, mate_pos]
    source.close()
    return positions
# Load chromosome names and lengths
def get_chr_len(chr_list):
    """Read a whitespace-separated file of ``<name> <length>`` rows.

    Returns ``(chr_len_db, chr_order)``: a dict from name to integer length
    and the list of names in file order. Blank lines are ignored.
    """
    lengths = {}
    names_in_order = []
    with open(chr_list, 'r') as handle:
        for fields in (row.split() for row in handle):
            if not fields:
                continue
            name = fields[0]
            names_in_order.append(name)
            lengths[name] = int(fields[1])
    return lengths, names_in_order
# Calc read counts on each bin
def calc_read_count_per_bin(chr_len_db, chr_order, read_on_chr, bin_size):
    """Build log2 read-count matrices per chromosome and for the whole genome.

    chr_len_db  -- dict: chromosome name -> length
    chr_order   -- list of chromosome names; fixes the chromosome order in
                   the whole-genome matrix
    read_on_chr -- dict: read id -> [chr1, pos1, chr2, pos2]
    bin_size    -- string such as '500k' or '1M'; K/M/G (any case) are
                   expanded to 000 / 000000 / 000000000

    Returns (read_count_per_chr, read_count_whole_genome). The first maps
    each chromosome to a square matrix (list of lists), the second is one
    square matrix covering all chromosomes. Each cell holds log2(count), or
    -inf where the count is zero. Reads touching a chromosome that is not in
    chr_len_db are ignored.
    """
    long_bin_size = bin_size.upper()
    long_bin_size = long_bin_size.replace('K', '000')
    long_bin_size = long_bin_size.replace('M', '000000')
    long_bin_size = long_bin_size.replace('G', '000000000')
    long_bin_size = int(long_bin_size)

    # First index of every name, so reads do not trigger repeated list scans.
    first_index = {}
    for idx, name in enumerate(chr_order):
        first_index.setdefault(name, idx)

    read_count_per_chr = {}
    # bin_count[i+1] is the number of bins of the chromosome at position i.
    bin_count = [0 for _ in range(0, len(chr_order)+1)]
    total_bin_count = 0
    for chrn in chr_len_db:
        # floor(length / bin) + 1 bins, so the largest valid position
        # (== length) always has a bin. The previous int(round(x + 0.5)) gave
        # one bin too few under Python 3 whenever the length was an exact
        # multiple of the bin size, because round() rounds halves to even.
        bin_count_of_chr = chr_len_db[chrn] // long_bin_size + 1
        total_bin_count += bin_count_of_chr
        bin_count[first_index[chrn]+1] = bin_count_of_chr
        read_count_per_chr[chrn] = [[0 for _ in range(0, bin_count_of_chr)] for _ in range(0, bin_count_of_chr)]

    # bin_offset[i] is the number of bins before the chromosome at position i.
    bin_offset = []
    running_total = 0
    for count in bin_count:
        running_total += count
        bin_offset.append(running_total)

    read_count_whole_genome = [[0 for _ in range(0, total_bin_count)] for _ in range(0, total_bin_count)]

    for read in read_on_chr:
        chr1, pos1, chr2, pos2 = read_on_chr[read]
        if chr1 not in chr_len_db or chr2 not in chr_len_db:
            continue
        pos1_index = pos1 // long_bin_size
        pos2_index = pos2 // long_bin_size
        # Both matrices are kept symmetric by counting each pair in both cells.
        if chr1 == chr2 and chr1 in read_count_per_chr:
            read_count_per_chr[chr1][pos1_index][pos2_index] += 1
            read_count_per_chr[chr1][pos2_index][pos1_index] += 1
        whole_pos1 = bin_offset[first_index[chr1]] + pos1_index
        whole_pos2 = bin_offset[first_index[chr2]] + pos2_index
        read_count_whole_genome[whole_pos1][whole_pos2] += 1
        read_count_whole_genome[whole_pos2][whole_pos1] += 1

    # Replace raw counts by log2(count); empty cells become -inf.
    for chrn in read_count_per_chr:
        for row in read_count_per_chr[chrn]:
            for j in range(0, len(row)):
                row[j] = log(row[j], 2) if row[j] != 0 else -float('inf')
    for row in read_count_whole_genome:
        for j in range(0, len(row)):
            row[j] = log(row[j], 2) if row[j] != 0 else -float('inf')
    return read_count_per_chr, read_count_whole_genome
# Draw heatmap of allhic result with matplotlib
def draw_heatmap(data, chrn, bin_size, ext):
    """Render one matrix as a heatmap image in the current directory.

    data     -- square matrix (list of lists) of values to plot
    chrn     -- chromosome name, or 'all' for the whole-genome matrix
    bin_size -- bin size string as given by the user; used in the title,
                the axis label and the file name
    ext      -- image format / file extension (passed to savefig)

    The file is named '<bin>_<chrn>.<ext>', or '<bin>_Whole_genome.<ext>'
    when chrn is 'all'.
    """
    import copy
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt

    short_bin_size = bin_size.upper()
    short_bin_size = short_bin_size.replace('000000000', 'G')
    short_bin_size = short_bin_size.replace('000000', 'M')
    short_bin_size = short_bin_size.replace('000', 'K')

    ax = plt.gca()

    if chrn != 'all':
        file_prefix = short_bin_size + "_" + chrn
    else:
        file_prefix = short_bin_size + '_Whole_genome'

    print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+' Draw '+file_prefix)

    # mpl.cm.YlOrRd
    # Work on a copy so the colormap registered in matplotlib is not modified.
    cmap = copy.copy(plt.get_cmap('YlOrRd'))
    cmap.set_over('black')

    if chrn != 'all':
        hmap = ax.imshow(data, interpolation='nearest', origin='lower', cmap=cmap, aspect='auto')
    else:
        hmap = ax.imshow(data, interpolation='nearest', cmap=cmap, aspect='auto')

    plt.colorbar(mappable=hmap,cax=None, ax=None, shrink=0.5)
    plt.tick_params(labelsize=6)
    for ticks in ax.get_xticklabels():
        ticks.set_rotation(90)
    for ticks in ax.get_yticklabels():
        ticks.set_rotation(0)

    if chrn != 'all':
        title = chrn+'_'+short_bin_size
    else:
        title = 'Whole_genome_'+short_bin_size

    plt.xlabel("Bins ("+short_bin_size.lower()+"b per bin)", fontsize=8)
    if chrn == 'all':
        plt.xticks([])
        plt.yticks([])
        plt.title(title, y=1.01, fontsize=12)
    else:
        plt.title(title, y=1.1, fontsize=12)

    # savefig's keyword for the output format is `format`; `filetype` is not
    # a savefig parameter.
    plt.savefig(file_prefix+'.'+ext, format=ext, bbox_inches='tight', dpi=200)
    plt.close('all')
# Command-line entry point: read the alignments and the chromosome list once,
# then compute and draw the heatmaps for every requested bin size.
if __name__ == "__main__":
    if len(sys.argv) < 5:
        print("Notice: This script is using for drawing heatmap of the all-hic result")
        print("Usage: python "+sys.argv[0]+" <sam/bam file> <chr_list> <bin_size> <ext>")
        print("\t<sam/bam_file> is the sam or bam file filtered by allhic")
        # The second argument is the file parsed by get_chr_len(); the help
        # text used to describe a <chr_prefix> argument that does not exist.
        print("\t<chr_list> is a file with one chromosome per line: name and length")
        print("\t<bin_size> is the bin size of heatmap, it can be a list splited by comma")
        print("\t<ext> is the file type of picture")
    else:
        sam_bam_file = sys.argv[1]
        chr_list = sys.argv[2]
        bin_list = sys.argv[3]
        ext = sys.argv[4]

        print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Step 1: Get read position based on chromosome")
        read_on_chr = get_read_pos_with_sam_bam_file(sam_bam_file)

        print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Step 2: Get chromosome length")
        chr_len_db, chr_order = get_chr_len(chr_list)

        print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Step 3: Calculating and Drawing heatmap")

        bin_size_list = bin_list.split(',')
        for bin_size in bin_size_list:
            print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Calculating")
            read_count_per_chr, read_count_whole_genome = calc_read_count_per_bin(chr_len_db, chr_order, read_on_chr, bin_size)

            print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Drawing heatmap")
            print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Drawing with bin size "+str(bin_size))
            for chrn in read_count_per_chr:
                draw_heatmap(read_count_per_chr[chrn], chrn, bin_size, ext)
            draw_heatmap(read_count_whole_genome, 'all', bin_size, ext)
            # Release the matrices of this bin size before computing the next.
            del read_count_per_chr, read_count_whole_genome
            gc.collect()

        del read_on_chr
        gc.collect()
        print(time.strftime('[%H:%M:%S]',time.localtime(time.time()))+" Success")
================================================
FILE: scripts/blastn_parse.pl
================================================
#!/usr/bin/perl -w
use strict;
###This script was used to parse blast+ result (outfmt 6)
###you can get best hit with parameter -b 1
###or -b 0 to get more results
###The default coverage and identity are 60%, respectively
###
###A hit is written to the output when its identity (column 3 / 100) and its
###query coverage (aligned query span / query length) both reach the
###thresholds. With -b 1 only the first hit listed for a query is examined.
use Getopt::Std;
our ($opt_i, $opt_o, $opt_b, $opt_c, $opt_d, $opt_q);
getopts "i:o:b:c:d:q:";
if ((!defined $opt_i)|| (!defined $opt_o) || (!defined $opt_q)) {
    die "************************************************************************
Usage: perl $0 -i input -o output -q query.fasta -b 0||1
-h : help and usage.
-q : query file, fasta format
-i : input file is the result of blast+
-b : (optioanl, default 1)1 means only output best hit; 0 means get more results
-d : identity (optional, default is 0.6)
-c : coverage (optional, defalut is 0.6)
-o : output
************************************************************************\n";
}
my $input         = $opt_i;
my $output        = $opt_o;
my $BestHit_model = (defined $opt_b) ? $opt_b : 1;
my $coverage      = (defined $opt_c) ? $opt_c : 0.6;
my $identity      = (defined $opt_d) ? $opt_d : 0.6;

# Query name => sequence length. Line ends and other whitespace are removed
# before counting; previously every newline of the FASTA record was counted
# as a base, which inflated the length and lowered the computed coverage.
my %query_len;
my $gene;
open(my $query_fh, '<', $opt_q) or die "No query file: $opt_q\n";
while(my $line = <$query_fh>){
    if($line =~ />/){
        $gene = $line;
        $gene =~ s/>//g;
        $gene =~ s/\s+.*//g;
        $gene =~ s/\s+//g;
    }elsif(defined $gene){
        $line =~ s/\s+//g;
        $query_len{$gene} += length $line;
    }
}
close $query_fh;

my %countdb;   # query => number of hits seen so far
open(my $out, '>', $output) or die "No output file: $output\n";
open(my $in, '<', $input) or die "No input file: $input\n";
while(my $hit = <$in>){
    chomp $hit;
    my @data = split(/\s+/,$hit);
    next if(@data < 8);        # blank or truncated row
    my $query = $data[0];
    $countdb{$query} += 1;
    next if($countdb{$query}>1 and $BestHit_model==1);
    my $q_len = $query_len{$query};
    if(!$q_len){
        # Coverage cannot be computed without the query length.
        warn "Query $query not found in $opt_q, hit skipped\n";
        next;
    }
    my $blst_i = $data[2]/100;
    # Columns 7 and 8 are the inclusive, 1-based query start and end, so the
    # aligned span is end - start + 1.
    my $blst_c = ($data[7]-$data[6]+1)/$q_len;
    if($blst_i>=$identity and $blst_c>=$coverage){
        print $out "$hit\n";
    }
}
close $in;
close $out or die "Cannot finish writing $output: $!\n";
================================================
FILE: scripts/classify.pl
================================================
#!/usr/bin/perl -w
use strict;
use Getopt::Std;

# classify.pl
#
# 1. From the blast table, keep for every reference gene (column 2) at most
#    -p target genes (column 1), taken in decreasing order of column 12.
# 2. Write Allele.gene.table: one line per reference gene, in reference
#    chromosome/position order, listing "targetGene,contig" pairs.
# 3. Write remove.log and Allele.ctg.table: a line is dropped when all of its
#    contigs already occur in an earlier, kept line of the same chromosome
#    (see compare()); the remaining lines are written with contigs only.
our ($opt_i, $opt_p, $opt_r, $opt_g);
getopts "i:p:r:g:";
if ((!defined $opt_i)|| (!defined $opt_p) || (!defined $opt_r)|| (!defined $opt_g)) {
    die "************************************************************************
Usage: perl $0 -i blast.out -p polyploid -r ref.gff3 -g target.gff3
-h : help and usage.
-i : blast.out
-p : number of alleles
-r : reference.gff3, annotation from close relative species
-g : target.gff3, annotation from target species
************************************************************************\n";
}
### Parameter reading
my $blast = $opt_i;
my $polyn = $opt_p;
my $rGFF  = $opt_r;
my $tGFF  = $opt_g;
my $geneTable = "Allele.gene.table";
my $ctgTable  = "Allele.ctg.table";

my %infordb;    # reference gene => { rank => target gene }
my $count = 0;  # rank of the current hit within its reference gene
# Sorted by reference gene, then by column 12 descending, so the hits of one
# reference gene are consecutive and best first.
open(my $blast_fh, "sort -k2,2 -k12,12nr $blast|") or die "Cannot run sort on $blast: $!\n";
while(my $line = <$blast_fh>){
    chomp $line;
    my @data  = split(/\s+/,$line);
    next if(@data < 2);
    my $tgene = $data[0];
    my $rgene = $data[1];
    if(!exists($infordb{$rgene})){
        $count = 1;
        $infordb{$rgene}->{$count} = $tgene;
    }else{
        $count++;
        next if($count>$polyn);
        $infordb{$rgene}->{$count} = $tgene;
    }
}
close $blast_fh;

my %tdb; ### store target genome gff information, e.g het rice
open(my $tgff_fh, "awk '\$3==\"gene\"' $tGFF | ") or die "Cannot run awk on $tGFF: $!\n";
while(my $line = <$tgff_fh>){
    chomp $line;
    my @data = split(/\s+/,$line);
    # `my $x = $1 if (...)` has unspecified behaviour when the match fails,
    # so rows without a Name attribute are skipped explicitly.
    next unless($line =~ /Name=(\S+)/);
    my $tgene = $1;
    $tgene =~ s/;.*//g;
    $tdb{$tgene} = $data[0];
}
close $tgff_fh;

open(my $gene_out, '>', $geneTable) or die "Cannot write $geneTable: $!\n";
open(my $rgff_fh, "awk '\$3==\"gene\"' $rGFF |sort -k1,1 -k4,4n |") or die "Cannot run awk/sort on $rGFF: $!\n";
while(my $line = <$rgff_fh>){
    chomp $line;
    my @data = split(/\s+/,$line);
    next unless($line =~ /Name=(\S+)/);
    my $rgene = $1;
    $rgene =~ s/;.*//g;
    next if(!exists($infordb{$rgene}));
    print $gene_out "$rgene $data[0] $data[3] ";
    foreach my $i (sort {$a<=>$b} keys %{$infordb{$rgene}}){
        my $tgene = $infordb{$rgene}->{$i};
        # A target gene missing from the target GFF has no contig.
        my $tctg  = exists $tdb{$tgene} ? $tdb{$tgene} : "";
        print $gene_out "$tgene,$tctg "; ###print out target gene order and contig name
    }
    print $gene_out "\n";
}
close $rgff_fh;
close $gene_out or die "Cannot finish writing $geneTable: $!\n";

my %alleledb;  # line number => { ctg => "c1 c2 ", chrn => ..., posi => ... }
my $ln = 0;    ###store line number
open(my $table_in, '<', $geneTable) or die "Cannot read $geneTable: $!\n";
while(my $line = <$table_in>){
    chomp $line;
    $ln++;
    my @data  = split(/\s+/,$line);
    my %tmpdb = ();
    foreach my $i (3..$#data){
        my $ctg = (split/,/,$data[$i])[1];
        next if(!defined $ctg);      # "gene," entry: no contig known
        $tmpdb{$ctg}++;
    }
    $alleledb{$ln}->{'ctg'} = "";
    $alleledb{$ln}->{'ctg'} .= $_." " for keys %tmpdb;
    $alleledb{$ln}->{'chrn'} = $data[1];
    $alleledb{$ln}->{'posi'} = $data[2];
}
close $table_in;

open(my $log_out, '>', "remove.log") or die "Cannot write remove.log: $!\n";
my %removedb = ();
for(my $i=2;$i<=$ln;$i++){
    my $chrI = $alleledb{$i}->{'chrn'};
    my $ctgI = $alleledb{$i}->{'ctg'};
    # Compare line i with every earlier line that has not been removed.
    for(my $j=1;$j<$i;$j++){
        next if(exists($removedb{$j}));
        my $chrJ = $alleledb{$j}->{'chrn'};
        next if($chrI ne $chrJ);
        my $ctgJ = $alleledb{$j}->{'ctg'};
        my $flag = compare($ctgI,$ctgJ);
        print $log_out "$i $chrI $ctgI $j $chrJ $ctgJ $flag\n" if($flag==1);
        ### flag=1, remove
        $removedb{$i}++ if($flag==1);
    }
}
close $log_out or die "Cannot finish writing remove.log: $!\n";

open(my $ctg_out, '>', $ctgTable) or die "Cannot write $ctgTable: $!\n";
$ln = 0;
open(my $table_again, '<', $geneTable) or die "Cannot read $geneTable: $!\n";
while(my $line = <$table_again>){
    chomp $line;
    $ln++;
    next if(exists($removedb{$ln}));
    my @data = split(/\s+/,$line);
    print $ctg_out "$data[1] $data[2] ";
    foreach my $i (3..$#data){
        my $ctg = (split/,/,$data[$i])[1];
        next if(!defined $ctg);
        print $ctg_out "$ctg ";
    }
    print $ctg_out "\n";
}
close $table_again;
close $ctg_out or die "Cannot finish writing $ctgTable: $!\n";
# compare($ctgT, $ctgR)
# Both arguments are whitespace-separated lists of contig names.
# Returns 1 when the number of entries of $ctgR that also occur in $ctgT
# equals the number of entries of $ctgT, and 0 otherwise.
sub compare{
    my ($ctgT, $ctgR) = @_;
    my @wanted = split(/\s+/,$ctgT);
    my %is_wanted;
    $is_wanted{$_}++ for @wanted;
    # grep in scalar context counts the matching entries of the second list.
    my $shared = grep { exists($is_wanted{$_}) } split(/\s+/,$ctgR);
    return ($shared == scalar(@wanted)) ? 1 : 0;
}
================================================
FILE: scripts/filterBAM_forHiC.pl
================================================
#!/usr/bin/perl -w
use strict;
# filterBAM_forHiC.pl: copy to out.sam only the alignments of file.bam that
# pass all of the following:
#   mapping quality >= 30, tag XT:A:U present, no XA tag,
#   NM <= 5, XM <= 3, XO <= 2, XG <= 2 (all four tags must be present).
die "Usage: perl $0 file.bam out.sam\n" if(!defined($ARGV[0]) or !defined($ARGV[1]));

# Upper limit allowed for each numeric tag.
my %max_of = ( NM => 5, XM => 3, XO => 2, XG => 2 );

open(my $out, '>', $ARGV[1]) or die "Cannot write $ARGV[1]: $!\n";
# List-form pipe open: the BAM name never passes through a shell.
open(my $in, '-|', 'samtools', 'view', $ARGV[0]) or die "Cannot run samtools view on $ARGV[0]: $!\n";
RECORD:
while(my $rec = <$in>){
    chomp $rec;
    my $mapq = (split/\s+/,$rec)[4];
    next RECORD if(!defined $mapq or $mapq<30);
    next RECORD if(!($rec =~ /XT:A:U/));
    next RECORD if($rec =~ /XA:/);
    foreach my $tag (keys %max_of){
        # Capture the whole number. The former (\d) kept only the first
        # digit, so e.g. NM:i:12 was read as 1 and wrongly passed the filter.
        next RECORD unless($rec =~ /\b$tag:i:(\d+)/);
        next RECORD if($1 > $max_of{$tag});
    }
    print $out "$rec\n";
}
close $in;
close $out or die "Cannot finish writing $ARGV[1]: $!\n";
#Tag Meaning
#NM Edit distance
#MD Mismatching positions/bases
#AS Alignment score
#BC Barcode sequence
#X0 Number of best hits
#X1 Number of suboptimal hits found by BWA
#XN Number of ambiguous bases in the reference
#XM Number of mismatches in the alignment
#XO Number of gap opens
#XG Number of gap extensions
#XT Type: Unique/Repeat/N/Mate-sw
#XA Alternative hits; format: (chr,pos,CIGAR,NM;)*
#XS Suboptimal alignment score
#XF Support from forward/reverse alignment
#XE Number of supporting seeds
================================================
FILE: scripts/gmap2AlleleTable.pl
================================================
#!/usr/bin/perl -w
die "Usage: perl $0 ref.gff3\n" if(!defined ($ARGV[0]));
# Build Allele.ctg.table from a GMAP alignment (gmap.gff3 in the current
# directory) and a reference annotation (ref.gff3):
#   1. For every gmap.gff3 line containing "gene", remember which sequence
#      (column 1) carries the gene named by its Name= attribute.
#   2. For every reference line whose third column is "gene" and whose gene
#      was seen in step 1, print
#      "<ref seq> <ref start> <distinct gmap sequences...>".
my $refGFF = $ARGV[0];

# gene name -> space-separated list of sequences it was aligned to (may repeat)
my %infordb;
open(my $gmap, '-|', 'grep', 'gene', 'gmap.gff3') or die "Cannot run grep on gmap.gff3: $!\n";
while(my $line = <$gmap>){
    chomp $line;
    # Match first, then declare: "my \$x = ... if COND" is undefined behaviour
    # and can leak the previous line's gene into lines without a Name= attribute.
    next unless($line =~ /Name=([^;\n]*)/);
    my $gene = $1;
    my $seqid = (split(/\s+/,$line))[0];
    $infordb{$gene} .= $seqid." ";
}
close $gmap;

open(my $out, '>', 'Allele.ctg.table') or die "Cannot write Allele.ctg.table: $!\n";
# List form keeps the user-supplied path away from the shell.
open(my $ref, '-|', 'awk', '$3=="gene"', $refGFF) or die "Cannot run awk on $refGFF: $!\n";
while(my $line = <$ref>){
    chomp $line;
    next unless($line =~ /Name=(\S+)/);
    my $gene = $1;
    $gene =~ s/;.*//g;
    next if(!exists($infordb{$gene}));
    my @data = split(/\s+/,$line);
    # Collapse repeated sequences; output order follows hash order, as before.
    my %tmpdb = ();
    $tmpdb{$_}++ for split(/\s+/,$infordb{$gene});
    print $out "$data[0] $data[3] ";
    print $out "$_ " for keys %tmpdb;
    print $out "\n";
}
close $ref;
close $out or die "Cannot finish writing Allele.ctg.table: $!\n";
================================================
FILE: scripts/gmap2AlleleTableBED.pl
================================================
#!/usr/bin/perl -w
die "Usage: perl $0 ref.bed\n" if(!defined ($ARGV[0]));
# Build Allele.ctg.table from a GMAP alignment (gmap.gff3 in the current
# directory) and a reference BED file:
#   1. For every gmap.gff3 line containing "gene", remember which sequence
#      (column 1) carries the gene named by its Name= attribute.
#   2. For every BED row whose 4th column (text after the first ';' removed)
#      names a gene seen in step 1, print
#      "<column 1> <column 4> <distinct gmap sequences...>".
# The variable keeps its historical name although the input is a BED file.
my $refGFF = $ARGV[0];

# gene name -> space-separated list of sequences it was aligned to (may repeat)
my %infordb;
open(my $gmap, '-|', 'grep', 'gene', 'gmap.gff3') or die "Cannot run grep on gmap.gff3: $!\n";
while(my $line = <$gmap>){
    chomp $line;
    # Match first, then declare: "my \$x = ... if COND" is undefined behaviour
    # and can leak the previous line's gene into lines without a Name= attribute.
    next unless($line =~ /Name=([^;\s]+)/);
    my $gene = $1;
    my $seqid = (split(/\s+/,$line))[0];
    $infordb{$gene} .= $seqid." ";
}
close $gmap;

open(my $out, '>', 'Allele.ctg.table') or die "Cannot write Allele.ctg.table: $!\n";
open(my $bed, '<', $refGFF) or die "Cannot open $refGFF: $!\n";
while(my $line = <$bed>){
    chomp $line;
    my @data = split(/\s+/,$line);
    my $gene = $data[3];
    next if(!defined($gene));          # row with fewer than four columns
    $gene =~ s/;.*//g;
    next if(!exists($infordb{$gene}));
    # Collapse repeated sequences; output order follows hash order, as before.
    my %tmpdb = ();
    $tmpdb{$_}++ for split(/\s+/,$infordb{$gene});
    print $out "$data[0] $data[3] ";
    print $out "$_ " for keys %tmpdb;
    print $out "\n";
}
close $bed;
close $out or die "Cannot finish writing Allele.ctg.table: $!\n";
================================================
FILE: scripts/link_superscaffold.pl
================================================
#!/usr/bin/perl -w
# Pick the best non-allelic partner for each super-scaffold.
# Inputs : the table after __DATA__ (ID, target, allelic super-scaffolds) and
#          all.clm in the current directory (lines containing "tig" are ignored).
# Outputs: tmp.txt       - "idA idB nameA nameB value" for every retained pair
#          best_link.txt - links chosen greedily by descending value so that
#                          each ID appears in at most one link.
my %namedb;     # super-scaffold name -> numeric ID
my %removedb;   # "nameA nameB" (names in sort order) -> count; allelic pairs to ignore
while(<DATA>){
    chomp;
    my @data = split(/\s+/,$_);
    next if(@data < 2);
    my ($id,$name) = @data[0,1];
    $namedb{$name} = $id;
    foreach my $i (2..$#data){
        my ($sa,$sb) = sort ($name,$data[$i]);
        $removedb{$sa." ".$sb}++;
    }
}

my %infordb;    # "nameA nameB" (both orders) -> largest 5th-column value seen
open(IN, "grep -v 'tig' all.clm|") or die "Cannot read all.clm: $!\n";
<IN>;           # the first line of the filtered stream is discarded
while(<IN>){
    chomp;
    my @data = split(/\s+/,$_);
    my $scf1 = $data[0];
    my $scf2 = $data[1];
    $scf1 =~ s/[+|-]//g;
    $scf2 =~ s/[+|-]//g;
    # %removedb keys are stored in sort order, so the lookup must use the same
    # order; looking up "$scf1 $scf2" as written in the CLM missed allelic
    # pairs whose names happen to appear in the opposite order.
    my ($first,$second) = sort ($scf1,$scf2);
    next if(exists($removedb{$first." ".$second}));
    foreach my $key ($scf1." ".$scf2, $scf2." ".$scf1){
        if(!exists($infordb{$key}) or $data[4]>$infordb{$key}){
            $infordb{$key} = $data[4];
        }
    }
}
close IN;

open(OUT, "> tmp.txt") or die "Cannot write tmp.txt: $!\n";
foreach my $key (keys %infordb){
    my ($sa,$sb) = split(/\s+/,$key);
    my $ida = $namedb{$sa};
    my $idb = $namedb{$sb};
    print OUT "$ida $idb $sa $sb $infordb{$key}\n";
}
close OUT;

open(OUT, "> best_link.txt") or die "Cannot write best_link.txt: $!\n";
my %linkdb;     # "idA idB" -> chosen tmp.txt line
my %tmpdb;      # IDs that already take part in a chosen link
open(IN, "sort -k5,5nr -k1,1n tmp.txt|") or die "Cannot sort tmp.txt: $!\n";
while(<IN>){
    chomp;
    my @data = split(/\s+/,$_);
    my $ida = $data[0];
    my $idb = $data[1];
    # Greedy choice: a link is accepted only while both of its ends are free.
    next if(exists($tmpdb{$ida}) or exists($tmpdb{$idb}));
    $linkdb{$ida." ".$idb} = $_;
    $tmpdb{$ida}++;
    $tmpdb{$idb}++;
}
close IN;
foreach my $key (keys %linkdb){
    print OUT "$linkdb{$key}\n";
}
close OUT;
### Below are the information that listed allelic super-scaffolds for each target.
#Format:
#ID target allelic_superscaffold1 allelic_superscaffold2 ...
__DATA__
1 group1 group2 group4 group6 group8 group9 group11 group14 group15 group16
2 group2 group1 group3 group4 group5 group6 group7 group8 group9 group10 group11 group12 group13 group14 group15 group16
3 group3 group2 group5 group7 group9 group10 group11 group12 group13 group14
4 group4 group1 group3 group6 group8 group9 group11 group14 group15 group16
5 group5 group2 group3 group7 group9 group10 group11 group12 group13 group14
6 group6 group1 group2 group4 group8 group9 group11 group14 group15 group16
7 group7 group2 group3 group5 group9 group10 group11 group12 group13 group14
8 group8 group1 group3 group4 group6 group9 group11 group14 group15 group16
9 group9 group1 group3 group4 group5 group6 group7 group8 group2 group10 group11 group12 group13 group14 group15 group16
10 group10 group2 group3 group5 group7 group9 group11 group12 group13 group14
11 group11 group1 group3 group4 group5 group6 group7 group8 group9 group10 group2 group12 group13 group14 group15 group16
12 group12 group2 group3 group5 group7 group9 group10 group11 group13 group14
13 group13 group12 group2 group3 group5 group7 group9 group10 group11
14 group14 group12 group2 group3 group5 group7 group9 group10 group11 group16
15 group15 group1 group2 group4 group6 group8 group9 group14
16 group16 group1 group2 group4 group6 group8 group9 group11 group14 group15
================================================
FILE: scripts/make_bed_around_RE_site.pl
================================================
#!/usr/bin/perl -w
use strict;
# make_bed_around_restriction_site.pl: Make a BED file representing the regions around all occurrences of a restriction site.
#
# For syntax, run with no arguments.
#
# The output BED file is designed for use with bedtools intersect, as follows:
# bedtools intersect -abam [SRR.bam] -b [$BED_out] > [SRR.REduced.bam]
# samtools view -h [SRR.REduced.bam] > [SRR.REduced.sam]
# This restricts a SAM/BAM file to only include reads close to a restriction site, which is a good way to filter Hi-C data, according to Fig. 1b of this paper:
# http://www.nature.com/ng/journal/v43/n11/full/ng.947.html
# Also see PreprocessSAMs.pl, which uses the output file.
#
# Josh Burton
# April 2013
# Exactly three arguments are required; otherwise print the syntax and stop.
if ( scalar @ARGV != 3 ) {

    # Report syntax.
    print "\nmake_bed_around_RE_site.pl\n\n";
    print "Find all occurrences of a motif in a genome. Make a 'POS' file listing these occurrences, and also a BED file representing the regions around these occurrences.\n\n";
    print "SYNTAX:\tmake_bed_around_RE_site.pl <fasta> <motif> <range>\n";
    print "fasta:\tA fasta file representing a genome (reference or draft assembly.)\n";
    print "motif:\tA motif, typically a restriction site sequence (e.g., HindIII = AAGCTT, NcoI = CCATGG, Dpn1 = GATC).\n";
    print "range:\tA number representing how many bp around the sequence to include. Recommend 500 based on Yaffe & Tanay, Nat. Genetics 2011.\n\n";
    print "OUTPUT FILES:\n";
    print "<fasta>.near_<motif>.<range>.bed\n";
    # The POS file is written as "<fasta>.pos_of_<motif>.txt"; the help text used to
    # advertise a "near_pos_of" name that the script never creates.
    print "<fasta>.pos_of_<motif>.txt\n";
    print "\n";
    exit;
}
# Get command-line arguments.
my ( $FASTA_in, $motif_seq, $range ) = @ARGV;
my $verbose = 0;

# Convert the motif from a string into a regex. Each IUPAC ambiguity code is unrolled into a Perl character class.
# The motif is upper-cased first because the sequence is upper-cased before matching; a motif typed in
# lower case could otherwise never match.
my %iupac_class = (
    R => '[AG]',
    Y => '[CT]',
    S => '[CG]',
    W => '[AT]',
    K => '[GT]',
    M => '[AC]',
    B => '[CGT]',
    D => '[AGT]',
    H => '[ACT]',
    V => '[ACG]',
    N => '[ACGT]',
);
my $motif_regex = uc $motif_seq;
$motif_regex =~ s/([RYSWKMBDHVN])/$iupac_class{$1}/g;

# Derive the output filenames (the motif is used exactly as typed).
my $BED_out = "$FASTA_in.near_$motif_seq.$range.bed";
my $POS_out = "$FASTA_in.pos_of_$motif_seq.txt";

# Determine how many letters need to be carried over from each line in order to find instances of the sequence that bridge lines in the fasta.
my $N_prev_chars = length($motif_seq) - 1;

my $contig_name = '';
my $offset = 0;          # number of bases of the current contig read so far
my $prev_chars;          # tail of the previous sequence line(s)
my @motif_positions;     # 0-indexed motif positions found on the current contig
my $N_motifs_found = 0;

# Open the input fasta file and the two output files.
print localtime() . ": Reading file $FASTA_in...\n";
open IN,  '<', $FASTA_in or die "Can't find file `${FASTA_in}': $!\n";
open BED, '>', $BED_out  or die "Can't write file `${BED_out}': $!\n";
open POS, '>', $POS_out  or die "Can't write file `${POS_out}': $!\n";
# Read the fasta line by line. Header lines close the previous contig (its BED lines are written)
# and start a new one; sequence lines are searched for the motif.
while (<IN>) {
    my $line = $_;
    chomp $line;
    $line =~ s/\r\z//;    # tolerate CRLF files; a stray CR would otherwise be counted as a base

    # If this is a header line, we're done with this contig/chromosome (unless we just started), and start a new contig/chromosome.
    if ( $line =~ /^\>(\S+)/ ) {
        my $next_contig_name = $1;

        # The array @motif_positions contains all positions on the (now complete) old contig at which this motif appears.
        # Merge positions closer than 2*$range into one region and print one BED line per region.
        my ( $prev_start, $prev_end ) = (-1,-1);
        foreach my $pos ( @motif_positions ) {
            if ( $prev_end == -1 ) {
                $prev_start = $pos;
                $prev_end = $pos;
            }
            if ( $prev_end + 2*$range < $pos ) {
                $prev_start = $range if $prev_start < $range;    # keep the region start at 0 or above
                $prev_end = $offset - $range if $prev_end > $offset - $range; # prevent overflow past the end of the contig/chromosome
                print BED "$contig_name\t", $prev_start - $range, "\t", $prev_end + $range, "\n";
                $prev_start = $pos;
            }
            $prev_end = $pos;
        }

        # Print the final BED line for this contig/chromosome.
        if (@motif_positions) {
            $prev_start = $range if $prev_start < $range;
            $prev_end = $offset - $range if $prev_end > $offset - $range; # prevent overflow past the end of the contig/chromosome
            print BED "$contig_name\t", $prev_start - $range, "\t", $prev_end + $range, "\n";
        }

        # Get the new contig's name.
        $contig_name = $next_contig_name;
        print localtime() . ": $contig_name\n" if $verbose;
        print POS ">$contig_name\n";

        # Reset other contig-related variables.
        $offset = 0;
        $prev_chars = '';
        @motif_positions = ();
    }

    # Otherwise, read through this contig/chromosome.
    else {
        $prev_chars = '' unless defined $prev_chars;    # sequence text before any header line
        my $carried = length $prev_chars;

        # Search this line with the carried-over tail of the previous line(s) in front of it.
        # The whole string is upper-cased: the tail used to keep its original case, so a motif
        # spanning a line break in lower-case (soft-masked) sequence was missed.
        my $target_str = uc( $prev_chars . $line );

        while ( $target_str =~ /$motif_regex/g ) {

            # Every iteration in this loop represents a new match to the motif regex in the target string.
            my $motif_loc = $-[0];

            # Adjust the location so it properly describes the 0-indexed motif position in this contig.
            # Then add it to the list of contig positions at which the motif has been seen.
            $N_motifs_found++;
            my $true_motif_loc = $motif_loc + $offset - $carried;
            push @motif_positions, $true_motif_loc;
            print "$contig_name\t$offset\t$prev_chars\t->\t$motif_loc\n" if $verbose;
            print POS "$true_motif_loc\n";
        }

        # Carry the last $N_prev_chars characters seen so far into the next line. They are taken from
        # the combined string, so short or blank lines no longer lose the carry-over, and a 1-bp
        # motif (nothing to carry) no longer aborts the run. The tail is shorter than the motif,
        # so no match can be reported twice.
        my $keep = length($target_str) < $N_prev_chars ? length($target_str) : $N_prev_chars;
        $prev_chars = $keep > 0 ? substr( $target_str, -$keep ) : '';
        $offset += length $line;
    }
}
################# modified based on the pull request from @FlyPythons: https://github.com/shendurelab/LACHESIS/pull/45
# The loop above only writes BED lines when it meets the next header, so the
# last fasta record still has to be written out here.
my ( $run_start, $run_end ) = ( -1, -1 );
for my $site (@motif_positions) {
    # First site of the contig: open the first run.
    ( $run_start, $run_end ) = ( $site, $site ) if $run_end == -1;

    # A gap wider than 2*$range closes the current run and opens a new one.
    if ( $site > $run_end + 2 * $range ) {
        my $bed_start = $run_start < $range ? $range : $run_start;
        my $bed_end   = $run_end > $offset - $range ? $offset - $range : $run_end; # prevent overflow past the end of the contig/chromosome
        print BED "$contig_name\t", $bed_start - $range, "\t", $bed_end + $range, "\n";
        $run_start = $site;
    }
    $run_end = $site;
}

# The run that is still open becomes the final BED line for this contig/chromosome.
if ( scalar @motif_positions ) {
    my $bed_start = $run_start < $range ? $range : $run_start;
    my $bed_end   = $run_end > $offset - $range ? $offset - $range : $run_end; # prevent overflow past the end of the contig/chromosome
    print BED "$contig_name\t", $bed_start - $range, "\t", $bed_end + $range, "\n";
}

# Reset the contig-related variables.
( $offset, $prev_chars ) = ( 0, '' );
@motif_positions = ();
#################

close IN;
close BED;
close POS;

# Final report.
my $stamp = localtime();
print $stamp . ": Done! Found $N_motifs_found total instances of motif $motif_seq. Created files:\n";
print "$BED_out\n$POS_out\n";
================================================
FILE: scripts/mc_bam.pl
================================================
#!/usr/bin/perl -w
use Getopt::Std;
getopts "b:a:r:";
if ((!defined $opt_b)|| (!defined $opt_a) ||(!defined $opt_r)) {
die "************************************************************************
Usage: mc_bam.pl -b mapping.bam -r groups.asm.fasta -a agp
-h : help and usage.
This script is used for modification the coordinates
in bam based on agp file
-b : mapping.bam
-r : reference genome, fasta format
-a : agp file
************************************************************************\n";
}

# agp_translate(\%placements, $ctg, $pos)
#   Translate a position on a component (contig) into the AGP object it was
#   placed in. Returns (object_name, object_position), or an empty list when
#   no AGP line covers the position.
sub agp_translate{
    my ($placements, $ctg, $pos) = @_;
    my $pieces = $placements->{$ctg};
    return if(!defined($pieces));
    # Newest first: when AGP lines overlap, the later line wins.
    foreach my $piece (reverse @$pieces){
        my ($ta,$tb,$obj,$ga,$gb,$strand) = @$piece;
        next if($pos < $ta or $pos > $tb);
        my $shift = $pos - $ta;
        # "+" counts up from the object start, "-" counts down from the object end.
        return ($obj, ($strand eq "+") ? $ga + $shift : $gb - $shift);
    }
    return;
}

# component -> list of [comp_beg, comp_end, object, obj_beg, obj_end, orientation].
# One interval per AGP line replaces the former hash with one entry per base,
# whose size grew with the total length of the assembly.
my %placements = ();
open(my $agp, '<', $opt_a) or die "Cannot open AGP file $opt_a: $!\n";
while(my $line = <$agp>){
    chomp $line;
    my @data = split(/\s+/,$line);
    next if(@data < 9);
    next if($data[-1] eq "map");
    # Only component lines oriented "+" or "-" carry coordinates to translate.
    next unless($data[8] eq "+" or $data[8] eq "-");
    push @{$placements{$data[5]}}, [@data[6,7,0,1,2,8]];
}
close $agp;

my $outsam = "mc.sam";
my $outbam = "mc.bam";
open(my $out, '>', $outsam) or die "Cannot write $outsam: $!\n";
# List-form pipe open: the BAM path is passed to samtools without a shell.
open(my $sam, '-|', 'samtools', 'view', $opt_b) or die "Cannot run samtools view on $opt_b: $!\n";
while(my $record = <$sam>){
    chomp $record;
    # SAM is tab-delimited; splitting and joining on tabs keeps the record valid.
    my @data = split(/\t/,$record);
    next if(@data < 8);
    # Pairs whose mate lies on the same reference ("=") are not written.
    next if($data[6] eq "=");
    my ($gidA,$giA) = agp_translate(\%placements, $data[2], $data[3]);
    next if(!defined($gidA));
    my ($gidB,$giB) = agp_translate(\%placements, $data[6], $data[7]);
    next if(!defined($gidB));
    $data[2] = $gidA;
    $data[3] = $giA;
    $data[6] = ($gidB eq $gidA)?"=":$gidB;
    $data[7] = $giB;
    # Fields used to be re-joined with spaces, which is not valid SAM.
    print $out join("\t",@data), "\n";
}
close $sam;
close $out or die "Cannot finish writing $outsam: $!\n";

# Index the reference and convert the rewritten SAM to BAM using the .fai for the header.
system("samtools faidx $opt_r") == 0 or die "samtools faidx $opt_r failed\n";
my $fai = $opt_r.".fai";
system("samtools view -bt $fai $outsam > $outbam") == 0 or die "samtools view failed while writing $outbam\n";
================================================
FILE: scripts/odering2tour.pl
================================================
#!/usr/bin/perl -w
# For every "*_orderings.txt" file in the current directory, write a ".tour"
# file holding "<column1><column2> " for each input line, all on one line.
foreach my $ordering_file (glob "*_orderings.txt"){
    (my $tour_file = $ordering_file) =~ s/_orderings.txt//g;
    $tour_file .= ".tour";
    open(my $tour, "> $tour_file") or die"";
    open(my $orderings, $ordering_file) or die"";
    while(my $row = <$orderings>){
        chomp $row;
        my @cols = split(/\s+/,$row);
        # Name and orientation are glued together, followed by a single space.
        print $tour $cols[0].$cols[1]." ";
    }
    close $orderings;
    close $tour;
}
================================================
FILE: scripts/partition.pl
================================================
#!/usr/bin/perl -w
use Getopt::Std;
getopts "g:d:b:r:";
if ((!defined $opt_g)|| (!defined $opt_r)) {
die "************************************************************************
Usage: perl $0 -g Allele.gene.table -r draft.asm.fasta
-h : help and usage.
-g : Allele.gene.table
-b : optional,default prunning.bam
-r : reference ctg assembly
-d : optional, default wrk_dir
************************************************************************\n";
}
# Split a contig assembly and its pruned alignments into one directory per
# pre-defined cluster (chromosome of the related species). Each contig goes to
# the chromosome named most often for it in the allele table; every cluster
# directory receives ctg.list, seq.fasta (+ .fai) and sample.clean.bam.
my $bam = (defined $opt_b)?$opt_b:"prunning.bam";
my $table = $opt_g;
my $wrkd = (defined $opt_d)?$opt_d:"wrk_dir";
my $refSeq = $opt_r;

### Read reference ctg fasta
my %refdb = ();
my $ctgn;
open(IN, $refSeq) or die "Cannot open $refSeq: $!\n";
while(<IN>){
    chomp;
    if(/>/){
        $ctgn = $_;
        $ctgn =~ s/>//g;
        # Keep only the first word of the header, as partition_gmap.pl does.
        # Removing all whitespace glued any description onto the name, so the
        # contig could not match the names used in the BAM or the allele table.
        $ctgn =~ s/\s+.*//g;
    }else{
        $refdb{$ctgn} .= $_;
    }
}
close IN;
foreach $ctgn (keys %refdb){
    $refdb{$ctgn} =~ s/\s+//g;
}

### Read prunning BAM file
my @bamdb = ();     # [reference name, mate reference name, alignment line], in input order
my %rdb;            # read names already seen
open(IN, "samtools view $bam |") or die "Cannot run samtools view on $bam: $!\n";
while(<IN>){
    chomp;
    my @field = split(/\s+/,$_);
    next if(!defined($field[0]));
    next if(exists($rdb{$field[0]})); ### only retain single-end reads
    $rdb{$field[0]}++;
    # A line too short to name both references can never be selected below.
    next if(!defined($field[2]) or !defined($field[6]));
    push @bamdb, [$field[2],$field[6],$_];
}
close IN;

### Assign ctgs to pre-defined clusters
my %ctgdb;          # ctg -> chromosome -> number of table entries
open(IN, $table) or die "Cannot open $table: $!\n";
while(<IN>){
    chomp;
    my @data = split(/\s+/,$_);
    my $chrn = $data[1];
    foreach my $i(3..$#data){
        # Entries look like "<something>,<ctg>"; the contig is the second part.
        my $ctg = (split/,/,$data[$i])[1];
        next if(!defined($ctg));
        $ctgdb{$ctg}->{$chrn}++;
    }
}
close IN;

my %chrdb; ### pre-defined cluster based on chromosomes of close-relative species
foreach my $ctg (keys %ctgdb){
    # The chromosome with the most entries wins; ties are broken arbitrarily.
    my ($best) = sort {$ctgdb{$ctg}->{$b}<=>$ctgdb{$ctg}->{$a}} keys %{$ctgdb{$ctg}};
    push @{$chrdb{$best}}, $ctg;
}

system("rm -rf $wrkd");
system("mkdir $wrkd");
foreach my $chrn (keys %chrdb){
    # Clusters whose name looks like a contig are not real chromosomes.
    next if($chrn=~/tig/);
    next if($chrn=~/ctg/);
    system("rm -rf $wrkd/$chrn");
    system("mkdir $wrkd/$chrn");
    my @ctgs = @{$chrdb{$chrn}};
    my %tmpdb = (); $tmpdb{'='}++; ### need retain intra-contig links
    ### output ctg list to each cluster
    open(my $out, ">$wrkd/$chrn/ctg.list") or die "Cannot write $wrkd/$chrn/ctg.list: $!\n";
    foreach my $ctg (@ctgs){
        print $out "$ctg\n";
        $tmpdb{$ctg}++;
    }
    close $out;
    ### output ctg sequence to each cluster
    open(my $faout, ">$wrkd/$chrn/seq.fasta") or die "Cannot write $wrkd/$chrn/seq.fasta: $!\n";
    foreach my $ctg (@ctgs){
        print $faout ">$ctg\n$refdb{$ctg}\n" if(exists($refdb{$ctg}));
    }
    close $faout;
    ### output bam file to each cluster
    open(my $bamout, "> $wrkd/$chrn/sample.clean.sam") or die "Cannot write $wrkd/$chrn/sample.clean.sam: $!\n";
    foreach my $aln (@bamdb){
        my ($c1,$c2,$line) = @$aln;
        next if(!exists($tmpdb{$c1}) or !exists($tmpdb{$c2}));
        print $bamout "$line\n";
    }
    close $bamout;
    system("samtools faidx $wrkd/$chrn/seq.fasta");
    system("samtools view -bt $wrkd/$chrn/seq.fasta.fai $wrkd/$chrn/sample.clean.sam > $wrkd/$chrn/sample.clean.bam");
    system("rm $wrkd/$chrn/sample.clean.sam");
}
================================================
FILE: scripts/partition_gmap.pl
================================================
#!/usr/bin/perl -w
use Getopt::Std;
getopts "g:d:b:r:l:";
unless (defined $opt_g and defined $opt_r) {
die "************************************************************************
Usage: perl $0 -g Allele.ctg.table -r draft.asm.fasta
-h : help and usage.
-g : Allele.ctg.table
-b : optional,default prunning.bam
-r : reference ctg assembly
-d : optional, default wrk_dir
-l : chrn.list
************************************************************************\n";
}
# Split a contig assembly and its pruned alignments into one directory per
# chromosome listed in chrn.list. Each contig goes to the chromosome named
# most often for it in the allele table.
my $table  = $opt_g;
my $refSeq = $opt_r;
my $bam    = defined($opt_b) ? $opt_b : "prunning.bam";
my $wrkd   = defined($opt_d) ? $opt_d : "wrk_dir";

# Without -l, the list is derived from the first column of the allele table.
unless (defined $opt_l) {
    system("cut -f1 $table |sort -u > chrn.list");
    $opt_l = "chrn.list";
}

# Chromosome names that will get a sub-directory.
my %chrnListdb;
open(my $list_fh, $opt_l) or die"";
while (my $row = <$list_fh>) {
    chomp $row;
    my ($first_col) = split(/\s+/, $row);
    $chrnListdb{$first_col}++;
}
close $list_fh;

### Load the contig sequences; the name is the header up to the first whitespace
my %refdb = ();
my $ctgn;
open(my $fa_fh, $refSeq) or die"";
while (my $row = <$fa_fh>) {
    chomp $row;
    if ($row =~ />/) {
        ($ctgn = $row) =~ s/>//g;
        $ctgn =~ s/\s+.*//g;
        next;
    }
    $refdb{$ctgn} .= $row;
}
close $fa_fh;
s/\s+//g for values %refdb;

### Load the alignments, keeping the first record seen for each read name
my %bamdb = ();
my $count = 1;
my %rdb;
open(my $bam_fh, "samtools view $bam |") or die"";
while (my $aln = <$bam_fh>) {
    chomp $aln;
    my ($rname) = split(/\s+/, $aln);
    next if $rdb{$rname}++;
    $bamdb{$count++} = $aln;
}
close $bam_fh;

### Count, for every contig, how often each chromosome is named for it
my %ctgdb;
open(my $tab_fh, $table) or die"";
while (my $row = <$tab_fh>) {
    chomp $row;
    # Column 1 is the chromosome, column 2 is skipped, the rest are contigs.
    my ($chrn, undef, @ctgs) = split(/\s+/, $row);
    $ctgdb{$_}->{$chrn}++ for @ctgs;
}
close $tab_fh;

### Attach each contig to its most frequently named chromosome
my %chrdb;
foreach my $ctg (keys %ctgdb) {
    my $hits = $ctgdb{$ctg};
    my ($best) = sort { $hits->{$b} <=> $hits->{$a} } keys %$hits;
    $chrdb{$best} .= $ctg . ",";
}

system("rm -rf $wrkd");
system("mkdir $wrkd");
foreach my $chrn (keys %chrdb) {
    next unless exists $chrnListdb{$chrn};
    print "Process $chrn ...\n";
    my $dir = "$wrkd/$chrn";
    system("rm -rf $dir");
    system("mkdir $dir");
    my @members = split(/,/, $chrdb{$chrn});

    # '=' is accepted as a mate reference so that intra-contig links are kept.
    my %tmpdb = ('=' => 1);

    ### contig list of this cluster
    open(my $out, ">$dir/ctg.list") or die"";
    foreach my $ctg (@members) {
        print $out "$ctg\n";
        $tmpdb{$ctg}++;
    }
    close $out;

    ### contig sequences of this cluster
    open(my $faout, ">$dir/seq.fasta") or die"";
    foreach my $ctg (@members) {
        print $faout ">$ctg\n$refdb{$ctg}\n" if exists $refdb{$ctg};
    }
    close $faout;

    ### alignments whose reference and mate reference both belong to this cluster
    open(my $bamout, "> $dir/prunning.sub.sam") or die"";
    foreach my $i (keys %bamdb) {
        my ($c1, $c2) = (split(/\s+/, $bamdb{$i}))[2,6];
        print $bamout "$bamdb{$i}\n" if exists $tmpdb{$c1} and exists $tmpdb{$c2};
    }
    close $bamout;

    system("samtools faidx $wrkd/$chrn/seq.fasta");
    system("samtools view -bt $wrkd/$chrn/seq.fasta.fai $wrkd/$chrn/prunning.sub.sam > $wrkd/$chrn/prunning.sub.bam");
    system("rm $wrkd/$chrn/prunning.sub.sam");
}
================================================
FILE: scripts/partition_gmap.py
================================================
#!/usr/bin/env python
import sys
import os
import argparse
import multiprocessing
import pysam
def get_opt():
    """Parse the command line and return the argparse namespace.

    Options: -r/--ref and -g/--alleletable are required; -b/--bam,
    -d/--workdir and -t/--thread fall back to 'prunning.bam', 'wrk_dir'
    and 10.
    """
    option_specs = (
        (('-r', '--ref'),
         dict(help='reference contig level assembly', required=True)),
        (('-g', '--alleletable'),
         dict(help='Allele.ctg.table', required=True)),
        (('-b', '--bam'),
         dict(help='bam file, default: prunning.bam', default='prunning.bam')),
        (('-d', '--workdir'),
         dict(help='work directory, default: wrk_dir', default='wrk_dir')),
        (('-t', '--thread'),
         dict(help='threads, default: 10', type=int, default=10)),
    )
    parser = argparse.ArgumentParser()
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
def read_fasta(in_fa):
    """Load a FASTA file into a dict mapping sequence name to sequence.

    The name is the first whitespace-delimited word of the header line with
    the leading '>' removed; the sequence is the stripped sequence lines
    joined together.
    """
    pieces = {}
    with open(in_fa, 'r') as handle:
        for raw in handle:
            if raw.startswith('>'):
                name = raw.strip().split()[0][1:]
                pieces[name] = []
                continue
            pieces[name].append(raw.strip())
    return {name: ''.join(parts) for name, parts in pieces.items()}
def load_allele(allele_table):
    """Assign every contig of the allele table to one chromosome.

    Each table row is whitespace-separated: column 1 is the chromosome,
    column 2 is skipped, the remaining columns are contigs. Rows whose
    chromosome name starts with 'tig', 'scaffold', 'utg' or 'ctg' are
    ignored, and so are blank rows (which used to raise IndexError).

    Returns (ctg_on_chr, chr_contain_ctg):
      ctg_on_chr      -- {contig: chromosome named most often for it};
                         on a tie the chromosome seen first wins
      chr_contain_ctg -- {chromosome: {contig: 1}} for those assignments
    """
    skipped_prefixes = ('tig', 'scaffold', 'utg', 'ctg')
    hits = {}
    with open(allele_table, 'r') as fin:
        for line in fin:
            data = line.split()
            if not data:
                continue
            chrn = data[0]
            if chrn.startswith(skipped_prefixes):
                continue
            for ctg in data[2:]:
                per_chr = hits.setdefault(ctg, {})
                per_chr[chrn] = per_chr.get(chrn, 0) + 1

    ctg_on_chr = {}
    chr_contain_ctg = {}
    for ctg, per_chr in hits.items():
        # max() returns the first maximal key, matching the former strict '>' scan.
        best = max(per_chr, key=per_chr.get)
        ctg_on_chr[ctg] = best
        chr_contain_ctg.setdefault(best, {})[ctg] = 1
    return ctg_on_chr, chr_contain_ctg
def split_files(chrn, allele_table, ref, bam_file, wrkdir):
    """Write the FASTA and BAM subset for one chromosome group.

    Creates <wrkdir>/<chrn>/<chrn>.fa with the contigs assigned to `chrn`
    and <wrkdir>/<chrn>/<chrn>.bam with the alignments on those contigs
    whose mate reference is also assigned to `chrn`.

    A group with no assigned contigs (one skipped by load_allele, or one
    that won no contig) is reported and skipped; it used to raise KeyError.
    """
    ctg_on_chr, chr_contain_ctg = load_allele(allele_table)
    ctgs = chr_contain_ctg.get(chrn)
    if not ctgs:
        print("\tSkipping %s: no contigs assigned" % chrn)
        return

    wrk_dir = os.path.join(wrkdir, chrn)
    if not os.path.exists(wrk_dir):
        os.mkdir(wrk_dir)

    print("\tDealing %s"%chrn)
    fa_db = read_fasta(ref)

    sub_bam = os.path.join(wrk_dir, chrn+'.bam')
    sub_fa = os.path.join(wrk_dir, chrn+'.fa')
    with open(sub_fa, 'w') as fout:
        for ctg in ctgs:
            fout.write(">%s\n%s\n"%(ctg, fa_db[ctg]))

    with pysam.AlignmentFile(bam_file, 'rb') as fin:
        # The input header is reused unchanged for the subset BAM.
        with pysam.AlignmentFile(sub_bam, 'wb', template=fin) as fout:
            for ctg in ctgs:
                for line in fin.fetch(contig=ctg):
                    mate_ref = line.next_reference_name
                    # Keep the record only when its mate lies on a contig of the same group.
                    if mate_ref and ctg_on_chr.get(mate_ref) == chrn:
                        fout.write(line)
def partition_gmap(ref, allele_table, bam, wrkdir, threads):
    """Split the assembly and the BAM into one sub-directory per group.

    ref          -- contig-level FASTA
    allele_table -- Allele.ctg.table; its first column names the groups
    bam          -- BAM file; indexed with `samtools index` when no
                    <bam>.bai exists
    wrkdir       -- output directory, created when missing
    threads      -- upper bound on the number of worker processes

    Raises Exception listing the groups whose worker failed. Exits with
    status -1 when the BAM cannot be indexed.
    """
    if not os.path.exists(wrkdir):
        os.mkdir(wrkdir)

    print("Getting groups")
    chrn_db = {}
    with open(allele_table, 'r') as fin:
        for line in fin:
            fields = line.split()
            # Blank lines used to raise IndexError here.
            if not fields:
                continue
            chrn_db[fields[0]] = 1

    bai = bam+'.bai'
    if not os.path.exists(bai):
        print("BAI file not found, starting index...")
        ret = os.system('samtools index %s'%bam)
        if ret==0:
            print("Index success")
        else:
            print("Fatal: bam file must be sorted")
            sys.exit(-1)

    print("Splitting files")
    # Never more workers than groups, and never fewer than one:
    # multiprocessing.Pool rejects a process count below 1.
    threads = max(1, min(threads, len(chrn_db)))
    pool = multiprocessing.Pool(processes=threads)
    result_list = list()
    for chrn in chrn_db:
        result_list.append([chrn, pool.apply_async(split_files, (chrn, allele_table, ref, bam, wrkdir,))])
    pool.close()
    pool.join()

    # Collect worker failures so that every failing group is reported at once.
    error_list = list()
    for chrn, result in result_list:
        try:
            result.get()
        except Exception as e:
            print('Exception raised when dealing with {}: {}'.format(chrn, e))
            error_list.append(chrn)
    if error_list:
        raise Exception("{} exception(s) detected in : {}".format(len(error_list), ', '.join(error_list)))
    print("Notice: If you got errors of \"Length mismatch\" during allhic extract, it is normal because we split bam with the same header, it will not effect the result")
    print("Finished")
# Script entry point: parse the options and run the partitioning.
if __name__ == '__main__':
    cli = get_opt()
    partition_gmap(cli.ref, cli.alleletable, cli.bam, cli.workdir, cli.thread)
================================================
FILE: scripts/prune.pl
================================================
#!/usr/bin/perl -w
use Getopt::Std;
# prune.pl: collect the Hi-C links that should be removed before scaffolding.
# Steps performed by this script:
#   1. read every BAM named in the -b list and record inter-contig read pairs;
#   2. write links between contigs of the same allele-table row to removedb_Allele.txt;
#   3. write, per allele-table row, the links from other contigs to log.txt;
#   4. re-read log.txt and write every link that is not a contig's best match
#      within the row to removedb_nonBest.txt;
#   5. run remove_reads.pl, then convert prunning.sam to prunning.bam.
getopts "i:b:r:";
if ((!defined $opt_i)|| (!defined $opt_b)|| (!defined $opt_r)) {
die "************************************************************************
Usage: perl $0 -i Allele.ctg.table -b bam.list -r draft.asm.fasta
-h : help and usage.
-i : Allele.ctg.table
-b : bam.list, a file contains input bam files
-r : draft.sam.fasta
************************************************************************\n";
}
my $bamfile = $opt_b;
my $table = $opt_i;
my $refSeq = $opt_r;
### Read bam files
# %pairdb: first contig -> second contig (names in sort order) -> comma-terminated read names
# %ctgdb : contig -> number of times it was seen in an inter-contig pair
# %bamdb : BAM path -> count (only referenced by the commented-out code further down)
my %pairdb = ();
my %ctgdb = ();
my %bamdb = ();
open(IN, $bamfile) or die"";
while(<IN>){
chomp;
my $bam = $_;
$bam =~ s/\s+//g;
# NOTE(review): the dot is unescaped, so any character followed by "bam" is accepted.
next if(!($bam =~ /.bam/));
$bamdb{$bam}++;
open(my $fh, "samtools view $bam |") or die"";
while(<$fh>){
chomp;
my @data = split(/\s+/,$_);
my $ctg1 = $data[2];
my $ctg2 = $data[6];
# "=" in the mate-reference column means both ends hit the same contig; skipped.
next if($ctg2 eq "=");
my ($sa,$sb) = sort ($ctg1,$ctg2);
$pairdb{$sa}->{$sb} .= $data[0].",";
$ctgdb{$ctg1}++; $ctgdb{$ctg2}++;
}
close $fh;
}
close IN;
### Read allele information
### Remove signal between alleles
# Allele table rows with more than three fields are used; fields from index 2 on are contigs.
open(OUT1, ">removedb_Allele.txt") or die"";
open(LOG, "> log.txt") or die"";
open(IN, $table) or die"";
while(<IN>){
chomp;
my @data = split(/\s+/,$_);
next if(@data<=3);
my %tmpdb = (); ### Record allele contigs
my $n = $#data;
# Every pair of contigs within the row: remember it, and list its reads if it has any.
for(my $i=2;$i<$n;$i++){
my $ctg1 = $data[$i];
for(my $j=$i+1;$j<=$n;$j++){
my $ctg2 = $data[$j];
my ($sa,$sb) = sort ($ctg1,$ctg2);
my $key = $sa.",".$sb;
$tmpdb{$key}++;
print OUT1 "$sa $sb $pairdb{$sa}->{$sb}\n" if(exists($pairdb{$sa}->{$sb}));
}
}
# One log.txt record per row: ">row" followed by "other_ctg row_ctg n_reads read_names"
# for every linked contig that is not paired with row_ctg inside this row.
print LOG ">$_\n";
foreach my $i(2..$#data){
my $ctg1 = $data[$i];
foreach my $ctg2 (keys %ctgdb){
my ($sa,$sb) = sort ($ctg1,$ctg2);
my $key = $sa.",".$sb;
next if(exists($tmpdb{$key}));
next if(!exists($pairdb{$sa}->{$sb}));
my @rnamedb = split(/,/,$pairdb{$sa}->{$sb});
my $num_r = @rnamedb;
print LOG "$ctg2 $ctg1 $num_r $pairdb{$sa}->{$sb}\n";
}
}
}
close IN;
close OUT1;
close LOG;
### Remove signal which are not best match with listed alleles (ctgs)
open(OUT2, "> removedb_nonBest.txt") or die"";
open(IN, "log.txt") or die"";
# Read log.txt one ">" record at a time. The separator is changed globally and not restored.
$/='>';
# Discard the text before the first ">".
<IN>;
while(<IN>){
chomp;
# %hashdb: other contig -> { retain => row contig with the most reads, num => that read count }
my %hashdb = ();
my ($name,$info) = split(/\n/,$_,2);
my @linedb = split(/\n/,$info);
foreach my $line(@linedb){
my @data = split(/\s+/,$line);
# Strict ">" keeps the first row contig seen when counts are equal.
if(!exists($hashdb{$data[0]})){
$hashdb{$data[0]}->{'retain'} = $data[1];
$hashdb{$data[0]}->{'num'} = $data[2];
}elsif(exists($hashdb{$data[0]}) and $data[2]>$hashdb{$data[0]}->{'num'}){
$hashdb{$data[0]}->{'retain'} = $data[1];
$hashdb{$data[0]}->{'num'} = $data[2];
}
}
# Second pass over the record: every link other than the retained one is written as "remove".
# $line and @data are package variables here (no "my").
foreach $line (@linedb){
@data = split(/\s+/,$line);
if($hashdb{$data[0]}->{'retain'} eq $data[1]){
# print OUT2 "$data[0] $data[1] $data[2] retain $data[3]\n";
next;
}else{
print OUT2 "$data[0] $data[1] $data[2] remove $data[3]\n";
}
}
}
close IN;
close OUT2;
# External helper, found via PATH.
# NOTE(review): presumably it reads the removedb_*.txt files and writes prunning.sam - confirm.
system("remove_reads.pl");
### Reading removed reads
#my %removedb = ();
#open(IN, "removedb_Allele.txt") or die"";
#my $content = <IN>;
#my @linedb = split(/\n/,$content);
#foreach my $line (@linedb){
# my $info = (split/\s+/,$line)[2];
# my @rnamedb = split(/,/,$info);
# map {$removedb{$_}++} @rnamedb;
#
# }
#close IN;
#
#open(IN, "removedb_nonBest.txt") or die"";
#$content = <IN>;
#@linedb = split(/\n/,$content);
#foreach my $line (@linedb){
# my $info = (split/\s+/,$line)[4];
# my @rnamedb = split(/,/,$info);
# map {$removedb{$_}++} @rnamedb;
# }
#close IN;
#my $num_of_remove_reads = keys %removedb;
#print "Removing $num_of_remove_reads reads\n";
#open(OUT, "> prunning.sam") or die"";
#foreach my $bam (keys %bamdb){
# open(my $fh, "samtools view $bam |") or die"";
# $content = <$fh>;
# @linedb = split(/\n/,$content);
# foreach my $line (@linedb){
# my $rname = (split/\s+/,$line)[0];
# next if(exists($removedb{$rname}));
# print OUT "$line\n";
# }
# close $fh;
# }
#close OUT;
# Index the reference, then convert prunning.sam to BAM using the .fai for the header.
system("samtools faidx $refSeq");
my $fai = $refSeq.".fai";
system("samtools view -bt $fai prunning.sam > prunning.bam");
================================================
FILE: scripts/ragoo2ALLHiC.pl
================================================
#!/usr/bin/perl -w

# ragoo2ALLHiC.pl -- drive an ALLHiC run from RaGOO ordering files.
#
# Input : -l  list file with one "<group>_orderings.txt" path per line
#         -r  draft contig assembly (symlinked to ./draft.asm.fasta)
#         -b  cleaned Hi-C BAM      (symlinked to ./sample.clean.bam)
#         -e  restriction site, default GATC
# Output: clusters.txt (one row per ordering file: group, contig count,
#         contig names), followed by whatever `allhic extract`,
#         `ALLHiC_rescue`, `allhic optimize` and `ALLHiC_build` produce.
#
# External commands are run without a shell so paths containing spaces or
# shell metacharacters are passed through verbatim. A failing command is
# reported with a warning and the pipeline carries on, as it did before.

use Getopt::Std;
getopts "l:r:b:e:";

if ((!defined $opt_l)|| (!defined $opt_r) ||(!defined $opt_b)) {
    die "************************************************************************
Usage: perl ragoo2ALLHiC -l orderings.list -r draft.asm.fasta -b sample.clean.bam
-h : help and usage.
-l : ordering.list contains a list of output files from ragoo
-r : draft contig assembly
-b : sample.clean.bam
-e : restriction sites, optional, default GATC
MboI: GATC; HindIII: AAGCTT
************************************************************************\n";
}else{
    print "************************************************************************\n";
    print "Version demo\n";
    print "Copyright to Tanger\n";
    print "RUNNING...\n";
    print "************************************************************************\n";
}

$opt_e = (defined $opt_e)?$opt_e:"GATC";

# Run an external command in list form (no shell) and warn if it fails.
sub run_cmd {
    my @cmd = @_;
    system(@cmd) == 0
        or warn "WARNING: command failed (exit status $?): @cmd\n";
    return;
}

# Expose the inputs under the fixed names the downstream tools are called with.
if(!(-e "draft.asm.fasta")){
    run_cmd("ln", "-s", $opt_r, "./draft.asm.fasta");
}else{
    print "check draft.asm.fasta file, exist\n";
}

if(!(-e "sample.clean.bam")){
    run_cmd("ln", "-s", $opt_b, "./sample.clean.bam");
}else{
    print "check sample.clean.bam file, exist\n";
}

# Collect the contigs of every ordering file, keyed by group id (the file
# name without its directory and without the "_orderings.txt" suffix).
my $num_g = 0;      # number of ordering files listed; drives the optimize loop
my %cntdb = ();
open(my $list_fh, '<', $opt_l) or die "Cannot open ordering list '$opt_l': $!\n";
while(my $file = <$list_fh>){
    chomp $file;
    $file =~ s/^\s+//;
    $file =~ s/\s+$//;
    next if($file eq '');   # tolerate blank lines instead of dying on open('')
    $num_g++;
    my $gid = (split/\//,$file)[-1];
    $gid =~ s/_orderings\.txt//g;
    open(my $fh, '<', $file) or die "Cannot open ordering file '$file': $!\n";
    while(my $row = <$fh>){
        chomp $row;
        my $ctg = (split/\s+/,$row)[0];
        next if(!defined $ctg or $ctg eq '');   # blank line in the ordering file
        $cntdb{$gid}->{$ctg}++;
    }
    close $fh;
}
close $list_fh;

# Write the cluster table. Groups and contigs are sorted so that repeated
# runs on the same input give the same file.
open(my $out, '>', "clusters.txt") or die "Cannot write clusters.txt: $!\n";
print $out "#Group nContigs Contigs\n";
foreach my $g (sort keys %cntdb){
    my @ctgs = sort keys %{$cntdb{$g}};
    my $num  = scalar @ctgs;
    print $out "$g $num ";
    foreach my $c (@ctgs){
        print $out "$c ";
    }
    print $out "\n";
}
close $out or die "Cannot close clusters.txt: $!\n";

print "#### Counting restriction sites from draft assembly\n";
print "allhic extract sample.clean.bam draft.asm.fasta --RE $opt_e\n...\n\n";
run_cmd("allhic", "extract", "sample.clean.bam", "draft.asm.fasta", "--RE", $opt_e);

print "### Rescue unanchored contigs\n";
my $countRE = "sample.clean.counts_".$opt_e.".txt";
# The logged command now matches what is executed (it used to omit "-m 1").
print "ALLHiC_rescue -r draft.asm.fasta -b sample.clean.bam -c clusters.txt -i $countRE -m 1\n...\n\n";
run_cmd("ALLHiC_rescue", "-r", "draft.asm.fasta", "-b", "sample.clean.bam",
        "-c", "clusters.txt", "-i", $countRE, "-m", "1");

# One optimize run per listed ordering file, on the files group1.txt ..
# group<N>.txt (presumably written by ALLHiC_rescue -- confirm against it).
foreach my $i (1..$num_g){
    my $gn = "group".$i.".txt";
    print "### Scaffolding $gn\n";
    print "allhic optimize $gn sample.clean.clm\n...\n\n";
    run_cmd("allhic", "optimize", $gn, "sample.clean.clm");
}

print "### Build ALLHiC assembly\n";
run_cmd("ALLHiC_build", "draft.asm.fasta");
# This used to be system("Done ...\n"), which tried to execute "Done".
print "Done ...\n";
================================================
FILE: scripts/release3DDNA.pl
================================================
#!/usr/bin/perl -w

# release3DDNA.pl -- turn a 3D-DNA style scaffold FASTA into ALLHiC inputs.
#
# Usage: perl release3DDNA.pl No_of_chr seq.FINAL.fasta
#
# Scaffolds are ranked by length; the No_of_chr longest become Chr1..ChrK and
# the rest become scaffold<rank>. Outputs:
#   chr.fasta               renamed scaffolds, one sequence line each
#   tig.HiCcorrected.fasta  every scaffold cut at runs of 'N' into tigNNNNNNN
#   Chr<i>.tour             for chromosomes only: the contigs in order, all '+'
# Finally ALLHiC_build is run on tig.HiCcorrected.fasta.

die "Usage: perl $0 No_of_chr seq.FINAL.fasta\n" if(!defined ($ARGV[0]) or !defined($ARGV[1]));
my $Kchr = $ARGV[0];
die "No_of_chr must be a non-negative integer, got '$Kchr'\n" if($Kchr !~ /^\d+$/);

# Load all scaffolds: name -> { seq, len }.
my %infordb;
{
    # '>'-delimited records; localised so the rest of the script reads lines.
    local $/ = '>';
    open(my $in, '<', $ARGV[1]) or die "Cannot open '$ARGV[1]': $!\n";
    <$in>;      # discard whatever precedes the first '>'
    while(my $record = <$in>){
        chomp $record;
        my ($name,$seq) = split(/\n/,$record,2);
        next if(!defined $name);
        $seq = '' if(!defined $seq);    # header without sequence
        $seq =~ s/\s+//g;
        $infordb{$name}->{'seq'} = $seq;
        $infordb{$name}->{'len'} = length $seq;
    }
    close $in;
}

open(my $chr_out, '>', "chr.fasta") or die "Cannot write chr.fasta: $!\n";
open(my $tig_out, '>', "tig.HiCcorrected.fasta") or die "Cannot write tig.HiCcorrected.fasta: $!\n";

my $count = 0;      # rank of the current scaffold
my $ctgn  = 0;      # running contig number across all scaffolds
# Longest first; ties are broken by name so the numbering is reproducible.
foreach my $scaf (sort { $infordb{$b}->{'len'} <=> $infordb{$a}->{'len'} or $a cmp $b } keys %infordb){
    $count++;
    my $is_chr  = ($count <= $Kchr);
    my $chrname = ($is_chr ? 'Chr' : 'scaffold').$count;
    my $seq     = $infordb{$scaf}->{'seq'};
    print $chr_out ">$chrname\n$seq\n";

    print "Process $chrname\n";
    my $tour = "";
    # Every run of upper-case N is a gap; the pieces in between are contigs.
    foreach my $piece (split(/N+/,$seq)){
        next if($piece eq "");
        $ctgn++;
        my $ctgname = "tig".sprintf("%07d",$ctgn);
        $tour .= $ctgname."+ ";
        print $tig_out ">$ctgname\n$piece\n";
    }

    # Tour files are written for chromosomes only.
    next if(!$is_chr);
    my $otour = $chrname.".tour";
    open(my $out, '>', $otour) or die "Cannot write $otour: $!\n";
    print $out ">$chrname\n$tour\n";
    close $out or die "Cannot close $otour: $!\n";
}
close $chr_out or die "Cannot close chr.fasta: $!\n";
close $tig_out or die "Cannot close tig.HiCcorrected.fasta: $!\n";

system("ALLHiC_build", "tig.HiCcorrected.fasta") == 0
    or die "ALLHiC_build failed (exit status $?)\n";
================================================
FILE: scripts/remove_reads.pl
================================================
#!/usr/bin/perl -w

# remove_reads.pl -- write prunning.sam without the reads marked for removal.
#
# Reads, from the current directory:
#   bam.list              one BAM path per line (lines without ".bam" ignored)
#   removedb_Allele.txt   column 3 holds a comma-separated list of read names
#   removedb_nonBest.txt  column 5 holds a comma-separated list of read names
# Every alignment of every listed BAM is copied to prunning.sam unless its
# read name is in one of the removal lists or its mate reference (SAM
# column 7) is "*".

# Add the comma-separated read names found in column $col (0-based) of
# $file to the hash referenced by $removedb.
sub load_removed_reads {
    my ($file, $col, $removedb) = @_;
    open(my $fh, '<', $file) or die "Cannot open $file: $!\n";
    while(my $line = <$fh>){
        chomp $line;
        my $info = (split/\s+/,$line)[$col];
        next if(!defined $info);    # blank or short line: nothing to remove
        $removedb->{$_}++ foreach (split(/,/,$info));
    }
    close $fh;
    return;
}

my %bamdb = ();
open(my $list_fh, '<', "bam.list") or die "Cannot open bam.list: $!\n";
while(my $bam = <$list_fh>){
    chomp $bam;
    $bam =~ s/\s+//g;
    next if($bam !~ /\.bam/);   # literal ".bam"; the dot used to be a wildcard
    $bamdb{$bam}++;
}
close $list_fh;

my %removedb = ();
load_removed_reads("removedb_Allele.txt",  2, \%removedb);
load_removed_reads("removedb_nonBest.txt", 4, \%removedb);

my $num_of_remove_reads = keys %removedb;
print "Removing $num_of_remove_reads reads\n";

open(my $out, '>', "prunning.sam") or die "Cannot write prunning.sam: $!\n";
# Sorted so the output order does not depend on hash ordering.
foreach my $bam (sort keys %bamdb){
    # List-form pipe open: the BAM name never passes through a shell.
    open(my $fh, '-|', "samtools", "view", $bam) or die "Cannot run samtools view $bam: $!\n";
    while(my $line = <$fh>){
        chomp $line;
        my @data = split(/\s+/,$line);
        next if(!@data);
        my ($rname, $ctg2) = @data[0,6];
        next if(defined $ctg2 and $ctg2 eq "*");    # mate is unmapped
        print $out "$line\n" if(!exists($removedb{$rname}));
    }
    # close() on a pipe reports a non-zero exit of the child.
    close $fh or warn "WARNING: samtools view $bam did not exit cleanly (status $?)\n";
}
close $out or die "Cannot close prunning.sam: $!\n";
================================================
FILE: scripts/remove_small_contigs.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Xiaofei Zeng
# Email: xiaofei_zeng@whu.edu.cn
# Created Time: 2021-04-16 18:16
import argparse
def assembly_to_groups(assembly, len_cutoff):
    """Parse an assembly file into a contig table and a list of groups.

    Header lines start with '>' and carry ``>name index length``; every other
    non-blank line lists the (optionally '-'-prefixed or -suffixed) indices of
    one group.

    Args:
        assembly: path of the assembly file.
        len_cutoff: contigs whose length is below this value (same unit as
            the length column) are left out of the groups.

    Returns:
        ``(ctg_dict, cluster_list)``: ``ctg_dict`` maps index -> contig name
        for every header line, including the short ones; ``cluster_list`` is
        one list of indices per group with short contigs removed. Groups that
        end up empty are dropped so that no zero-contig cluster is produced.
    """
    ctg_dict = dict()
    cluster_list = list()
    small_frag = set()
    with open(assembly) as f:
        for line in f:
            if not line.strip():
                continue
            cols = line.split()
            if line.startswith('>'):
                ctg_dict[cols[1]] = cols[0][1:]
                if int(cols[2]) < len_cutoff:
                    small_frag.add(cols[1])
            else:
                # strip('-') removes the orientation sign from the index
                kept = [num.strip('-') for num in cols
                        if num.strip('-') not in small_frag]
                if kept:
                    cluster_list.append(kept)
    return ctg_dict, cluster_list
def output_clusters(ctg_dict, cluster_list):
    """Write the groups to ``prunning.clusters.txt`` in the working directory.

    After a header row, each group becomes one tab-separated row:
    ``<total groups>g<group number>``, the contig count, and the contig names
    (looked up in ``ctg_dict``) joined by single spaces.
    """
    total_groups = len(cluster_list)
    with open('prunning.clusters.txt', 'w') as handle:
        handle.write('#Group\tnContigs\tContigs\n')
        group_no = 0
        for members in cluster_list:
            group_no += 1
            names = ' '.join(ctg_dict[member] for member in members)
            row = '{0}g{1}\t{2}\t{3}\n'.format(total_groups, group_no, len(members), names)
            handle.write(row)
def output_counts(ctg_dict, counts):
    """Copy the rows of a counts table whose contig is listed in ``ctg_dict``.

    Lines starting with '#' are copied unchanged; any other line is kept only
    if its first column is one of the contig names in ``ctg_dict``. Blank
    lines are dropped.

    Args:
        ctg_dict: mapping whose values are the contig names to keep.
        counts: path of the input counts file.

    The result is written to ``sub.<file name>`` in the working directory.
    Only the file name of ``counts`` is used, so an input given with a
    directory part no longer yields an invalid ``sub.<dir>/<file>`` path.
    """
    import os

    # set lookup instead of scanning ctg_dict.values() for every line
    wanted = set(ctg_dict.values())
    out_path = 'sub.' + os.path.basename(counts)
    with open(counts) as fin, open(out_path, 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
            else:
                cols = line.split()
                if cols and cols[0] in wanted:
                    fout.write(line)
def output_fasta(ctg_dict, fasta):
    """Copy the FASTA records whose name is listed in ``ctg_dict``.

    A record is kept when the first word of its header, without the leading
    '>', is one of the contig names in ``ctg_dict``; its header and sequence
    lines are then copied unchanged.

    Args:
        ctg_dict: mapping whose values are the contig names to keep.
        fasta: path of the input FASTA file.

    The result is written to ``sub.<file name>`` in the working directory.
    Only the file name of ``fasta`` is used, so an input given with a
    directory part no longer yields an invalid ``sub.<dir>/<file>`` path.
    """
    import os

    # set lookup instead of scanning ctg_dict.values() for every header
    wanted = set(ctg_dict.values())
    out_path = 'sub.' + os.path.basename(fasta)
    output = False
    with open(fasta) as fin, open(out_path, 'w') as fout:
        for line in fin:
            if line.startswith('>'):
                output = line.split()[0][1:] in wanted
                if output:
                    fout.write(line)
            elif output:
                fout.write(line)
def main():
    """Command-line entry point.

    Parses the arguments, always rebuilds prunning.clusters.txt from the
    assembly file, and filters the FASTA and/or counts file when given.
    The length cutoff is entered in Kbp and converted to bp here.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'assembly',
        help='*.review.assembly (output file of juicebox manual grouping), used to generate new prunning.clusters.txt')
    cli.add_argument(
        '--fasta', default=None,
        help='input fasta file of contigs, this parameter will remove contigs not in .review.assembly, optional')
    cli.add_argument(
        '--counts', default=None,
        help='input prunning.counts_RE.txt, this parameter will remove contigs not in .review.assembly, optional')
    cli.add_argument(
        '--len_cutoff', default=100, type=float,
        help='length cutoff, default: %(default)s Kbp')
    options = cli.parse_args()

    cutoff_bp = options.len_cutoff*1000
    contigs, clusters = assembly_to_groups(options.assembly, cutoff_bp)
    output_clusters(contigs, clusters)

    if options.fasta:
        output_fasta(contigs, options.fasta)
    if options.counts:
        output_counts(contigs, options.counts)


if __name__ == '__main__':
    main()
================================================
FILE: scripts/simuCTG.pl
================================================
#!/usr/bin/perl -w

# simuCTG.pl -- cut a chromosome assembly into simulated contigs.
#
# Contig lengths are drawn with R from a normal distribution (mean -m,
# sd -s; a 'k' or 'm' suffix means x1e3 / x1e6) and made positive with abs().
# Records whose header matches chrUn are copied through uncut.
#
# Outputs (working directory):
#   new_genome.posi.bed       "<chr> <start> <end>" (1-based, inclusive)
#   ctg.n<N50>_m<mean>.fasta  simulated contigs followed by the chrUn records
# Requires `R` on PATH. Statistics come from ~/software/script/faSize.pl when
# that script is available and are otherwise computed here.

use Getopt::Std;
getopts "i:m:s:";

if ((!defined $opt_i)|| (!defined $opt_m) || (!defined $opt_s)) {
    die "************************************************************************
Usage: perl $0 -i input.fasta -m mean -s SD
-h : help and usage.
-i : input.fasta, chromosome assembly
-m : mean length
-s : sd
************************************************************************\n";
}else{
    print "************************************************************************\n";
    print "Version 1.1\n";
    print "Copyright to Tanger\n";
    print "RUNNING...\n";
    print "************************************************************************\n";
}

# Convert a length such as "500k" or "1.5m" to base pairs.
sub to_bp {
    my ($value) = @_;
    $value = lc $value;
    if($value =~ /m/){
        $value =~ s/m//g;
        $value = $value * 1000000;
    }elsif($value =~ /k/){
        $value =~ s/k//g;
        $value = $value * 1000;
    }
    return $value;
}

# Ask R for $n normal deviates and return them in order.
# Rscript.txt is rewritten from scratch on every call so that a stale copy
# from an aborted run is never executed again.
sub draw_lengths {
    my ($n, $mu, $sigma) = @_;
    open(my $r_fh, '>', "Rscript.txt") or die "Cannot write Rscript.txt: $!\n";
    print $r_fh "data<-rnorm($n,mean=$mu,sd=$sigma)\n";
    print $r_fh "write.table(data,file='x.txt')\n";
    close $r_fh or die "Cannot close Rscript.txt: $!\n";
    system("R CMD BATCH --no-save ./Rscript.txt") == 0
        or die "R CMD BATCH failed (exit status $?); see Rscript.txt.Rout\n";
    open(my $x_fh, '<', "x.txt") or die "Cannot open x.txt: $!\n";
    my @rows = <$x_fh>;
    close $x_fh;
    unlink "x.txt";
    unlink glob("Rscript.*");
    shift @rows;    # first row of write.table output is the column header
    my @values;
    foreach my $row (@rows){
        my $v = (split/\s+/,$row)[1];
        push @values, $v if(defined $v);
    }
    return @values;
}

# N50 and integer mean of a list of lengths; (0, 0) for an empty list.
sub n50_and_mean {
    my @len = sort { $b <=> $a } @_;
    return (0, 0) if(!@len);
    my $total = 0;
    $total += $_ foreach (@len);
    my $running = 0;
    my $n50 = 0;
    foreach my $len (@len){
        $running += $len;
        if($running >= $total/2){
            $n50 = $len;
            last;
        }
    }
    return ($n50, int($total/scalar(@len)));
}

my $mean = to_bp($opt_m);
my $sd   = to_bp($opt_s);
# sd is the divisor when sizing the R sample, so it has to be positive.
die "ERROR: -s (SD) must be a number greater than 0\n" if(!($sd > 0));

print "1. generate a contig assembly with Average length = $mean bp ...\n";

# --- read the assembly -------------------------------------------------
my %chrdb;          # chromosome id -> sequence
my @chr_order;      # chromosome ids in input order
my @ctg_lengths;    # lengths of everything that ends up in the output
open(my $un_fh, '>', "chrUn.fasta") or die "Cannot write chrUn.fasta: $!\n";
open(my $in_fh, '<', $opt_i) or die "Cannot open '$opt_i': $!\n";
{
    local $/ = '>';     # '>'-delimited FASTA records, restored after the block
    <$in_fh>;           # discard whatever precedes the first '>'
    while(my $record = <$in_fh>){
        chomp $record;
        my ($header,$seq) = split(/\n/,$record,2);
        next if(!defined $header);
        $seq = '' if(!defined $seq);
        $seq =~ s/\s+//g;
        if($header=~/[C|c]hrUn/){
            print $un_fh ">$header\n$seq\n";
            push @ctg_lengths, length $seq;
            next;
        }
        # Only the first word of the header is used as id: the id goes
        # through the whitespace-separated BED file below.
        my ($id) = split(' ',$header);
        next if(!defined $id);
        push @chr_order, $id if(!exists($chrdb{$id}));
        $chrdb{$id} = $seq;
    }
}
close $in_fh;
close $un_fh or die "Cannot close chrUn.fasta: $!\n";

# --- lay consecutive intervals along every chromosome ------------------
open(my $bed_fh, '>', "new_genome.posi.bed") or die "Cannot write new_genome.posi.bed: $!\n";
foreach my $id (@chr_order){
    my $total_len = length $chrdb{$id};
    my $num_seq = int($total_len/$sd + 500);
    my $end = 0;
    foreach my $draw (draw_lengths($num_seq, $mean, $sd)){
        my $start = $end + 1;
        last if($start > $total_len);
        my $len = int(abs($draw));
        next if($len == 0);             # would be an empty contig
        $end = $start + $len - 1;
        $end = $total_len if($end > $total_len);    # never run past the chromosome
        print $bed_fh "$id $start $end\n";
    }
    warn "WARNING: $id: ran out of simulated lengths at $end of $total_len bp\n"
        if($end < $total_len);
}
close $bed_fh or die "Cannot close new_genome.posi.bed: $!\n";

# --- cut the contigs ---------------------------------------------------
my $count = 0;
my %tdb;
open(my $tmp_fh, '>', "ctg.tmp.fasta") or die "Cannot write ctg.tmp.fasta: $!\n";
open(my $pos_fh, '<', "new_genome.posi.bed") or die "Cannot open new_genome.posi.bed: $!\n";
while(my $line = <$pos_fh>){
    chomp $line;
    my ($chrn,$from,$to) = split(/\s+/,$line);
    next if(!defined $to);
    my $subseq = substr($chrdb{$chrn},$from-1,$to-$from+1);
    # Contig numbering restarts at 1 for every chromosome.
    if(!exists($tdb{$chrn})){
        $count = 0;
        $tdb{$chrn}++;
    }
    $count++;
    my $outname = $chrn.".ctg".$count;
    print $tmp_fh ">$outname\n$subseq\n";
    push @ctg_lengths, length $subseq;
}
close $pos_fh;
close $tmp_fh or die "Cannot close ctg.tmp.fasta: $!\n";

system("cat ctg.tmp.fasta chrUn.fasta > ctg.fasta");
system("rm ctg.tmp.fasta");

print "2. get statistics for the contig assembly ...\n";
# Run the external statistics script once, show its report and parse it.
my $content = `perl ~/software/script/faSize.pl ctg.fasta`;
$content = '' if(!defined $content);
print $content;
my ($N50) = $content =~ /N50:\s+(\d+)/;
my ($ave) = $content =~ /Average\s+length:\s+(\d+)/;
if(!defined $N50 or !defined $ave){
    # faSize.pl is missing or printed something unexpected.
    ($N50,$ave) = n50_and_mean(@ctg_lengths);
    print "N50: $N50\nAverage length: $ave\n";
}
my $ctgname = "ctg."."n".$N50."_m".$ave.".fasta";
rename("ctg.fasta", "./$ctgname") or die "Cannot rename ctg.fasta to $ctgname: $!\n";
================================================
FILE: scripts/statAGP.pl
================================================
#!/usr/bin/perl -w

# statAGP.pl -- summarise how much of an assembly an AGP file anchors.
#
# Usage: perl statAGP.pl chr.agp
#
# A component whose object name (column 1) equals its component id
# (column 6) is counted as unanchored; every other component is anchored to
# the object named in column 1. Gap lines and comment lines are ignored.

die "Usage: perl $0 chr.agp\n" if(!defined $ARGV[0]);
my $agp = $ARGV[0];
my %uctgdb;     # unanchored component ids
my %actgdb;     # anchored component ids
my %chrdb;      # object -> { ctg => component count, len => object end }
my $sumL = 0;   # total component length
my $sumC = 0;   # number of component lines
my $sumU = 0;   # total length of unanchored components

# The file is read directly: the file name never reaches a shell, and gap
# lines are recognised by their component type (column 5 is N or U) instead
# of by the word "contig", which also discarded contigs named "contig...".
open(my $in, '<', $agp) or die "Cannot open '$agp': $!\n";
while(my $line = <$in>){
    chomp $line;
    next if($line =~ /^\s*$/ or $line =~ /^#/);
    my @data = split(/\s+/,$line);
    next if(@data < 8);
    next if($data[4] eq 'N' or $data[4] eq 'U');
    # Columns 7 and 8 are component_beg and component_end (1-based, inclusive).
    my $len = $data[7] - $data[6] + 1;
    $sumC++;
    $sumL += $len;
    if($data[0] eq $data[5]){
        $uctgdb{$data[5]} = "Unanchor";
        $sumU += $len;
    }else{
        $actgdb{$data[5]} = "Anchor";
        $chrdb{$data[0]}->{'ctg'}++;
        # object_end of the last component seen for this object
        $chrdb{$data[0]}->{'len'} = $data[2];
    }
}
close $in;

my $numU = keys %uctgdb;
my $numA = keys %actgdb;
my $sumA = 0;
print "ChrID Anchored_ctg Length\n";
# Longest first; ties are broken by name so the report is reproducible.
foreach my $chrn (sort { $chrdb{$b}->{'len'} <=> $chrdb{$a}->{'len'} or $a cmp $b } keys %chrdb){
    $sumA += $chrdb{$chrn}->{'len'};
    print "$chrn $chrdb{$chrn}->{'ctg'} $chrdb{$chrn}->{'len'}\n";
}
print "Total number of contigs (bp): $sumC\n";
print "Total length of contigs (bp): $sumL\n";
print "Total number of anchored contgis: $numA\n";
print "Total length of chromosome level assembly (bp): $sumA\n";
print "Number of unanchored contigs: $numU\n";
print "Length of unanchored contigs: $sumU\n";
# An AGP without component lines would otherwise divide by zero.
my $arate = ($sumL > 0) ? (1-$sumU/$sumL)*100 : 0;
$arate = sprintf("%.2f",$arate);
print "Anchor rate (%): $arate\n";
gitextract_i521fsdf/
├── .gitmodules
├── README.md
├── allhic.v0.9.8
├── bin/
│ ├── ALLHiC_build
│ ├── ALLHiC_corrector
│ ├── ALLHiC_partition
│ ├── ALLHiC_pip.sh
│ ├── ALLHiC_plot
│ ├── ALLHiC_prune
│ ├── ALLHiC_rescue
│ └── allhic
└── scripts/
├── ALLHiC2ALLMAPS.pl
├── PreprocessSAMs.pl
├── agp2tour.pl
├── bam2CLM.pl
├── bam2CLM_simple.pl
├── bam2net.pl
├── bam_HiCplotter.py
├── blastn_parse.pl
├── classify.pl
├── filterBAM_forHiC.pl
├── gmap2AlleleTable.pl
├── gmap2AlleleTableBED.pl
├── link_superscaffold.pl
├── make_bed_around_RE_site.pl
├── mc_bam.pl
├── odering2tour.pl
├── partition.pl
├── partition_gmap.pl
├── partition_gmap.py
├── prune.pl
├── ragoo2ALLHiC.pl
├── release3DDNA.pl
├── remove_reads.pl
├── remove_small_contigs.py
├── simuCTG.pl
└── statAGP.pl
SYMBOL INDEX (14 symbols across 3 files) FILE: scripts/bam_HiCplotter.py function get_read_pos_with_sam_bam_file (line 10) | def get_read_pos_with_sam_bam_file(sam_bam_file): function get_chr_len (line 37) | def get_chr_len(chr_list): function calc_read_count_per_bin (line 51) | def calc_read_count_per_bin(chr_len_db, chr_order, read_on_chr, bin_size): function draw_heatmap (line 115) | def draw_heatmap(data, chrn, bin_size, ext): FILE: scripts/partition_gmap.py function get_opt (line 9) | def get_opt(): function read_fasta (line 19) | def read_fasta(in_fa): function load_allele (line 34) | def load_allele(allele_table): function split_files (line 63) | def split_files(chrn, allele_table, ref, bam_file, wrkdir): function partition_gmap (line 86) | def partition_gmap(ref, allele_table, bam, wrkdir, threads): FILE: scripts/remove_small_contigs.py function assembly_to_groups (line 11) | def assembly_to_groups(assembly, len_cutoff): function output_clusters (line 29) | def output_clusters(ctg_dict, cluster_list): function output_counts (line 37) | def output_counts(ctg_dict, counts): function output_fasta (line 48) | def output_fasta(ctg_dict, fasta): function main (line 62) | def main():
Condensed preview — 37 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (116K chars).
[
{
"path": ".gitmodules",
"chars": 74,
"preview": "[submodule \"src\"]\n\tpath = src\n\turl = https://github.com/tanghaibao/allhic\n"
},
{
"path": "README.md",
"chars": 501,
"preview": "# ALLHiC\nALLHiC: phasing and scaffolding polyploid genomes based on Hi-C data \nSee wiki for details (https://github.com"
},
{
"path": "bin/ALLHiC_build",
"chars": 1578,
"preview": "#!/usr/bin/perl -w\n\n\ndie \"Usage: perl $0 refSeq.fasta\\n\" if(!(defined $ARGV[0]));\n\nprint \"1. tour format to agp ...\\n\";\n"
},
{
"path": "bin/ALLHiC_corrector",
"chars": 8997,
"preview": "#!/usr/bin/env python\nimport sys\nimport multiprocessing\nimport math\nimport numpy as np\nimport pysam\nimport time\nimport a"
},
{
"path": "bin/ALLHiC_partition",
"chars": 1676,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:r:e:k:m:\";\n\n\nif ( (!defined $opt_r)|| (!defined $opt_e)|| (!defined $opt"
},
{
"path": "bin/ALLHiC_pip.sh",
"chars": 2796,
"preview": "#!/bin/bash\n\nusage()\n{\n\techo \" Usage: `basename $0` -r reference -1 R1.fq -2 R2.fq -k group_count [-e enzyme] [-t thr"
},
{
"path": "bin/ALLHiC_plot",
"chars": 10040,
"preview": "#!/usr/bin/env python\nimport argparse\nimport numpy as np\nimport matplotlib as mpl\nmpl.use(\"Agg\")\nimport matplotlib.pyplo"
},
{
"path": "bin/ALLHiC_rescue",
"chars": 5091,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:r:c:i:m:\";\n\n\n\nif ( (!defined $opt_b)|| (!defined $opt_r)|| (!defined $op"
},
{
"path": "scripts/ALLHiC2ALLMAPS.pl",
"chars": 653,
"preview": "#!/usr/bin/perl -w\n### Convert ALLHiC output AGP file to ALLMAPS input csv file\nprint \"Convert ALLHiC output AGP file to"
},
{
"path": "scripts/PreprocessSAMs.pl",
"chars": 8482,
"preview": "#!/usr/bin/perl -w\nuse strict;\n\n\n\n# PreprocessSAMs.pl\n#\n# Syntax: PreprocessSAMs.pl <sam or bam filename> <draft assembl"
},
{
"path": "scripts/agp2tour.pl",
"chars": 665,
"preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 chr.agp\\n\" if(!defined $ARGV[0]);\nmy %infordb;\nmy $cnt = 0;\nopen(IN, \"grep -v co"
},
{
"path": "scripts/bam2CLM.pl",
"chars": 5176,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:r:d:\";\n\n\nif ((!defined $opt_b)|| (!defined $opt_r) || (!defined $opt_d) "
},
{
"path": "scripts/bam2CLM_simple.pl",
"chars": 3007,
"preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 mapping.bam refSeq.fasta\\n\" if((!defined $ARGV[0]) or (!defined $ARGV[1]));\n\nmy "
},
{
"path": "scripts/bam2net.pl",
"chars": 1420,
"preview": "#!/usr/bin/perl -w\nuse Getopt::Std;\ngetopts \"c:b:o:\";\n\n\nif ((!defined $opt_c)|| (!defined $opt_b)||(!defined $opt_o) ) {"
},
{
"path": "scripts/bam_HiCplotter.py",
"chars": 6809,
"preview": "#!/usr/bin/env python\nimport os\nimport sys\nimport gc\nfrom math import log\nimport time\n\n\n# Get position of read based on "
},
{
"path": "scripts/blastn_parse.pl",
"chars": 1721,
"preview": "#!/usr/bin/perl -w\n\n###This script was used to parse blast+ result (outfmt 6)\n###you can get best hit with parameter -b "
},
{
"path": "scripts/classify.pl",
"chars": 3698,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"i:p:r:g:\";\n\n\nif ((!defined $opt_i)|| (!defined $opt_p) || (!defined $opt_r"
},
{
"path": "scripts/filterBAM_forHiC.pl",
"chars": 1156,
"preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 file.bam out.sam\\n\" if(!defined($ARGV[0]) or !defined($ARGV[1]));\nopen(OUT, \"> $"
},
{
"path": "scripts/gmap2AlleleTable.pl",
"chars": 711,
"preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 ref.gff3\\n\" if(!defined ($ARGV[0]));\nmy $refGFF = $ARGV[0];\nopen(IN, \"grep 'gene"
},
{
"path": "scripts/gmap2AlleleTableBED.pl",
"chars": 675,
"preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 ref.bed\\n\" if(!defined ($ARGV[0]));\nmy $refGFF = $ARGV[0];\nopen(IN, \"grep 'gene'"
},
{
"path": "scripts/link_superscaffold.pl",
"chars": 3611,
"preview": "#!/usr/bin/perl -w\n\nmy %namedb;\nmy %removedb;\nwhile(<DATA>){\n\tchomp;\n\tmy ($id,$name) = (split/\\s+/,$_)[0,1];\n\t$namedb{$n"
},
{
"path": "scripts/make_bed_around_RE_site.pl",
"chars": 7717,
"preview": "#!/usr/bin/perl -w\nuse strict;\n\n\n# make_bed_around_restriction_site.pl: Make a BED file representing the regions around "
},
{
"path": "scripts/mc_bam.pl",
"chars": 2123,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"b:a:r:\";\n\n\nif ((!defined $opt_b)|| (!defined $opt_a) ||(!defined $opt_r)) "
},
{
"path": "scripts/odering2tour.pl",
"chars": 350,
"preview": "#!/usr/bin/perl -w\n\nwhile(my $file=glob \"*_orderings.txt\"){\n\tmy $name = $file; \n\t\t $name =~ s/_orderings.txt//g;\n\t\t $nam"
},
{
"path": "scripts/partition.pl",
"chars": 2932,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"g:d:b:r:\";\n\n\nif ((!defined $opt_g)|| (!defined $opt_r)) {\n die \"*******"
},
{
"path": "scripts/partition_gmap.pl",
"chars": 3248,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"g:d:b:r:l:\";\n\n\nif ((!defined $opt_g)|| (!defined $opt_r)) {\n die \"*****"
},
{
"path": "scripts/partition_gmap.py",
"chars": 4071,
"preview": "#!/usr/bin/env python\nimport sys\nimport os\nimport argparse\nimport multiprocessing\nimport pysam\n\n\ndef get_opt():\n\tgroup ="
},
{
"path": "scripts/prune.pl",
"chars": 4175,
"preview": "#!/usr/bin/perl -w\n\n\nuse Getopt::Std;\ngetopts \"i:b:r:\";\n\n\nif ((!defined $opt_i)|| (!defined $opt_b)|| (!defined $opt_r))"
},
{
"path": "scripts/ragoo2ALLHiC.pl",
"chars": 2615,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"l:r:b:e:\";\n\n\nif ((!defined $opt_l)|| (!defined $opt_r) ||(!defined $opt_b)"
},
{
"path": "scripts/release3DDNA.pl",
"chars": 1410,
"preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 No_of_chr seq.FINAL.fasta\\n\" if(!defined ($ARGV[0]) or !defined($ARGV[1]));\nmy $"
},
{
"path": "scripts/remove_reads.pl",
"chars": 1131,
"preview": "#!/usr/bin/perl -w\n\nmy %bamdb = ();\nopen(IN, \"bam.list\") or die\"\";\nwhile(<IN>){\n chomp;\n my $bam = $_;\n "
},
{
"path": "scripts/remove_small_contigs.py",
"chars": 2813,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\n# Author: Xiaofei Zeng\n# Email: xiaofei_zeng@whu.edu.cn\n# Created Time: 2"
},
{
"path": "scripts/simuCTG.pl",
"chars": 3294,
"preview": "#!/usr/bin/perl -w\n\nuse Getopt::Std;\ngetopts \"i:m:s:\";\n\nif ((!defined $opt_i)|| (!defined $opt_m) || (!defined $opt_s))"
},
{
"path": "scripts/statAGP.pl",
"chars": 1215,
"preview": "#!/usr/bin/perl -w\n\ndie \"Usage: perl $0 chr.agp\\n\" if(!defined $ARGV[0]);\nmy $agp = $ARGV[0];\nmy %uctgdb;\nmy %actgdb;\nmy"
}
]
// ... and 3 more files (download for full content)
About this extraction
This page contains the full source code of the tangerzhang/ALLHiC GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 37 files (10.5 MB), approximately 35.9k tokens, and a symbol index with 14 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.