Repository 'splicetrap'
hg clone https://toolshed.g2.bx.psu.edu/repos/bioitcore/splicetrap

Changeset 1:adc0f7765d85 (2017-09-07)
Previous changeset 0:d4ca551ca300 (2017-09-07) Next changeset 2:01169764d7ca (2017-09-07)
Commit message:
planemo upload
added:
PostAnalysis
SpliceChange
SpliceTrap.pl
TXdbgen
bin/ApplyCutoff.jie.pl
bin/Pair_estimate_c
bin/PostAnalysis
bin/PostAnalysis.pl
bin/SpliceChange
bin/SpliceChange.pl
bin/SpliceTrap
bin/SpliceTrap.pl
bin/SpliceTrap_measure.pl
bin/TXdbgen
bin/TXdbgen.pl
bin/apply_cutoff.sh
bin/batch_para_cov10p_fit.sh
bin/batchqsub.pl
bin/batchqsub.pl_orig
bin/beta_fit.R
bin/bowtie2eland.pl
bin/calc_pval.R
bin/downloaddb.pl
bin/get.frag.size.pl
bin/get.hist.pl
bin/get_bed_fa_j.pl
bin/get_event_dist_fit.pl
bin/gtf2bed.pl
bin/mapping_bowtie.sh
bin/mapping_rmap.sh
bin/mark.mt.4eland.pl
bin/rmap2eland.pl
bin/scan_nomt.pl
bin/scanbed2txdb.pl
bin/splitdb.sh
bin/vslz.pl
cutoffs/cutoff.pair.06.txt
cutoffs/cutoff.pair.07.txt
cutoffs/cutoff.pair.08.txt
refGenes.bed
splice_trap.xml
src/Makefile
src/splicetrap.estimate.cpp
test-data/input1.fastq
test-data/input2.fastq
test-data/output1.txt
test-data/output2.txt
removed:
SpliceTrap.tar.gz
b
diff -r d4ca551ca300 -r adc0f7765d85 PostAnalysis
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PostAnalysis Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,66 @@
+#!/usr/bin/perl
+my $SrcFolder="/home/galaxy/galaxy-dist/tools/SpliceTrap.0.90.1/bin";
+# this script is a wrapup for Post analysis based on the ratio file output
+
+use strict;
+use Getopt::Long;
+my $RatioFile="";
+my $OutputFile = "";
+my $JunctionCut=5;
+my $CutoffLevel="M";
+my $noIRM = 0;
+my $noIRMstr="";
+
+GetOptions (
+ "i:s"=>\$RatioFile,
+ "o:s"=>\$OutputFile,
+ "c:s"=>\$CutoffLevel,
+ "noIRM|noirm"=>\$noIRM,
+ "j:i"=>\$JunctionCut
+);
+
+my $InputParaDes=" Usage of the script:
+ -i      input file (.ratio file)
+ -o      output file
+ -c      Cutoff Level:H/[M]/L
+ Means High, Middle or Low
+ -j Junction reads per junction requirement for each exon-isoform [5]
+ --noIRM Use the unadjusted inclusion ratios (before IRM correction)
+";
+
+if($RatioFile eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+if($CutoffLevel ne "H" and  $CutoffLevel ne "M" and  $CutoffLevel ne "L")
+{
+ print $InputParaDes;
+ exit;
+}
+if($noIRM)
+{
+ $noIRMstr= "noirm";
+}
+
+
+system("perl $SrcFolder/ApplyCutoff.jie.pl $RatioFile $CutoffLevel $JunctionCut  $noIRMstr >$OutputFile.raw");
+
+open(rawfile, "$OutputFile.raw");
+open(outfile, ">$OutputFile");
+while(my $line=<rawfile>)
+{
+ chomp($line);
+ my @a=split("\t",$line);
+ if($noIRM)
+ {
+ print outfile join("\t",$a[21],$a[1],$a[3],$a[4],$a[5],$a[6],$a[7],$a[11],$a[12],$a[13],$a[14]),"\n";
+ }
+ else
+ {
+ print outfile join("\t",$a[21],$a[2],$a[3],$a[4],$a[5],$a[6],$a[7],$a[11],$a[12],$a[13],$a[14]),"\n";
+ }
+}
+close(outfile);
+close(rawfile);
b
diff -r d4ca551ca300 -r adc0f7765d85 SpliceChange
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SpliceChange Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,176 @@
+#!/usr/bin/perl
+my $SrcFolder="/home/galaxy/galaxy-dist/tools/SpliceTrap.0.90.1/bin";
+# compare two outputs from SpliceTrap
+use strict;
+
+# the information needed
+# inclusion ratio input file
+# filtered out or not input file
+# minimal inclusion ratio at least 0.1 for one condition
+# minimal splicing changes parameter
+# orignial pipeline written by Martin Akerman
+# re-organized and re-written by Jie Wu
+
+use FileHandle;
+
+use Getopt::Long;
+
+my @programs = ('grep','mkdir','R','paste','awk','sort');
+foreach my $program (@programs)
+{
+        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+}
+
+
+my $InputFileName1 = "";
+my $InputFileName2 = "";
+my $OutputFileName = "";
+my $minchange = 0.3;
+my $mininc = 0.1;
+my $noIRM = 0;
+
+
+GetOptions (
+ "1:s"=>\$InputFileName1,
+ "2:s"=>\$InputFileName2,
+ "o:s"=>\$OutputFileName,
+ "noIRM|noirm"=>\$noIRM,
+ "m:f"=>\$mininc,
+ "c:f"=>\$minchange
+);
+
+my $InputParaDes=" Usage of the script:
+ -1 input file 1, output from SpliceTrap, *.raw file in the output folder 
+ -2 input file 2. see above.
+ -o output file prefix.
+ -c minimal change required, [default:0.3]
+ -m minimal inclusion ratio for at least one condition. [defualt:0.1]
+ --noIRM Use the unadjusted inclusion ratios (before IRM correction)
+";
+
+if($InputFileName1 eq "" or $InputFileName2 eq "" or $OutputFileName eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+
+
+if(-d "$OutputFileName.cache" )
+{
+ print "Aborted! output cache folder exists: $OutputFileName.cache \n";
+ exit;
+}
+else
+{
+ system("mkdir $OutputFileName.cache");
+}
+
+#
+my %ir1; # records ir from file1
+my %ir2; # records ir from file2
+# only records trios above the cutoffs
+
+open(input1, $InputFileName1) or die "$InputFileName1 open error!\n";
+while(my $line=<input1>)
+{
+ chomp($line);
+ my @a = split("\t", $line);
+ if($a[21] ne "na")
+ {
+ if($noIRM)
+ {
+ $ir1{$a[0]} = $a[1];
+ }
+ else
+ {
+ $ir1{$a[0]} = $a[2];
+ }
+ }
+}
+print scalar(keys (%ir1) )," records loaded from $InputFileName1\n";
+close(input1);
+
+open(input2, $InputFileName2) or die "$InputFileName2 open error!\n";
+while(my $line=<input2>)
+{
+        chomp($line);
+        my @a = split("\t", $line);
+        if($a[21] ne "na")
+        {
+ if($noIRM)
+ {
+ $ir2{$a[0]} = $a[1];
+ }
+ else
+ {
+                 $ir2{$a[0]} = $a[2];
+ }
+        }
+}
+print scalar(keys (%ir2) )," records loaded from $InputFileName2\n";
+
+
+close(input2);
+
+
+##
+my %mean;
+my %sd;
+
+my %num;
+
+my %filehandles;

+my @types = ("CA", "IR", "AD","AA");
+
+foreach my $type (@types)
+{
+ my $fh = new FileHandle;
+ open($fh, ">$OutputFileName.cache/$type") or die "Cannot open $OutputFileName.cache/$type\n";
+ $filehandles{$type} = $fh;
+}
+
+
+foreach my $key (keys %ir1)
+{
+ if(exists $ir2{$key})
+ {
+ if(($ir1{$key} + $ir2{$key}) > 0)
+ {
+ #find the type
+ my $type = substr($key, 0, 2);
+ $type = "CA" if $type eq "CS";
+ $num{$type}++;
+
+ my $change = ($ir2{$key} - $ir1{$key})/ ($ir1{$key} + $ir2{$key});
+ $mean{$type} = $mean{$type} + $change;
+ $sd{$type} = $change*$change + $sd{$type};
+
+ $change = sprintf("%.4f",$change);
+
+ my $fout =  $filehandles{$type};
+ print $fout $key,"\t",$ir1{$key},"\t",$ir2{$key},"\t",$change,"\n";
+ }
+ }
+}
+
+foreach my $type (keys %filehandles)
+{
+ close($filehandles{$type});
+ if($num{$type} == 0)
+ {
+ warn  "no AS events passed filters for both files\n";
+ next;
+ }
+ $mean{$type} = $mean{$type}/$num{$type};
+ $sd{$type} = sqrt($sd{$type}/$num{$type});
+ system("R  --slave --args  $OutputFileName.cache/$type $mean{$type} $sd{$type} $num{$type} <$SrcFolder/calc_pval.R");
+ system("paste $OutputFileName.cache/$type $OutputFileName.cache/$type.p |awk '(\$2>$mininc||\$3>$mininc)&&(\$4>$minchange||\$4<-$minchange)' |sort -k4nr >$OutputFileName.$type.report");
+ print "$num{$type} $type events processed...\n";
+ #print $mean{$type},"\t",  $sd{$type} ,"\t",$num{$type},"\n";
+
+}
+system("rm $OutputFileName.cache -rf");
+
+
b
diff -r d4ca551ca300 -r adc0f7765d85 SpliceTrap.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SpliceTrap.pl Thu Sep 07 15:06:58 2017 -0400
[
b'@@ -0,0 +1,285 @@\n+#!/usr/bin/perl\n+# Author: wuj@cshl.edu\n+# Modified: Baekdoo Kim (baegi7942@gmail.com)\n+use strict;\n+use Getopt::Long;\n+use Data::Dumper;\n+####################\n+use Cwd;\n+my $PROG = $0;\n+my $CUR_DIR = Cwd::abs_path(Cwd::cwd());\n+my $PROG_ABS_PATH = Cwd::abs_path($PROG);\n+#my $SrcFolder=`dirname $PROG_ABS_PATH`;\n+#chomp($SrcFolder);\n+#my %config=do "$ENV{HOME}/.SpliceTrap.pl.ini";\n+#my $SrcFolder=$config{SrcFolder};\n+\n+my @programs = (\'R\',\'echo\',\'cat\',\'bash\',\'perl\',\'ln\',\'mkdir\',\'paste\',\'grep\',\'sort\',\'basename\',\'awk\',\'wc\',\'mv\',\'cd\',\'rm\',\'split\',\'head\' );\n+foreach my $program (@programs)\n+{\n+        die ("CHECK: $program not found\\n") if(system("hash $program >/dev/null"));\n+\n+}\n+\n+####################\n+my $SrcFolder="";\n+my $MapSoftware="bowtie";\n+my $DatabasePrefix="hg38";\n+my $ReadFileFormat="";\n+my $ReadFile1Name="";\n+my $ReadFile2Name="";\n+my $CutoffLevel="M";\n+my $Outputfolder=$CUR_DIR;\n+my $OutputPrefix="Result";\n+#my $CutoffOnly=0;\n+my $ReadSize=36;\n+my $JunctionCut=5;\n+my $onGalaxy_raw="";\n+my $onGalaxy_txt="";\n+my $BowtieThreads=1;\n+my $noIRMstr="";\n+my $noIRM = 0;\n+\n+my $num_args = $#ARGV;\n+$onGalaxy_raw = $ARGV[$num_args-1];\n+$onGalaxy_txt = $ARGV[$num_args];\n+\n+GetOptions (\n+\t"l:s"=>\\$SrcFolder,\n+        "m:s"=>\\$MapSoftware,\n+        "d:s"=>\\$DatabasePrefix,\n+#       "f:s"=>\\$ReadFileFormat,\n+        "1:s"=>\\$ReadFile1Name,\n+        "2:s"=>\\$ReadFile2Name,\n+        "c:s"=>\\$CutoffLevel,\n+        "outdir:s"=>\\$Outputfolder,\n+        "o:s"=>\\$OutputPrefix,\n+        "j:i"=>\\$JunctionCut,\n+        "s:i"=>\\$ReadSize,\n+        "p:i"=>\\$BowtieThreads,\n+        "noIRM|noirm"=>\\$noIRM\n+#       "local:s"=>\\$local,\n+#       "rerun"=>\\$CutoffOnly\n+);\n+#-O for galaxy output\n+\n+\n+my $InputParaDes="      Usage of the script:\n+\t-l\tBase Location (required)\n+        -m      Mapping software: [bowtie]/rmap\n+        -d      Database prefix: [hg18]/mm9/rn4/userdefined\n+        -1      Read File 1\n+        -2      Read File 2\n+        -c      Cutoff Level:H/[M]/L\n+                Means High, Middle or Low\n+        -j      Junction reads requirement per junction for each exon-isoform [5]\n+        -o      Output prefix {Result}\n+        -s      Read Size [36]\n+        --outdir Output folder [./]\n+        -p      Bowtie parameter, threads number, only use this when you don\'t use qsub [1]\n+        --noIRM Skip the IRM correction step\n+        \n+        This is a quick help, please refer to the README file for details.\n+";\n+\n+\n+if($SrcFolder eq "") {\n+\tprint "[CHECK] - Please provide the location of the script (option \'-l\')\\n\\n";\n+\texit;\n+}\n+\n+if($ReadFile2Name eq "")\n+{\n+        $ReadFile2Name = $ReadFile1Name;\n+        #trigger singled end mode\n+}\n+\n+if($ReadFile1Name eq "" or $ReadFile2Name eq "" )\n+{\n+        print $InputParaDes;\n+        exit;\n+}\n+\n+if($BowtieThreads < 1)\n+{\n+        print $InputParaDes;\n+        exit;\n+}\n+\n+if (! -e "$SrcFolder/db/$DatabasePrefix/parallel")\n+{\n+        print "CHECK: Error, the database you specified is not properly installed.\\n";\n+        #print $InputParaDes;\n+        exit;\n+\n+}\n+\n+if($CutoffLevel ne "H" and  $CutoffLevel ne "M" and  $CutoffLevel ne "L")\n+{\n+        print $InputParaDes;\n+        exit;\n+}\n+\n+\n+$ReadFile1Name = Cwd::abs_path($ReadFile1Name);\n+$ReadFile2Name = Cwd::abs_path($ReadFile2Name);\n+\n+#check the files\n+open(check,$ReadFile1Name) or die ("CHECK: Error when opening $ReadFile1Name\\n");\n+my $checkoneline = <check>;\n+if(substr($checkoneline,0,1) eq ">")\n+{\n+        $ReadFileFormat = "fasta";\n+}\n+elsif(substr($checkoneline,0,1) eq "@")\n+{\n+        $ReadFileFormat = "fastq";\n+}\n+else\n+{\n+        die("CHECK: ERROR:Please check $ReadFile1Name\\n");\n+}\n+close(check);\n+\n+open(check,$ReadFile2Name) or die ("CHECK: Error when opening $ReadFile2Name\\n");\n+my $checkoneline = <check>;\n+if(substr($checkoneline,0,1) eq ">")\n+{\n+        die("CHECK: $ReadFile2Name has a different format as $ReadFile1Name\\n") if ($ReadFileFormat ne "fasta");\n+}\n+elsif(substr($checkoneline,0,1) eq "'..b'print "CHECK: checking rmap...\\n";\n+        if(system("type rmap &>/dev/null") ==0 )\n+        {\n+                print "CHECK: rmap found, continue\\n";\n+        }\n+        else\n+        {\n+                die "CHECK: No rmap found in PATH, EXIT!\\n";\n+        }\n+}\n+else\n+{\n+        die "CHECK: option -m only takes rmap or bowtie as inputs\\n";\n+}\n+\n+if($ReadSize == 0)\n+{\n+        die "CHECK: Please check option -s Read size\\n";\n+}\n+\n+if($noIRM)\n+{\n+        $noIRMstr= "noirm";\n+}\n+\n+#write more checks later\n+print "PARAMETERS:\\tMapping software:  ",$MapSoftware,"\\n";\n+print "PARAMETERS:\\tDatabase prefix:   ",$DatabasePrefix,"\\n";\n+print "PARAMETERS:\\tRead end 1:        ",$ReadFile1Name,"\\n";\n+print "PARAMETERS:\\tRead end 2:        ",$ReadFile2Name,"\\n" if($ReadFile2Name ne $ReadFile1Name);\n+print "PARAMETERS:\\tGalaxy_raw:        ",$onGalaxy_raw,"\\n"; #if($onGalaxy_raw ne "");\n+print "PARAMETERS:\\tGalaxy_txt:        ",$onGalaxy_txt,"\\n"; #if($onGalaxy_txt ne "");\n+print "PARAMETERS:\\tCutoff level:      ",$CutoffLevel,"\\n";\n+print "PARAMETERS:\\tJunction reads.min:",$JunctionCut,"\\n";\n+print "PARAMETERS:\\tOutput folder:     ",$Outputfolder,"\\n";\n+print "PARAMETERS:\\tOutput prefix:     ",$OutputPrefix,"\\n";\n+print "PARAMETERS:\\tRead size:         ",$ReadSize,"\\n";\n+print "PARAMETERS:\\tBowtie threads #:  ",$BowtieThreads,"\\n";\n+print "PARAMETERS:\\tNo IRM.\\n" if ($noIRM);\n+\n+if($MapSoftware eq "bowtie")\n+{\n+        print "=================STAGE 1 MAPPING===================\\n";\n+        system("bash $SrcFolder/bin/mapping_bowtie.sh $ReadFile1Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder $BowtieThreads");\n+        system("bash $SrcFolder/bin/mapping_bowtie.sh $ReadFile2Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder $BowtieThreads") if($ReadFile2Name ne $ReadFile1Name);\n+        print "=================STAGE 2 ESTIMATION================\\n";\n+        #  ratio, log, nums\n+        system("bash $SrcFolder/bin/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize $DatabasePrefix $Outputfolder $SrcFolder $noIRMstr") ;\n+        print "=================STAGE 3 CUTOFF====================\\n";\n+        #   raw\n+        system("bash $SrcFolder/bin/apply_cutoff.sh $OutputPrefix $CutoffLevel $Outputfolder $JunctionCut $SrcFolder $noIRMstr");\n+\n+\n+}\n+\n+if($MapSoftware eq "rmap")\n+{\n+        print "=================STAGE 1 MAPPING===================\\n";\n+\n+        system("bash $SrcFolder/bin/mapping_rmap.sh $ReadFile1Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder") ;\n+        system("bash $SrcFolder/bin/mapping_rmap.sh $ReadFile2Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder") if($ReadFile2Name ne $ReadFile1Name);\n+        print "=================STAGE 2 ESTIMATION================\\n";\n+\n+        system("bash $SrcFolder/bin/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize $DatabasePrefix $Outputfolder $SrcFolder $noIRMstr") ;\n+        print "=================STAGE 3 CUTOFF====================\\n";\n+        system("bash $SrcFolder/bin/apply_cutoff.sh $OutputPrefix $CutoffLevel $Outputfolder $JunctionCut $SrcFolder $noIRMstr");\n+\n+\n+}\n+\n+#print "============ALL DONE, OUTPUTFILE:$OutputPrefix.txt\\n";\n+\n+if($onGalaxy_raw ne "" && $onGalaxy_txt ne "")\n+{\n+        print "OUTPUTFILE:$OutputPrefix.raw\\n";\n+        system("grep -v na $Outputfolder/$OutputPrefix.raw >$onGalaxy_raw");\n+        print "OUTPUTFILE:$OutputPrefix.txt\\n";\n+        system("grep -v na $Outputfolder/$OutputPrefix.txt >$onGalaxy_txt");\n+}\n+\n+print "============Clean up\\n";\n+system("rm -r $Outputfolder/$OutputPrefix.*");\n+\n+sub random_sessid {\n+        #my @chars = (0..9,a..z,A..Z);\n+        my @chars = (\'a\'..\'z\',\'A\'..\'Z\');\n+        my $len = 10;\n+        my $string = join \'\', map {$chars[rand(@chars)]} (1..$len);\n+        return $string;\n+}                                                                                                                                            \n'
b
diff -r d4ca551ca300 -r adc0f7765d85 SpliceTrap.tar.gz
b
Binary file SpliceTrap.tar.gz has changed
b
diff -r d4ca551ca300 -r adc0f7765d85 TXdbgen
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TXdbgen Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,97 @@
+#!/usr/bin/perl
+my $SrcFolder="/home/galaxy/galaxy-dist/tools/SpliceTrap.0.90.1/bin";
+# this script is to generate TXdb database files from bed/gtf file
+
+use strict;
+use Cwd;
+use Getopt::Long;
+
+my @programs = ('split','bowtie-build','sort', 'uniq', 'ls','bash','rm','mv','cut','grep','echo');
+foreach my $program (@programs)
+{
+        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+
+}
+
+
+my $genomedir = "";
+
+my $annofilename = "";
+my $txdbname = "userdefined";
+my $knownonly = 0;
+my $gtfinput = 0;
+
+GetOptions (
+ "g:s"=>\$genomedir,
+ "a:s"=>\$annofilename,
+ "n:s"=>\$txdbname,
+ "gtf"=>\$gtfinput,
+ "knownonly"=>\$knownonly
+);
+
+my $InputParaDes="      Usage of the script:
+        -g      genome fasta file location
+ -a annotation file (bed/gtf)
+ -n txdb name
+ --gtf specify this if annotation file is in gtf format
+";
+
+if($genomedir eq "" or $annofilename eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+$genomedir = Cwd::abs_path($genomedir);
+$annofilename = Cwd::abs_path($annofilename);
+
+my $annofilebase = `basename $annofilename`;
+chomp($annofilebase);
+#need a cache folder to avoid mess
+
+my $cachefolder = $annofilebase.".cache";
+
+if (! -e $cachefolder)
+{
+ mkdir $cachefolder or die "TXDBGEN: could not create cache folder $cachefolder\n";
+}
+if($gtfinput)
+{
+ print "TXDBGEN: converting gtf file into bed format\n";
+ system ("perl $SrcFolder/gtf2bed.pl $annofilename >$cachefolder/$annofilebase.bed");
+ $annofilename = "$cachefolder/$annofilebase.bed";
+}
+
+
+print "TXDBGEN: scan $annofilename for AS events...\n";
+system("perl $SrcFolder/scanbed2txdb.pl $annofilename $cachefolder/TXdb.tmp");
+print "TXDBGEN: fetch sequences from $genomedir...\n";
+system("sort -k1,1 $cachefolder/TXdb.tmp >$cachefolder/TXdb.tmp.sort");
+#get fasta file list
+system("ls $genomedir/*.fa >$cachefolder/chr.list");
+
+system("perl $SrcFolder/get_bed_fa_j.pl $cachefolder/TXdb.tmp.sort $cachefolder/chr.list $cachefolder/out.bed $cachefolder/TXdb.fasta");
+
+print "TXDBGEN: generate files for parallel computing...\n";
+if (! -e "$cachefolder/parallel")
+{
+ mkdir "$cachefolder/parallel" or die "TXDBGEN: could not create $cachefolder/parallel\n";
+}
+system("grep L $cachefolder/out.bed >$cachefolder/TXdb.bed");
+system("rm $cachefolder/out.bed");
+system("sort $cachefolder/TXdb.tmp.evi >$cachefolder/TXdb.evi");
+system("rm $cachefolder/TXdb.tmp.evi");
+system("bash $SrcFolder/splitdb.sh $cachefolder/parallel");
+print "TXDBGEN: build Bowtie index...\n";
+
+if (! -e "$cachefolder/btw")
+{
+ mkdir "$cachefolder/btw" or die "TXDBGEN: could not create $cachefolder/btw\n";
+}
+system("bowtie-build $cachefolder/TXdb.fasta $cachefolder/btw/TXdb");
+system("rm $cachefolder/TXdb.tmp* $cachefolder/chr.list");
+print "TXDBGEN: Copy files to $SrcFolder/../db/$txdbname\n";
+
+system("mv $cachefolder $SrcFolder/../db/$txdbname");
+print "TXDBGEN: Done!\n";
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/ApplyCutoff.jie.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/ApplyCutoff.jie.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,108 @@
+#apply our cuttoff hash table on the IR calculated by Jie
+#Modified from Martin's code
+use strict;
+
+use Cwd;
+my $PROG = $0;
+my $CUR_DIR = Cwd::abs_path(Cwd::cwd());
+my $PROG_ABS_PATH = Cwd::abs_path($PROG);
+my $SrcFolder=`dirname $PROG_ABS_PATH`;
+chomp($SrcFolder);
+
+my %cutoff;
+my @Exlen;
+
+my $cutoff_level=$ARGV[1];
+my $JunctionCut = $ARGV[2];
+my $noirm = $ARGV[3];
+
+my $cutoff_level_index=7;
+
+ $cutoff_level_index=8 if $cutoff_level eq "H";
+$cutoff_level_index=6 if $cutoff_level eq "L";
+
+open(CUT,"$SrcFolder/../cutoffs/cutoff.pair.0".$cutoff_level_index.".txt") || die "cutoff file not found $!\n";
+
+while(<CUT>){
+ chomp;
+ my @a=split(/\t/,$_);
+ push @Exlen,$a[0];
+ $cutoff{$a[0]}=$a[1];
+}
+close(CUT);
+
+open(IN,$ARGV[0]);
+
+while(<IN>){
+ chomp;
+ my @a=split(/\t/,$_);
+ my $Ez='Ez=yes';
+ my $print=$_;
+ if($a[0]=~m/#/g){next}
+ my $eventid=substr($a[0],0,2);
+ my $bir =$a[2];
+ $bir =$a[1] if($noirm eq "noirm");
+ my $j12 = $a[8];
+ my $j23 = $a[9];
+ my $j13 = $a[10];
+ my $cov1=$a[11];
+ my $cov2=$a[12];
+ my $cov3=$a[13];
+ my $siz1=$a[15];
+ my $siz2=$a[16];
+ my $siz3=$a[17];
+
+
+ my $stat1='exon1='.cutoff($siz1,$cov1,\@Exlen,%cutoff);
+ my $stat2='exon2='.cutoff($siz2,$cov2,\@Exlen,%cutoff);
+ my $stat3='exon3='.cutoff($siz3,$cov3,\@Exlen,%cutoff);
+ if($stat1 eq "exon1=yes" and $stat3 eq "exon3=yes") 
+ {
+ #$Ez="passed";
+ $Ez=$eventid if $eventid eq "AA";
+ $Ez=$eventid if $eventid eq "AD";
+ $Ez=$eventid if $eventid eq "IR";
+ if ($eventid eq "CS" or $eventid eq "CA" or $eventid eq "ME") 
+ {
+ if($bir >0.9)
+ {
+ $Ez = "CS";
+ }
+ else
+ {
+ $Ez = "CA";
+ }
+ }
+
+ }
+ else
+ {
+ #$Ez="declined";
+ $Ez = "na";
+ }
+ if( ($j12<$JunctionCut or $j23<$JunctionCut) and $j13 <$JunctionCut)
+ {
+ $Ez = "na";
+ }
+ print $print,"\t",$stat1,"\t",$stat2,"\t",$stat3,"\t",$Ez,"\n";
+}
+close(IN);
+####################################################################
+
+sub cutoff{
+ my($s,$c,$E,%cutoff)=@_;
+ my @Exlen=@$E;
+ if($c eq 'NA'){return('NA')}
+ my $range=$Exlen[$#Exlen];
+ foreach my $l(@Exlen){if($s<$l){$range=$l;last}}
+        if($c<$cutoff{$range}){return('no')}
+ return('yes')
+}
+
+
+sub contain{
+ my ($a,@a)=@_;
+ foreach(@a){if($a eq $_){return(1)}}
+ return(0)
+}
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/Pair_estimate_c
b
Binary file bin/Pair_estimate_c has changed
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/PostAnalysis
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/PostAnalysis Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,66 @@
+#!/usr/bin/perl
+my $SrcFolder="/home/galaxy/galaxy-dist/tools/SpliceTrap.0.90.1/bin";
+# this script is a wrapup for Post analysis based on the ratio file output
+
+use strict;
+use Getopt::Long;
+my $RatioFile="";
+my $OutputFile = "";
+my $JunctionCut=5;
+my $CutoffLevel="M";
+my $noIRM = 0;
+my $noIRMstr="";
+
+GetOptions (
+ "i:s"=>\$RatioFile,
+ "o:s"=>\$OutputFile,
+ "c:s"=>\$CutoffLevel,
+ "noIRM|noirm"=>\$noIRM,
+ "j:i"=>\$JunctionCut
+);
+
+my $InputParaDes=" Usage of the script:
+ -i      input file (.ratio file)
+ -o      output file
+ -c      Cutoff Level:H/[M]/L
+ Means High, Middle or Low
+ -j Junction reads per junction requirement for each exon-isoform [5]
+ --noIRM Use the unadjusted inclusion ratios (before IRM correction)
+";
+
+if($RatioFile eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+if($CutoffLevel ne "H" and  $CutoffLevel ne "M" and  $CutoffLevel ne "L")
+{
+ print $InputParaDes;
+ exit;
+}
+if($noIRM)
+{
+ $noIRMstr= "noirm";
+}
+
+
+system("perl $SrcFolder/ApplyCutoff.jie.pl $RatioFile $CutoffLevel $JunctionCut  $noIRMstr >$OutputFile.raw");
+
+open(rawfile, "$OutputFile.raw");
+open(outfile, ">$OutputFile");
+while(my $line=<rawfile>)
+{
+ chomp($line);
+ my @a=split("\t",$line);
+ if($noIRM)
+ {
+ print outfile join("\t",$a[21],$a[1],$a[3],$a[4],$a[5],$a[6],$a[7],$a[11],$a[12],$a[13],$a[14]),"\n";
+ }
+ else
+ {
+ print outfile join("\t",$a[21],$a[2],$a[3],$a[4],$a[5],$a[6],$a[7],$a[11],$a[12],$a[13],$a[14]),"\n";
+ }
+}
+close(outfile);
+close(rawfile);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/PostAnalysis.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/PostAnalysis.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,64 @@
+# this script is a wrapup for Post analysis based on the ratio file output
+
+use strict;
+use Getopt::Long;
+my $RatioFile="";
+my $OutputFile = "";
+my $JunctionCut=5;
+my $CutoffLevel="M";
+my $noIRM = 0;
+my $noIRMstr="";
+
+GetOptions (
+ "i:s"=>\$RatioFile,
+ "o:s"=>\$OutputFile,
+ "c:s"=>\$CutoffLevel,
+ "noIRM|noirm"=>\$noIRM,
+ "j:i"=>\$JunctionCut
+);
+
+my $InputParaDes=" Usage of the script:
+ -i      input file (.ratio file)
+ -o      output file
+ -c      Cutoff Level:H/[M]/L
+ Means High, Middle or Low
+ -j Junction reads per junction requirement for each exon-isoform [5]
+ --noIRM Use the unadjusted inclusion ratios (before IRM correction)
+";
+
+if($RatioFile eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+if($CutoffLevel ne "H" and  $CutoffLevel ne "M" and  $CutoffLevel ne "L")
+{
+ print $InputParaDes;
+ exit;
+}
+if($noIRM)
+{
+ $noIRMstr= "noirm";
+}
+
+
+system("perl $SrcFolder/ApplyCutoff.jie.pl $RatioFile $CutoffLevel $JunctionCut  $noIRMstr >$OutputFile.raw");
+
+open(rawfile, "$OutputFile.raw");
+open(outfile, ">$OutputFile");
+while(my $line=<rawfile>)
+{
+ chomp($line);
+ my @a=split("\t",$line);
+ if($noIRM)
+ {
+ print outfile join("\t",$a[21],$a[1],$a[3],$a[4],$a[5],$a[6],$a[7],$a[11],$a[12],$a[13],$a[14]),"\n";
+ }
+ else
+ {
+ print outfile join("\t",$a[21],$a[2],$a[3],$a[4],$a[5],$a[6],$a[7],$a[11],$a[12],$a[13],$a[14]),"\n";
+ }
+}
+close(outfile);
+close(rawfile);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/SpliceChange
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/SpliceChange Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,176 @@
+#!/usr/bin/perl
+my $SrcFolder="/home/galaxy/galaxy-dist/tools/SpliceTrap.0.90.1/bin";
+# compare two outputs from SpliceTrap
+use strict;
+
+# the information needed
+# inclusion ratio input file
+# filtered out or not input file
+# minimal inclusion ratio at least 0.1 for one condition
+# minimal splicing changes parameter
+# orignial pipeline written by Martin Akerman
+# re-organized and re-written by Jie Wu
+
+use FileHandle;
+
+use Getopt::Long;
+
+my @programs = ('grep','mkdir','R','paste','awk','sort');
+foreach my $program (@programs)
+{
+        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+}
+
+
+my $InputFileName1 = "";
+my $InputFileName2 = "";
+my $OutputFileName = "";
+my $minchange = 0.3;
+my $mininc = 0.1;
+my $noIRM = 0;
+
+
+GetOptions (
+ "1:s"=>\$InputFileName1,
+ "2:s"=>\$InputFileName2,
+ "o:s"=>\$OutputFileName,
+ "noIRM|noirm"=>\$noIRM,
+ "m:f"=>\$mininc,
+ "c:f"=>\$minchange
+);
+
+my $InputParaDes=" Usage of the script:
+ -1 input file 1, output from SpliceTrap, *.raw file in the output folder 
+ -2 input file 2. see above.
+ -o output file prefix.
+ -c minimal change required, [default:0.3]
+ -m minimal inclusion ratio for at least one condition. [defualt:0.1]
+ --noIRM Use the unadjusted inclusion ratios (before IRM correction)
+";
+
+if($InputFileName1 eq "" or $InputFileName2 eq "" or $OutputFileName eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+
+
+if(-d "$OutputFileName.cache" )
+{
+ print "Aborted! output cache folder exists: $OutputFileName.cache \n";
+ exit;
+}
+else
+{
+ system("mkdir $OutputFileName.cache");
+}
+
+#
+my %ir1; # records ir from file1
+my %ir2; # records ir from file2
+# only records trios above the cutoffs
+
+open(input1, $InputFileName1) or die "$InputFileName1 open error!\n";
+while(my $line=<input1>)
+{
+ chomp($line);
+ my @a = split("\t", $line);
+ if($a[21] ne "na")
+ {
+ if($noIRM)
+ {
+ $ir1{$a[0]} = $a[1];
+ }
+ else
+ {
+ $ir1{$a[0]} = $a[2];
+ }
+ }
+}
+print scalar(keys (%ir1) )," records loaded from $InputFileName1\n";
+close(input1);
+
+open(input2, $InputFileName2) or die "$InputFileName2 open error!\n";
+while(my $line=<input2>)
+{
+        chomp($line);
+        my @a = split("\t", $line);
+        if($a[21] ne "na")
+        {
+ if($noIRM)
+ {
+ $ir2{$a[0]} = $a[1];
+ }
+ else
+ {
+                 $ir2{$a[0]} = $a[2];
+ }
+        }
+}
+print scalar(keys (%ir2) )," records loaded from $InputFileName2\n";
+
+
+close(input2);
+
+
+##
+my %mean;
+my %sd;
+
+my %num;
+
+my %filehandles;

+my @types = ("CA", "IR", "AD","AA");
+
+foreach my $type (@types)
+{
+ my $fh = new FileHandle;
+ open($fh, ">$OutputFileName.cache/$type") or die "Cannot open $OutputFileName.cache/$type\n";
+ $filehandles{$type} = $fh;
+}
+
+
+foreach my $key (keys %ir1)
+{
+ if(exists $ir2{$key})
+ {
+ if(($ir1{$key} + $ir2{$key}) > 0)
+ {
+ #find the type
+ my $type = substr($key, 0, 2);
+ $type = "CA" if $type eq "CS";
+ $num{$type}++;
+
+ my $change = ($ir2{$key} - $ir1{$key})/ ($ir1{$key} + $ir2{$key});
+ $mean{$type} = $mean{$type} + $change;
+ $sd{$type} = $change*$change + $sd{$type};
+
+ $change = sprintf("%.4f",$change);
+
+ my $fout =  $filehandles{$type};
+ print $fout $key,"\t",$ir1{$key},"\t",$ir2{$key},"\t",$change,"\n";
+ }
+ }
+}
+
+foreach my $type (keys %filehandles)
+{
+ close($filehandles{$type});
+ if($num{$type} == 0)
+ {
+ warn  "no AS events passed filters for both files\n";
+ next;
+ }
+ $mean{$type} = $mean{$type}/$num{$type};
+ $sd{$type} = sqrt($sd{$type}/$num{$type});
+ system("R  --slave --args  $OutputFileName.cache/$type $mean{$type} $sd{$type} $num{$type} <$SrcFolder/calc_pval.R");
+ system("paste $OutputFileName.cache/$type $OutputFileName.cache/$type.p |awk '(\$2>$mininc||\$3>$mininc)&&(\$4>$minchange||\$4<-$minchange)' |sort -k4nr >$OutputFileName.$type.report");
+ print "$num{$type} $type events processed...\n";
+ #print $mean{$type},"\t",  $sd{$type} ,"\t",$num{$type},"\n";
+
+}
+system("rm $OutputFileName.cache -rf");
+
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/SpliceChange.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/SpliceChange.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,174 @@
+# compare two outputs from SpliceTrap
+use strict;
+
+# the information needed
+# inclusion ratio input file
+# filtered out or not input file
+# minimal inclusion ratio at least 0.1 for one condition
+# minimal splicing changes parameter
+# orignial pipeline written by Martin Akerman
+# re-organized and re-written by Jie Wu
+
+use FileHandle;
+
+use Getopt::Long;
+
+my @programs = ('grep','mkdir','R','paste','awk','sort');
+foreach my $program (@programs)
+{
+        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+}
+
+
+my $InputFileName1 = "";
+my $InputFileName2 = "";
+my $OutputFileName = "";
+my $minchange = 0.3;
+my $mininc = 0.1;
+my $noIRM = 0;
+
+
+GetOptions (
+ "1:s"=>\$InputFileName1,
+ "2:s"=>\$InputFileName2,
+ "o:s"=>\$OutputFileName,
+ "noIRM|noirm"=>\$noIRM,
+ "m:f"=>\$mininc,
+ "c:f"=>\$minchange
+);
+
+my $InputParaDes=" Usage of the script:
+ -1 input file 1, output from SpliceTrap, *.raw file in the output folder 
+ -2 input file 2. see above.
+ -o output file prefix.
+ -c minimal change required, [default:0.3]
+ -m minimal inclusion ratio for at least one condition. [defualt:0.1]
+ --noIRM Use the unadjusted inclusion ratios (before IRM correction)
+";
+
+if($InputFileName1 eq "" or $InputFileName2 eq "" or $OutputFileName eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+
+
+if(-d "$OutputFileName.cache" )
+{
+ print "Aborted! output cache folder exists: $OutputFileName.cache \n";
+ exit;
+}
+else
+{
+ system("mkdir $OutputFileName.cache");
+}
+
+#
+my %ir1; # records ir from file1
+my %ir2; # records ir from file2
+# only records trios above the cutoffs
+
+open(input1, $InputFileName1) or die "$InputFileName1 open error!\n";
+while(my $line=<input1>)
+{
+ chomp($line);
+ my @a = split("\t", $line);
+ if($a[21] ne "na")
+ {
+ if($noIRM)
+ {
+ $ir1{$a[0]} = $a[1];
+ }
+ else
+ {
+ $ir1{$a[0]} = $a[2];
+ }
+ }
+}
+print scalar(keys (%ir1) )," records loaded from $InputFileName1\n";
+close(input1);
+
+open(input2, $InputFileName2) or die "$InputFileName2 open error!\n";
+while(my $line=<input2>)
+{
+        chomp($line);
+        my @a = split("\t", $line);
+        if($a[21] ne "na")
+        {
+ if($noIRM)
+ {
+ $ir2{$a[0]} = $a[1];
+ }
+ else
+ {
+                 $ir2{$a[0]} = $a[2];
+ }
+        }
+}
+print scalar(keys (%ir2) )," records loaded from $InputFileName2\n";
+
+
+close(input2);
+
+
+##
+my %mean;
+my %sd;
+
+my %num;
+
+my %filehandles;

+my @types = ("CA", "IR", "AD","AA");
+
+foreach my $type (@types)
+{
+ my $fh = new FileHandle;
+ open($fh, ">$OutputFileName.cache/$type") or die "Cannot open $OutputFileName.cache/$type\n";
+ $filehandles{$type} = $fh;
+}
+
+
+foreach my $key (keys %ir1)
+{
+ if(exists $ir2{$key})
+ {
+ if(($ir1{$key} + $ir2{$key}) > 0)
+ {
+ #find the type
+ my $type = substr($key, 0, 2);
+ $type = "CA" if $type eq "CS";
+ $num{$type}++;
+
+ my $change = ($ir2{$key} - $ir1{$key})/ ($ir1{$key} + $ir2{$key});
+ $mean{$type} = $mean{$type} + $change;
+ $sd{$type} = $change*$change + $sd{$type};
+
+ $change = sprintf("%.4f",$change);
+
+ my $fout =  $filehandles{$type};
+ print $fout $key,"\t",$ir1{$key},"\t",$ir2{$key},"\t",$change,"\n";
+ }
+ }
+}
+
+foreach my $type (keys %filehandles)
+{
+ close($filehandles{$type});
+ if($num{$type} == 0)
+ {
+ warn  "no AS events passed filters for both files\n";
+ next;
+ }
+ $mean{$type} = $mean{$type}/$num{$type};
+ $sd{$type} = sqrt($sd{$type}/$num{$type});
+ system("R  --slave --args  $OutputFileName.cache/$type $mean{$type} $sd{$type} $num{$type} <$SrcFolder/calc_pval.R");
+ system("paste $OutputFileName.cache/$type $OutputFileName.cache/$type.p |awk '(\$2>$mininc||\$3>$mininc)&&(\$4>$minchange||\$4<-$minchange)' |sort -k4nr >$OutputFileName.$type.report");
+ print "$num{$type} $type events processed...\n";
+ #print $mean{$type},"\t",  $sd{$type} ,"\t",$num{$type},"\n";
+
+}
+system("rm $OutputFileName.cache -rf");
+
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/SpliceTrap
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/SpliceTrap Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,263 @@
+#!/usr/bin/perl
+my $SrcFolder="/home/galaxy/galaxy-dist/tools/SpliceTrap.0.90.1/bin";
+# Author: wuj@cshl.edu
+use strict;
+use Getopt::Long;
+####################
+use Cwd;
+my $PROG = $0;
+my $CUR_DIR = Cwd::abs_path(Cwd::cwd());
+my $PROG_ABS_PATH = Cwd::abs_path($PROG);
+#my $SrcFolder=`dirname $PROG_ABS_PATH`;
+#chomp($SrcFolder);
+#my %config=do "$ENV{HOME}/.SpliceTrap.pl.ini";
+#my $SrcFolder=$config{SrcFolder};
+
+my @programs = ('R','echo','cat','bash','perl','ln','mkdir','paste','grep','sort','basename','awk','wc','mv','cd','rm','split','head' );
+foreach my $program (@programs)
+{
+ die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+
+}
+
+####################
+my $MapSoftware="bowtie";
+my $DatabasePrefix="hg18";
+my $ReadFileFormat="";
+my $ReadFile1Name="";
+my $ReadFile2Name="";
+my $CutoffLevel="M";
+my $Outputfolder=$CUR_DIR;
+my $OutputPrefix="Result";
+#my $CutoffOnly=0;
+my $ReadSize=36;
+my $JunctionCut=5;
+my $onGalaxy="";
+my $BowtieThreads=1;
+my $noIRMstr="";
+my $noIRM = 0;
+
+GetOptions (
+ "m:s"=>\$MapSoftware,
+ "d:s"=>\$DatabasePrefix,
+# "f:s"=>\$ReadFileFormat,
+ "1:s"=>\$ReadFile1Name,
+ "2:s"=>\$ReadFile2Name,
+ "c:s"=>\$CutoffLevel,
+ "outdir:s"=>\$Outputfolder,
+ "o:s"=>\$OutputPrefix,
+ "j:i"=>\$JunctionCut,
+ "s:i"=>\$ReadSize,
+ "p:i"=>\$BowtieThreads,
+ "noIRM|noirm"=>\$noIRM,
+ "g:s"=>\$onGalaxy
+# "local:s"=>\$local,
+# "rerun"=>\$CutoffOnly
+);
+#-O for galaxy output
+
+
+my $InputParaDes=" Usage of the script:
+ -m Mapping software: [bowtie]/rmap
+ -d Database prefix: [hg18]/mm9/rn4/userdefined
+ -1 Read File 1
+ -2 Read File 2
+ -c Cutoff Level:H/[M]/L
+ Means High, Middle or Low
+ -j Junction reads requirement per junction for each exon-isoform [5]
+ -o Output prefix {Result}
+ -s Read Size [36]
+ --outdir Output folder [./]
+ -p Bowtie parameter, threads number, only use this when you don't use qsub [1]
+ --noIRM Skip the IRM correction step
+
+ This is a quick help, please refer to the README file for details.
+";
+
+if($ReadFile2Name eq "")
+{
+ $ReadFile2Name = $ReadFile1Name;
+ #trigger singled end mode
+}
+
+if($ReadFile1Name eq "" or $ReadFile2Name eq "" )
+{
+ print $InputParaDes;
+ exit;
+}
+
+if($BowtieThreads < 1)
+{
+ print $InputParaDes;
+ exit;
+}
+
+if (! -e "$SrcFolder/../db/$DatabasePrefix/parallel")
+{
+ print "CHECK: Error, the database you specified is not properly installed.\n";
+ #print $InputParaDes;
+ exit;
+
+}
+
+if($CutoffLevel ne "H" and  $CutoffLevel ne "M" and  $CutoffLevel ne "L")
+{
+ print $InputParaDes;
+        exit;
+}
+
+$ReadFile1Name = Cwd::abs_path($ReadFile1Name);
+$ReadFile2Name = Cwd::abs_path($ReadFile2Name);
+
+#check the files
+open(check,$ReadFile1Name) or die ("CHECK: Error when opening $ReadFile1Name\n");
+my $checkoneline = <check>;
+if(substr($checkoneline,0,1) eq ">")
+{
+ $ReadFileFormat = "fasta";
+}
+elsif(substr($checkoneline,0,1) eq "@")
+{
+ $ReadFileFormat = "fastq";
+}
+else
+{
+ die("CHECK: ERROR:Please check $ReadFile1Name\n");
+}
+close(check);
+
+open(check,$ReadFile2Name) or die ("CHECK: Error when opening $ReadFile2Name\n");
+my $checkoneline = <check>;
+if(substr($checkoneline,0,1) eq ">")
+{
+        die("CHECK: $ReadFile2Name has a different format as $ReadFile1Name\n") if ($ReadFileFormat ne "fasta");
+}
+elsif(substr($checkoneline,0,1) eq "@")
+{
+        die("CHECK: $ReadFile2Name has a different format as $ReadFile1Name\n") if ($ReadFileFormat ne "fastq");
+}
+else
+{
+        die("CHECK: ERROR:Please check $ReadFile2Name\n");
+}
+close(check);
+
+$Outputfolder= Cwd::abs_path($Outputfolder);
+if($Outputfolder eq "/tmp")
+{
+        while(-e $Outputfolder)
+        {
+                my $random_foldername = random_sessid();
+                $Outputfolder = "/tmp/".$random_foldername;
+        }
+}
+
+
+if(! -e $Outputfolder)
+{
+ mkdir $Outputfolder or die "CHECK: cannot mkdir $Outputfolder\n";
+}
+if(! -d $Outputfolder)
+{
+ die "CHECK: $Outputfolder is not a folder\n";
+}
+
+if($MapSoftware eq "bowtie")
+{
+ print "CHECK: whether bowtie installed and in PATH\n";
+ my $bowtiechecker=`bowtie --version`;
+ if($bowtiechecker ne "")
+ {
+ print "CHECK: bowtie found, information below:\n";
+ print $bowtiechecker,"\n";
+ }
+ else
+ {
+ die "CHECK: No bowtie found in PATH, EXIT!\n";
+ }
+}
+elsif($MapSoftware eq "rmap")
+{
+ print "CHECK: checking rmap...\n";
+ if(system("type rmap &>/dev/null") ==0 )
+ {
+ print "CHECK: rmap found, continue\n";
+ }
+ else
+ {
+ die "CHECK: No rmap found in PATH, EXIT!\n";
+ }
+}
+else
+{
+ die "CHECK: option -m only takes rmap or bowtie as inputs\n";
+}
+
+if($ReadSize == 0)
+{
+ die "CHECK: Please check option -s Read size\n";
+}
+
+if($noIRM)
+{
+ $noIRMstr= "noirm";
+}
+
+#write more checks later
+print "PARAMETERS:\tMapping software:  ",$MapSoftware,"\n";
+print "PARAMETERS:\tDatabase prefix:   ",$DatabasePrefix,"\n";
+print "PARAMETERS:\tRead end 1:        ",$ReadFile1Name,"\n";
+print "PARAMETERS:\tRead end 2:        ",$ReadFile2Name,"\n" if($ReadFile2Name ne $ReadFile1Name);
+print "PARAMETERS:\tCutoff level:      ",$CutoffLevel,"\n";
+print "PARAMETERS:\tJunction reads.min:",$JunctionCut,"\n";
+print "PARAMETERS:\tOutput folder:     ",$Outputfolder,"\n";
+print "PARAMETERS:\tOutput prefix:     ",$OutputPrefix,"\n";
+print "PARAMETERS:\tRead size:         ",$ReadSize,"\n";
+print "PARAMETERS:\tBowtie threads #:  ",$BowtieThreads,"\n";
+print "PARAMETERS:\tNo IRM.\n" if ($noIRM);
+
+if($MapSoftware eq "bowtie")
+{
+ print "=================STAGE 1 MAPPING===================\n";
+ system("bash $SrcFolder/mapping_bowtie.sh $ReadFile1Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder $BowtieThreads");
+        system("bash $SrcFolder/mapping_bowtie.sh $ReadFile2Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder $BowtieThreads") if($ReadFile2Name ne $ReadFile1Name);
+ print "=================STAGE 2 ESTIMATION================\n";
+
+ system("bash $SrcFolder/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize $DatabasePrefix $Outputfolder $SrcFolder $noIRMstr") ;
+ print "=================STAGE 3 CUTOFF====================\n";
+        system("bash $SrcFolder/apply_cutoff.sh $OutputPrefix $CutoffLevel $Outputfolder $JunctionCut $SrcFolder $noIRMstr");
+
+
+}
+
+if($MapSoftware eq "rmap")
+{
+ print "=================STAGE 1 MAPPING===================\n";
+
+        system("bash $SrcFolder/mapping_rmap.sh $ReadFile1Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder") ;
+        system("bash $SrcFolder/mapping_rmap.sh $ReadFile2Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder") if($ReadFile2Name ne $ReadFile1Name);
+ print "=================STAGE 2 ESTIMATION================\n";
+
+        system("bash $SrcFolder/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize $DatabasePrefix $Outputfolder $SrcFolder $noIRMstr") ;
+ print "=================STAGE 3 CUTOFF====================\n";
+        system("bash $SrcFolder/apply_cutoff.sh $OutputPrefix $CutoffLevel $Outputfolder $JunctionCut $SrcFolder $noIRMstr");
+
+
+}
+
+print "============ALL DONE, OUTPUTFILE:$OutputPrefix.txt\n";
+
+if($onGalaxy ne "")
+{
+        system("grep -v na $Outputfolder/$OutputPrefix.txt >$onGalaxy");
+}
+
+sub random_sessid
+{
+        #my @chars = (0..9,a..z,A..Z);
+        my @chars = ('a'..'z','A'..'Z');
+        my $len = 10;
+        my $string = join '', map {$chars[rand(@chars)]} (1..$len);
+        return $string;
+     }
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/SpliceTrap.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/SpliceTrap.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,261 @@
+# Author: wuj@cshl.edu
+use strict;
+use Getopt::Long;
+####################
+use Cwd;
+my $PROG = $0;
+my $CUR_DIR = Cwd::abs_path(Cwd::cwd());
+my $PROG_ABS_PATH = Cwd::abs_path($PROG);
+#my $SrcFolder=`dirname $PROG_ABS_PATH`;
+#chomp($SrcFolder);
+#my %config=do "$ENV{HOME}/.SpliceTrap.pl.ini";
+#my $SrcFolder=$config{SrcFolder};
+
+my @programs = ('R','echo','cat','bash','perl','ln','mkdir','paste','grep','sort','basename','awk','wc','mv','cd','rm','split','head' );
+foreach my $program (@programs)
+{
+ die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+
+}
+
+####################
+my $MapSoftware="bowtie";
+my $DatabasePrefix="hg18";
+my $ReadFileFormat="";
+my $ReadFile1Name="";
+my $ReadFile2Name="";
+my $CutoffLevel="M";
+my $Outputfolder=$CUR_DIR;
+my $OutputPrefix="Result";
+#my $CutoffOnly=0;
+my $ReadSize=36;
+my $JunctionCut=5;
+my $onGalaxy="";
+my $BowtieThreads=1;
+my $noIRMstr="";
+my $noIRM = 0;
+
+GetOptions (
+ "m:s"=>\$MapSoftware,
+ "d:s"=>\$DatabasePrefix,
+# "f:s"=>\$ReadFileFormat,
+ "1:s"=>\$ReadFile1Name,
+ "2:s"=>\$ReadFile2Name,
+ "c:s"=>\$CutoffLevel,
+ "outdir:s"=>\$Outputfolder,
+ "o:s"=>\$OutputPrefix,
+ "j:i"=>\$JunctionCut,
+ "s:i"=>\$ReadSize,
+ "p:i"=>\$BowtieThreads,
+ "noIRM|noirm"=>\$noIRM,
+ "g:s"=>\$onGalaxy
+# "local:s"=>\$local,
+# "rerun"=>\$CutoffOnly
+);
+#-O for galaxy output
+
+
+my $InputParaDes=" Usage of the script:
+ -m Mapping software: [bowtie]/rmap
+ -d Database prefix: [hg18]/mm9/rn4/userdefined
+ -1 Read File 1
+ -2 Read File 2
+ -c Cutoff Level:H/[M]/L
+ Means High, Middle or Low
+ -j Junction reads requirement per junction for each exon-isoform [5]
+ -o Output prefix {Result}
+ -s Read Size [36]
+ --outdir Output folder [./]
+ -p Bowtie parameter, threads number, only use this when you don't use qsub [1]
+ --noIRM Skip the IRM correction step
+
+ This is a quick help, please refer to the README file for details.
+";
+
+if($ReadFile2Name eq "")
+{
+ $ReadFile2Name = $ReadFile1Name;
+ #trigger singled end mode
+}
+
+if($ReadFile1Name eq "" or $ReadFile2Name eq "" )
+{
+ print $InputParaDes;
+ exit;
+}
+
+if($BowtieThreads < 1)
+{
+ print $InputParaDes;
+ exit;
+}
+
+if (! -e "$SrcFolder/../db/$DatabasePrefix/parallel")
+{
+ print "CHECK: Error, the database you specified is not properly installed.\n";
+ #print $InputParaDes;
+ exit;
+
+}
+
+if($CutoffLevel ne "H" and  $CutoffLevel ne "M" and  $CutoffLevel ne "L")
+{
+ print $InputParaDes;
+        exit;
+}
+
+$ReadFile1Name = Cwd::abs_path($ReadFile1Name);
+$ReadFile2Name = Cwd::abs_path($ReadFile2Name);
+
+#check the files
+open(check,$ReadFile1Name) or die ("CHECK: Error when opening $ReadFile1Name\n");
+my $checkoneline = <check>;
+if(substr($checkoneline,0,1) eq ">")
+{
+ $ReadFileFormat = "fasta";
+}
+elsif(substr($checkoneline,0,1) eq "@")
+{
+ $ReadFileFormat = "fastq";
+}
+else
+{
+ die("CHECK: ERROR:Please check $ReadFile1Name\n");
+}
+close(check);
+
+open(check,$ReadFile2Name) or die ("CHECK: Error when opening $ReadFile2Name\n");
+my $checkoneline = <check>;
+if(substr($checkoneline,0,1) eq ">")
+{
+        die("CHECK: $ReadFile2Name has a different format as $ReadFile1Name\n") if ($ReadFileFormat ne "fasta");
+}
+elsif(substr($checkoneline,0,1) eq "@")
+{
+        die("CHECK: $ReadFile2Name has a different format as $ReadFile1Name\n") if ($ReadFileFormat ne "fastq");
+}
+else
+{
+        die("CHECK: ERROR:Please check $ReadFile2Name\n");
+}
+close(check);
+
+$Outputfolder= Cwd::abs_path($Outputfolder);
+if($Outputfolder eq "/tmp")
+{
+        while(-e $Outputfolder)
+        {
+                my $random_foldername = random_sessid();
+                $Outputfolder = "/tmp/".$random_foldername;
+        }
+}
+
+
+if(! -e $Outputfolder)
+{
+ mkdir $Outputfolder or die "CHECK: cannot mkdir $Outputfolder\n";
+}
+if(! -d $Outputfolder)
+{
+ die "CHECK: $Outputfolder is not a folder\n";
+}
+
+if($MapSoftware eq "bowtie")
+{
+ print "CHECK: whether bowtie installed and in PATH\n";
+ my $bowtiechecker=`bowtie --version`;
+ if($bowtiechecker ne "")
+ {
+ print "CHECK: bowtie found, information below:\n";
+ print $bowtiechecker,"\n";
+ }
+ else
+ {
+ die "CHECK: No bowtie found in PATH, EXIT!\n";
+ }
+}
+elsif($MapSoftware eq "rmap")
+{
+ print "CHECK: checking rmap...\n";
+ if(system("type rmap &>/dev/null") ==0 )
+ {
+ print "CHECK: rmap found, continue\n";
+ }
+ else
+ {
+ die "CHECK: No rmap found in PATH, EXIT!\n";
+ }
+}
+else
+{
+ die "CHECK: option -m only takes rmap or bowtie as inputs\n";
+}
+
+if($ReadSize == 0)
+{
+ die "CHECK: Please check option -s Read size\n";
+}
+
+if($noIRM)
+{
+ $noIRMstr= "noirm";
+}
+
+#write more checks later
+print "PARAMETERS:\tMapping software:  ",$MapSoftware,"\n";
+print "PARAMETERS:\tDatabase prefix:   ",$DatabasePrefix,"\n";
+print "PARAMETERS:\tRead end 1:        ",$ReadFile1Name,"\n";
+print "PARAMETERS:\tRead end 2:        ",$ReadFile2Name,"\n" if($ReadFile2Name ne $ReadFile1Name);
+print "PARAMETERS:\tCutoff level:      ",$CutoffLevel,"\n";
+print "PARAMETERS:\tJunction reads.min:",$JunctionCut,"\n";
+print "PARAMETERS:\tOutput folder:     ",$Outputfolder,"\n";
+print "PARAMETERS:\tOutput prefix:     ",$OutputPrefix,"\n";
+print "PARAMETERS:\tRead size:         ",$ReadSize,"\n";
+print "PARAMETERS:\tBowtie threads #:  ",$BowtieThreads,"\n";
+print "PARAMETERS:\tNo IRM.\n" if ($noIRM);
+
+if($MapSoftware eq "bowtie")
+{
+ print "=================STAGE 1 MAPPING===================\n";
+ system("bash $SrcFolder/mapping_bowtie.sh $ReadFile1Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder $BowtieThreads");
+        system("bash $SrcFolder/mapping_bowtie.sh $ReadFile2Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder $BowtieThreads") if($ReadFile2Name ne $ReadFile1Name);
+ print "=================STAGE 2 ESTIMATION================\n";
+
+ system("bash $SrcFolder/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize $DatabasePrefix $Outputfolder $SrcFolder $noIRMstr") ;
+ print "=================STAGE 3 CUTOFF====================\n";
+        system("bash $SrcFolder/apply_cutoff.sh $OutputPrefix $CutoffLevel $Outputfolder $JunctionCut $SrcFolder $noIRMstr");
+
+
+}
+
+if($MapSoftware eq "rmap")
+{
+ print "=================STAGE 1 MAPPING===================\n";
+
+        system("bash $SrcFolder/mapping_rmap.sh $ReadFile1Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder") ;
+        system("bash $SrcFolder/mapping_rmap.sh $ReadFile2Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder") if($ReadFile2Name ne $ReadFile1Name);
+ print "=================STAGE 2 ESTIMATION================\n";
+
+        system("bash $SrcFolder/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize $DatabasePrefix $Outputfolder $SrcFolder $noIRMstr") ;
+ print "=================STAGE 3 CUTOFF====================\n";
+        system("bash $SrcFolder/apply_cutoff.sh $OutputPrefix $CutoffLevel $Outputfolder $JunctionCut $SrcFolder $noIRMstr");
+
+
+}
+
+print "============ALL DONE, OUTPUTFILE:$OutputPrefix.txt\n";
+
+if($onGalaxy ne "")
+{
+        system("grep -v na $Outputfolder/$OutputPrefix.txt >$onGalaxy");
+}
+
+sub random_sessid
+{
+        #my @chars = (0..9,a..z,A..Z);
+        my @chars = ('a'..'z','A'..'Z');
+        my $len = 10;
+        my $string = join '', map {$chars[rand(@chars)]} (1..$len);
+        return $string;
+     }
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/SpliceTrap_measure.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/SpliceTrap_measure.pl Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,111 @@
+# Author: wuj@cshl.edu
+use strict;
+use Getopt::Long;
+####################
+my %config=do "$ENV{HOME}/.SpliceTrap.pl.ini";
+my $SrcFolder=$config{SrcFolder};
+#my $SrcFolder="/data/zhang/wuj/tools/SpliceTrap.0.8";
+####################
+my $MapSoftware="eland";
+my $ReadFileFormat="";
+my $ReadFile1Name="";
+my $ReadFile2Name="";
+my $CutoffLevel="H";
+my $OutputPrefix="Result";
+my $CutoffOnly=0;
+my $ReadSize=36;
+
+GetOptions (
+ "m:s"=>\$MapSoftware,
+ "f:s"=>\$ReadFileFormat,
+ "1:s"=>\$ReadFile1Name,
+ "2:s"=>\$ReadFile2Name,
+ "c:s"=>\$CutoffLevel,
+ "o:s"=>\$OutputPrefix,
+ "s:i"=>\$ReadSize,
+# "local:s"=>\$local,
+ "rerun"=>\$CutoffOnly
+);
+
+
+my $InputParaDes=" Usage of the script (v0.82):
+ -m Mapping software: eland/bowtie/rmap
+ -f Read File Format: fasta/fastq
+ -1 Read File 1
+ -2 Read File 2
+ -c Cutoff Level:H/M/L
+ Means High, Middle or Low
+ -o Output prefix
+ -s Read Size 36
+ --rerun Only run the last step, which is filtering
+";
+
+if($ReadFile1Name eq "" or $ReadFile2Name eq "" or $ReadFileFormat eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+if($ReadFileFormat ne "fastq" and $ReadFileFormat ne "fasta")
+{
+        print $InputParaDes;
+        exit;
+
+}
+
+if($CutoffLevel ne "H" and  $CutoffLevel ne "M" and  $CutoffLevel ne "L")
+{
+ print $InputParaDes;
+        exit;
+}
+
+my $dirname1=`dirname $ReadFile1Name`;
+my $dirname2=`dirname $ReadFile2Name`;
+if($dirname1 ne ".")
+{
+         system("ln -s $ReadFile1Name ./");
+}
+if($dirname2 ne ".")
+{
+         system("ln -s $ReadFile2Name ./");
+}
+
+$ReadFile1Name = `basename $ReadFile1Name`;
+chomp($ReadFile1Name);
+$ReadFile2Name = `basename $ReadFile2Name`;
+chomp($ReadFile2Name);
+my $start = time;
+if($MapSoftware eq "bowtie")
+{
+ print "=================STAGE 1 MAPPING===================\n";
+ system("bash $SrcFolder/mapping_bowtie.sh $ReadFile1Name $ReadFileFormat") if not $CutoffOnly;
+        system("bash $SrcFolder/mapping_bowtie.sh $ReadFile2Name $ReadFileFormat") if not $CutoffOnly;
+ print "STAGE 1 FINISHED IN ",time-$start," seconds\n";
+ print "=================STAGE 2 ESTIMATION================\n";
+
+ system("bash $SrcFolder/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize") if not $CutoffOnly;
+ print "STAGE 2 FINISHED IN ",time-$start," seconds\n";
+ print "=================STAGE 3 CUTOFF====================\n";
+        system("bash $SrcFolder/apply_cutoff.sh $OutputPrefix $CutoffLevel");
+ print "STAGE 3 FINISHED IN ",time-$start," seconds\n";
+
+
+
+}
+
+if($MapSoftware eq "rmap")
+{
+ print "=================STAGE 1 MAPPING===================\n";
+
+        system("bash $SrcFolder/mapping_rmap.sh $ReadFile1Name $ReadFileFormat") if not $CutoffOnly;
+        system("bash $SrcFolder/mapping_rmap.sh $ReadFile2Name $ReadFileFormat") if not $CutoffOnly;
+ print "=================STAGE 2 ESTIMATION================\n";
+
+        system("bash $SrcFolder/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize") if not $CutoffOnly;
+ print "=================STAGE 3 CUTOFF====================\n";
+        system("bash $SrcFolder/apply_cutoff.sh $OutputPrefix $CutoffLevel");
+
+
+}
+
+print "============ALL DONE, OUTPUTFILE:$OutputPrefix.txt\n";
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/TXdbgen
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/TXdbgen Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,97 @@
+#!/usr/bin/perl
+my $SrcFolder="/home/galaxy/galaxy-dist/tools/SpliceTrap.0.90.1/bin";
+# this script is to generate TXdb database files from bed/gtf file
+
+use strict;
+use Cwd;
+use Getopt::Long;
+
+my @programs = ('split','bowtie-build','sort', 'uniq', 'ls','bash','rm','mv','cut','grep','echo');
+foreach my $program (@programs)
+{
+        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+
+}
+
+
+my $genomedir = "";
+
+my $annofilename = "";
+my $txdbname = "userdefined";
+my $knownonly = 0;
+my $gtfinput = 0;
+
+GetOptions (
+ "g:s"=>\$genomedir,
+ "a:s"=>\$annofilename,
+ "n:s"=>\$txdbname,
+ "gtf"=>\$gtfinput,
+ "knownonly"=>\$knownonly
+);
+
+my $InputParaDes="      Usage of the script:
+        -g      genome fasta file location
+ -a annotation file (bed/gtf)
+ -n txdb name
+ --gtf specify this if annotation file is in gtf format
+";
+
+if($genomedir eq "" or $annofilename eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+$genomedir = Cwd::abs_path($genomedir);
+$annofilename = Cwd::abs_path($annofilename);
+
+my $annofilebase = `basename $annofilename`;
+chomp($annofilebase);
+#need a cache folder to avoid mess
+
+my $cachefolder = $annofilebase.".cache";
+
+if (! -e $cachefolder)
+{
+ mkdir $cachefolder or die "TXDBGEN: could not create cache folder $cachefolder\n";
+}
+if($gtfinput)
+{
+ print "TXDBGEN: converting gtf file into bed format\n";
+ system ("perl $SrcFolder/gtf2bed.pl $annofilename >$cachefolder/$annofilebase.bed");
+ $annofilename = "$cachefolder/$annofilebase.bed";
+}
+
+
+print "TXDBGEN: scan $annofilename for AS events...\n";
+system("perl $SrcFolder/scanbed2txdb.pl $annofilename $cachefolder/TXdb.tmp");
+print "TXDBGEN: fetch sequences from $genomedir...\n";
+system("sort -k1,1 $cachefolder/TXdb.tmp >$cachefolder/TXdb.tmp.sort");
+#get fasta file list
+system("ls $genomedir/*.fa >$cachefolder/chr.list");
+
+system("perl $SrcFolder/get_bed_fa_j.pl $cachefolder/TXdb.tmp.sort $cachefolder/chr.list $cachefolder/out.bed $cachefolder/TXdb.fasta");
+
+print "TXDBGEN: generate files for parallel computing...\n";
+if (! -e "$cachefolder/parallel")
+{
+ mkdir "$cachefolder/parallel" or die "TXDBGEN: could not create $cachefolder/parallel\n";
+}
+system("grep L $cachefolder/out.bed >$cachefolder/TXdb.bed");
+system("rm $cachefolder/out.bed");
+system("sort $cachefolder/TXdb.tmp.evi >$cachefolder/TXdb.evi");
+system("rm $cachefolder/TXdb.tmp.evi");
+system("bash $SrcFolder/splitdb.sh $cachefolder/parallel");
+print "TXDBGEN: build Bowtie index...\n";
+
+if (! -e "$cachefolder/btw")
+{
+ mkdir "$cachefolder/btw" or die "TXDBGEN: could not create $cachefolder/btw\n";
+}
+system("bowtie-build $cachefolder/TXdb.fasta $cachefolder/btw/TXdb");
+system("rm $cachefolder/TXdb.tmp* $cachefolder/chr.list");
+print "TXDBGEN: Copy files to $SrcFolder/../db/$txdbname\n";
+
+system("mv $cachefolder $SrcFolder/../db/$txdbname");
+print "TXDBGEN: Done!\n";
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/TXdbgen.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/TXdbgen.pl Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,95 @@
+# this script is to generate TXdb database files from bed/gtf file
+
+use strict;
+use Cwd;
+use Getopt::Long;
+
+my @programs = ('split','bowtie-build','sort', 'uniq', 'ls','bash','rm','mv','cut','grep','echo');
+foreach my $program (@programs)
+{
+        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+
+}
+
+
+my $genomedir = "";
+
+my $annofilename = "";
+my $txdbname = "userdefined";
+my $knownonly = 0;
+my $gtfinput = 0;
+
+GetOptions (
+ "g:s"=>\$genomedir,
+ "a:s"=>\$annofilename,
+ "n:s"=>\$txdbname,
+ "gtf"=>\$gtfinput,
+ "knownonly"=>\$knownonly
+);
+
+my $InputParaDes="      Usage of the script:
+        -g      genome fasta file location
+ -a annotation file (bed/gtf)
+ -n txdb name
+ --gtf specify this if annotation file is in gtf format
+";
+
+if($genomedir eq "" or $annofilename eq "")
+{
+ print $InputParaDes;
+ exit;
+}
+
+$genomedir = Cwd::abs_path($genomedir);
+$annofilename = Cwd::abs_path($annofilename);
+
+my $annofilebase = `basename $annofilename`;
+chomp($annofilebase);
+#need a cache folder to avoid mess
+
+my $cachefolder = $annofilebase.".cache";
+
+if (! -e $cachefolder)
+{
+ mkdir $cachefolder or die "TXDBGEN: could not create cache folder $cachefolder\n";
+}
+if($gtfinput)
+{
+ print "TXDBGEN: converting gtf file into bed format\n";
+ system ("perl $SrcFolder/gtf2bed.pl $annofilename >$cachefolder/$annofilebase.bed");
+ $annofilename = "$cachefolder/$annofilebase.bed";
+}
+
+
+print "TXDBGEN: scan $annofilename for AS events...\n";
+system("perl $SrcFolder/scanbed2txdb.pl $annofilename $cachefolder/TXdb.tmp");
+print "TXDBGEN: fetch sequences from $genomedir...\n";
+system("sort -k1,1 $cachefolder/TXdb.tmp >$cachefolder/TXdb.tmp.sort");
+#get fasta file list
+system("ls $genomedir/*.fa >$cachefolder/chr.list");
+
+system("perl $SrcFolder/get_bed_fa_j.pl $cachefolder/TXdb.tmp.sort $cachefolder/chr.list $cachefolder/out.bed $cachefolder/TXdb.fasta");
+
+print "TXDBGEN: generate files for parallel computing...\n";
+if (! -e "$cachefolder/parallel")
+{
+ mkdir "$cachefolder/parallel" or die "TXDBGEN: could not create $cachefolder/parallel\n";
+}
+system("grep L $cachefolder/out.bed >$cachefolder/TXdb.bed");
+system("rm $cachefolder/out.bed");
+system("sort $cachefolder/TXdb.tmp.evi >$cachefolder/TXdb.evi");
+system("rm $cachefolder/TXdb.tmp.evi");
+system("bash $SrcFolder/splitdb.sh $cachefolder/parallel");
+print "TXDBGEN: build Bowtie index...\n";
+
+if (! -e "$cachefolder/btw")
+{
+ mkdir "$cachefolder/btw" or die "TXDBGEN: could not create $cachefolder/btw\n";
+}
+system("bowtie-build $cachefolder/TXdb.fasta $cachefolder/btw/TXdb");
+system("rm $cachefolder/TXdb.tmp* $cachefolder/chr.list");
+print "TXDBGEN: Copy files to $SrcFolder/../db/$txdbname\n";
+
+system("mv $cachefolder $SrcFolder/../db/$txdbname");
+print "TXDBGEN: Done!\n";
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/apply_cutoff.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/apply_cutoff.sh Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,36 @@
+#SrcFolder="/data/zhang/wuj/scripts/SpliceTrap.0.8";
+
+outputname=$1;
+CutoffLevel=$2;
+Outputfolder=$3
+SrcFolder=$5
+JunctionCut=$4
+noIRM=$8
+
+echo "CUTOFF: Entering cutoff step...";
+echo "CUTOFF: Cache folder: $outputname.filter"
+mkdir $Outputfolder/$outputname.filter
+cd $Outputfolder/$outputname.filter
+ln -s ../$outputname.ratio
+ln -s ../$outputname.nums
+echo "CUTOFF: spliting file....and generating shell scripts..."
+split -11000 $outputname.ratio
+
+for ratiofiles in  x*
+do
+        echo "perl $SrcFolder/ApplyCutoff.jie.pl $ratiofiles $CutoffLevel $JunctionCut $noIRM > $ratiofiles.out" >>filter.sh
+done
+
+echo "CUTOFF: submit scripts..."
+perl $SrcFolder/batchqsub.pl filter.sh
+echo "CUTOFF: merging file...."
+cat *.out >../$outputname.raw
+cd ../
+#perl /data/zhang/wuj/tools/SpliceTrap.0.8/ApplyCutoff.jie.pl $outputname.ratio $outputname.nums 8 >$outputname.txt
+if [ "$noIRM" ];then
+ awk '{printf $22"\t"$2"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$12"\t"$13"\t"$14"\t"$15"\n"}' $outputname.raw >$outputname.txt
+else
+ awk '{printf $22"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$12"\t"$13"\t"$14"\t"$15"\n"}' $outputname.raw >$outputname.txt
+fi
+rm $outputname.filter -rf
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/batch_para_cov10p_fit.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/batch_para_cov10p_fit.sh Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,101 @@
+#!/bin/bash
+ReadFile1Name=`basename $1`
+ReadFile2Name=`basename $2`
+outputname=$3
+readsize=$4
+DatabasePrefix=$5
+Outputfolder=$6
+SrcFolder=$7
+noIRM=$8
+
+cd $Outputfolder
+if [ $ReadFile1Name != $ReadFile2Name ];then
+ echo "ESTIMATE: Getting fragment size information from data..."
+ perl $SrcFolder/get.frag.size.pl $ReadFile1Name.nomt $ReadFile2Name.nomt $readsize
+ perl $SrcFolder/get.hist.pl $ReadFile1Name.nomt.fragsize -w=1 -c=1
+else
+ echo "ESTIMATE: Generating the other half of reads..."
+ readnum=`wc -l $ReadFile1Name.nomt |cut -f1 -d" "`
+ for (( i=0; i<$readnum; i++ ))
+ do
+ echo "NM" >>$ReadFile1Name.f.nomt
+ done
+ echo "#Width:1" >$ReadFile1Name.nomt.fragsize.hist
+fi
+echo "ESTIMATE: Creating cache folder.."
+if [ $ReadFile1Name != $ReadFile2Name ];then
+ ReadFile2FinalName=$ReadFile2Name.nomt
+else
+ ReadFile2FinalName=$ReadFile1Name.f.nomt
+fi
+
+mkdir $ReadFile1Name.result
+cd $ReadFile1Name.result
+ln -s ../$ReadFile1Name.nomt ./
+ln -s ../$ReadFile2FinalName ./
+ln -s ../$ReadFile1Name.nomt.fragsize.hist ./
+echo "ESTIMATE: Split mapping results via chromosomes..."
+perl $SrcFolder/scan_nomt.pl $ReadFile1Name.nomt $ReadFile2FinalName
+loopi=0
+echo "ESTIMATE: Generating shell scripts for Loop $loopi..."
+while read chrlist
+do
+ chr=`echo $chrlist |tr -d "\n"`
+ for dbfile in $SrcFolder/../db/$DatabasePrefix/parallel/$chr.*
+ do
+ base=`basename $dbfile`
+ echo "$SrcFolder/Pair_estimate_c -f $ReadFile1Name.nomt.fragsize.hist -o $ReadFile1Name.$loopi.$base -d $dbfile -1 $ReadFile1Name.nomt.$chr -2 $ReadFile2FinalName.$chr -s $readsize" >>r$loopi.sh
+ done
+done <$SrcFolder/../db/$DatabasePrefix/parallel/chr.list
+
+echo "ESTIMATE: Submit shell scripts for Loop $loopi..."
+perl $SrcFolder/batchqsub.pl r$loopi.sh
+echo "ESTIMATE: Loop $loopi done..."
+
+cat $ReadFile1Name.$loopi.*.ratio >$outputname.$loopi.ratio
+cat $ReadFile1Name.$loopi.*.log >$outputname.$loopi.log
+cat $ReadFile1Name.$loopi.*.nums >$outputname.$loopi.nums
+rm  $ReadFile1Name.$loopi.*.ratio
+rm $ReadFile1Name.$loopi.*.log
+rm $ReadFile1Name.$loopi.*.nums
+
+
+if [ "$noIRM" ];then
+ echo "ESTIMATE: No IRM correction, skipped..."
+ mv $outputname.$loopi.ratio $outputname.ratio
+ mv $outputname.$loopi.log $outputname.log
+ mv $outputname.$loopi.nums $outputname.nums
+else
+
+ echo "ESTIMATE: derive IRMs from data..."
+ awk '{if ($15>=10) printf $0"\n"}' $outputname.$loopi.ratio >$outputname.mle
+ perl $SrcFolder/get_event_dist_fit.pl $outputname.mle -c=2 -w=0.001
+
+ loopi=1
+ echo "ESTIMATE: Generating shell scripts for Loop $loopi..."
+ while read chrlist
+ do
+ chr=`echo $chrlist |tr -d "\n"`
+ for dbfile in $SrcFolder/../db/$DatabasePrefix/parallel/$chr.*
+ do
+ base=`basename $dbfile`
+ echo "$SrcFolder/Pair_estimate_c -f $ReadFile1Name.nomt.fragsize.hist -o $ReadFile1Name.$loopi.$base -d $dbfile -1 $ReadFile1Name.nomt.$chr -2 $ReadFile2FinalName.$chr -b $outputname.mle.fit.hist -s $readsize" >>r$loopi.sh
+ done
+ done  <$SrcFolder/../db/$DatabasePrefix/parallel/chr.list
+ echo "ESTIMATE: Submit shell scripts for Loop $loopi..."
+
+#perl $SrcFolder/qsub/batchqsub.pl r$loopi.sh $taskname
+ perl $SrcFolder/batchqsub.pl r$loopi.sh
+ echo "ESTIMATE: Loop $loopi done..."
+ cat $ReadFile1Name.$loopi.*.ratio >$outputname.ratio
+ cat $ReadFile1Name.$loopi.*.log >$outputname.log
+ cat $ReadFile1Name.$loopi.*.nums >$outputname.nums
+ rm  $ReadFile1Name.$loopi.*.ratio
+ rm $ReadFile1Name.$loopi.*.log
+ rm $ReadFile1Name.$loopi.*.nums
+fi
+
+mv $outputname.ratio $outputname.log $outputname.nums ../
+cd ../
+rm $ReadFile1Name.result -rf
+rm $ReadFile1Name.nomt $ReadFile2FinalName 
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/batchqsub.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/batchqsub.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,158 @@
+
+# modified from Chenghai Xue's script
+
+#test if qsub works
+
+my $qsub_checker = 0;
+if(system("hash qsub >/dev/null"))
+{
+ $qsub_checker = 0;
+}
+else
+{
+ $test_randname=random_sessid();
+ system("mkdir $test_randname;");
+ system("echo 'mkdir $test_randname/$test_randname' >$test_randname/$test_randname.sh");
+ system("mkdir $test_randname/qsub_cache");
+ system ("qsub -cwd -v TMPDIR=$test_randname/qsub_cache -V -e $test_randname/qsub_cache -o $test_randname/qsub_cache -N $test_randname  $test_randname/$test_randname.sh");
+
+ $status=0;
+ $sec=5;
+ while(1)
+ {
+ $chkresult=`qstat |grep $test_randname |wc -l`;
+ chomp($chkresult);
+ if ($chkresult == 0)
+ {
+ $sec=10;
+ $status++;
+ last if ($status==3);
+ }
+ else
+ {
+ $status=0;
+ $sec=5;
+ }
+ print "QSTAT: $chkresult testing tasks running.....$taskname\n";
+ sleep($sec);
+ }
+ print "QSUB: testing done\n";
+ print "$test_randname/$test_randname\n";
+ if(-d "$test_randname/$test_randname")
+ {
+ $qsub_checker=1;
+ print "QSUB: working well!\n";
+ }
+#$qsub_checker=`qsub </dev/null 2>&1|grep stdin|wc -l`;
+ system("rm $test_randname -rf");
+}
+#$qsub_checker=0;
+if($qsub_checker == 0)
+{
+ print "QSUB: No GRID qsub found\n";
+ print "QSUB: if you are using PBS qsub, please wait for the next version! Thanks.\n";
+ print "QSUB: Running in serial mode...\n";
+ system("sh $ARGV[0]");
+ exit;
+}
+
+$performListFile = $ARGV[0];
+$taskname = "";
+$taskname = $ARGV[1];
+if (not $taskname)
+{
+ $taskname=random_sessid();
+}
+#$outfullDir = $ARGV[2];
+$outfullDir ="qsub_cache";
+
+# correct path
+if(! (-d $outfullDir) ){
+ system ("mkdir $outfullDir");
+}
+
+# create a temp cache
+@temp = split("/", $0);
+$prog = pop @temp;
+$cache = $outfullDir."/".$prog."_".$taskname;
+if(! (-d $cache) ){
+ system ("mkdir $cache");
+}
+
+open (IN_1, "$performListFile") or die "can not open file $performListFile to read\n";
+@performList = (<IN_1>);
+chomp @performList;
+close IN_1 or die "can't close the input file : $!";
+
+
+$scriptListFile = $outfullDir."/".$taskname."_scripts.list";
+open (FSCRIPLIST, ">$scriptListFile");
+for($i=0; $i<@performList; $i++){
+ $scriptFile = $outfullDir."/".$taskname."_script$i.sh";
+ print FSCRIPLIST "$scriptFile\n";
+ open (FOUT, ">$scriptFile");
+
+# print FOUT "#!/usr/bin/sh\n";
+ print FOUT "$performList[$i]\n";
+
+# print OUT_1 "$outfile.map\n";
+ close (FOUT);
+}
+close (FSCRIPLIST);
+
+open (IN_2, "$scriptListFile") or die "can not open file $scriptListFile to read\n";
+$basename=`basename $performListFile`;
+chop($basename);
+$taskname=$taskname."_".$basename;
+#print $basename;
+while(<IN_2>){
+ $f = $_;
+ chomp $f;
+ @temp = split("/", $f);
+ $base = pop @temp;
+
+ #use default queues
+# print "/opt/n1ge6/bin/lx24-amd64/qsub -l virtual_free=1.7G -cwd -v TMPDIR=$cache -V -e $cache -o $cache -N $taskname.$base $f\n\n\n";
+ #system ("qsub -l virtual_free=1.7G -cwd -v TMPDIR=$cache -V -e $cache -o $cache -N $taskname.$base $f");
+ system ("qsub -cwd -v TMPDIR=$cache -V -e $cache -o $cache -N $taskname.$base $f");
+
+}
+close IN_2 or die "can't close the input file : $!";
+
+#start to check stats of qsub tasks
+#######################################
+my $taskname_query=substr($taskname,0,10);
+
+
+$status=0;
+$sec=60;
+while(1)
+{
+ $chkresult=`qstat |grep $taskname_query |wc -l`;
+ chomp($chkresult);
+ if ($chkresult == 0)
+ {
+ $sec=10;
+ $status++;
+ last if ($status==3);
+ }
+ else
+ {
+ $status=0;
+ $sec=60;
+ }
+ print "QSTAT: $chkresult tasks running.....$taskname\n";
+ sleep($sec);
+}
+print "QSUB: done: $taskname \n";
+######################################
+
+#
+sub random_sessid
+{
+ #my @chars = (0..9,a..z,A..Z);
+ my @chars = ('a'..'z','A'..'Z');
+ my $len = 10;
+ my $string = join '', map {$chars[rand(@chars)]} (1..$len);
+ return $string;
+}
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/batchqsub.pl_orig
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/batchqsub.pl_orig Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,158 @@
+
+# modified from Chenghai Xue's script
+
+#test if qsub works
+
+my $qsub_checker = 0;
+if(system("hash qsub >/dev/null"))
+{
+ $qsub_checker = 0;
+}
+else
+{
+ $test_randname=random_sessid();
+ system("mkdir $test_randname;");
+ system("echo 'mkdir $test_randname/$test_randname' >$test_randname/$test_randname.sh");
+ system("mkdir $test_randname/qsub_cache");
+ system ("qsub -cwd -v TMPDIR=$test_randname/qsub_cache -V -e $test_randname/qsub_cache -o $test_randname/qsub_cache -N $test_randname  $test_randname/$test_randname.sh");
+
+ $status=0;
+ $sec=5;
+ while(1)
+ {
+ $chkresult=`qstat |grep $test_randname |wc -l`;
+ chomp($chkresult);
+ if ($chkresult == 0)
+ {
+ $sec=10;
+ $status++;
+ last if ($status==3);
+ }
+ else
+ {
+ $status=0;
+ $sec=5;
+ }
+ print "QSTAT: $chkresult testing tasks running.....$taskname\n";
+ sleep($sec);
+ }
+ print "QSUB: testing done\n";
+ print "$test_randname/$test_randname\n";
+ if(-d "$test_randname/$test_randname")
+ {
+ $qsub_checker=1;
+ print "QSUB: working well!\n";
+ }
+#$qsub_checker=`qsub </dev/null 2>&1|grep stdin|wc -l`;
+ system("rm $test_randname -rf");
+}
+#$qsub_checker=0;
+if($qsub_checker == 0)
+{
+ print "QSUB: No GRID qsub found\n";
+ print "QSUB: if you are using PBS qsub, please wait for the next version! Thanks.\n";
+ print "QSUB: Running in serial mode...\n";
+ system("sh $ARGV[0]");
+ exit;
+}
+
+$performListFile = $ARGV[0];
+$taskname = "";
+$taskname = $ARGV[1];
+if (not $taskname)
+{
+ $taskname=random_sessid();
+}
+#$outfullDir = $ARGV[2];
+$outfullDir ="qsub_cache";
+
+# correct path
+if(! (-d $outfullDir) ){
+ system ("mkdir $outfullDir");
+}
+
+# create a temp cache
+@temp = split("/", $0);
+$prog = pop @temp;
+$cache = $outfullDir."/".$prog."_".$taskname;
+if(! (-d $cache) ){
+ system ("mkdir $cache");
+}
+
+open (IN_1, "$performListFile") or die "can not open file $performListFile to read\n";
+@performList = (<IN_1>);
+chomp @performList;
+close IN_1 or die "can't close the input file : $!";
+
+
+$scriptListFile = $outfullDir."/".$taskname."_scripts.list";
+open (FSCRIPLIST, ">$scriptListFile");
+for($i=0; $i<@performList; $i++){
+ $scriptFile = $outfullDir."/".$taskname."_script$i.sh";
+ print FSCRIPLIST "$scriptFile\n";
+ open (FOUT, ">$scriptFile");
+
+# print FOUT "#!/usr/bin/sh\n";
+ print FOUT "$performList[$i]\n";
+
+# print OUT_1 "$outfile.map\n";
+ close (FOUT);
+}
+close (FSCRIPLIST);
+
+open (IN_2, "$scriptListFile") or die "can not open file $scriptListFile to read\n";
+$basename=`basename $performListFile`;
+chop($basename);
+$taskname=$taskname."_".$basename;
+#print $basename;
+while(<IN_2>){
+ $f = $_;
+ chomp $f;
+ @temp = split("/", $f);
+ $base = pop @temp;
+
+ #use default queues
+# print "/opt/n1ge6/bin/lx24-amd64/qsub -l virtual_free=1.7G -cwd -v TMPDIR=$cache -V -e $cache -o $cache -N $taskname.$base $f\n\n\n";
+ #system ("qsub -l virtual_free=1.7G -cwd -v TMPDIR=$cache -V -e $cache -o $cache -N $taskname.$base $f");
+ system ("qsub -cwd -v TMPDIR=$cache -V -e $cache -o $cache -N $taskname.$base $f");
+
+}
+close IN_2 or die "can't close the input file : $!";
+
+#start to check stats of qsub tasks
+#######################################
+my $taskname_query=substr($taskname,0,10);
+
+
+$status=0;
+$sec=60;
+while(1)
+{
+ $chkresult=`qstat |grep $taskname_query |wc -l`;
+ chomp($chkresult);
+ if ($chkresult == 0)
+ {
+ $sec=10;
+ $status++;
+ last if ($status==3);
+ }
+ else
+ {
+ $status=0;
+ $sec=60;
+ }
+ print "QSTAT: $chkresult tasks running.....$taskname\n";
+ sleep($sec);
+}
+print "QSUB: done: $taskname \n";
+######################################
+
+#
+sub random_sessid
+{
+ #my @chars = (0..9,a..z,A..Z);
+ my @chars = ('a'..'z','A'..'Z');
+ my $len = 10;
+ my $string = join '', map {$chars[rand(@chars)]} (1..$len);
+ return $string;
+}
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/beta_fit.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/beta_fit.R Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,39 @@
+args = commandArgs();
+input_file=args[4];
+#input_file="control_a.0.1.flt.ratio.tmpca";
+#print (input_file);
+
+
+library(MASS);
+
+p=array(0,dim=1000);
+
+for (i in 0:999)
+{
+         p[i]=0.001
+}
+
+if ( file.info(input_file)["size"]>0 )
+{
+
+data=read.table(input_file);
+col=1;
+x=data[,col];
+x1=x;
+if (length(x)>10)
+{
+ x1[x==0] <- .Machine$double.eps;
+ x1[x==1] <- (1-.Machine$double.eps);
+ xbar=mean(x1)
+ xvar=var(x1)
+ a <- (xbar*(1-xbar)/xvar - 1)*xbar
+ b <- (1-xbar)*a/xbar
+ (f=fitdistr(x1,"beta",list(shape1=a,shape2=b)))
+ for (i in 0:999)
+ {
+ p[i]=dbeta(i/1000,f[["estimate"]][["shape1"]],f[["estimate"]][["shape2"]])
+ }
+}
+
+}
+write(p,file=paste(input_file,"fit",sep="."),ncolumns=1);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/bowtie2eland.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/bowtie2eland.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,175 @@
+use strict;
+
+my $bowtiefilename=$ARGV[0];
+my $readsfilename=$ARGV[1];
+my $elandfilename=$ARGV[2];
+
+open(readsfile, $readsfilename);
+
+my $detectformat=`head -c 1 $readsfilename`;
+
+#my $firstletter=$detectformat;
+#my $looplinenumbers=4;
+
+#$looplinenumbers=2 if ($detectformat eq ">");
+open(bowtiefile, $bowtiefilename);
+open(elandfile, ">".$elandfilename);
+my $readfilelinenum=0;
+# hash the positions of the alignments for each read id
+my %readposhash;
+my $bowtiepos = tell (bowtiefile);
+while (my $bowtieline=<bowtiefile>)
+{
+ my ($bowtiereadname, $strand, $mapped_id, $pos, $seq, $qt,$num, $mapinfo)=split("\t",$bowtieline);
+ if (not exists $readposhash{$bowtiereadname} )
+ {
+ $readposhash{$bowtiereadname} = $bowtiepos;
+ }
+ $bowtiepos = tell (bowtiefile);
+}
+
+while(my $readline=<readsfile>)
+{
+ $readfilelinenum++;
+ if(($readline=~/^$detectformat/) && ($readfilelinenum%2 ==1))
+ {
+ chomp($readline);
+ my $readname=substr($readline, 1, length($readline)-1);
+ if( not exists $readposhash{$readname} )
+ {
+ print elandfile $readname,"\tNA\tNM\n";
+ next;
+ }
+ else
+ {
+ my @mapped_ids=();
+                        my @mapped_pos=();
+                        my @mapped_strand=();
+ seek(bowtiefile, $readposhash{$readname}, 0);
+ while (my $bowtieline=<bowtiefile>)
+ {
+ my ($bowtiereadname, $strand, $mapped_id, $pos, $seq, $qt,$num, $mapinfo)=split("\t",$bowtieline);
+ if($readname eq $bowtiereadname)
+ {
+ push(@mapped_ids, $mapped_id);
+                                        push(@mapped_pos,$pos);
+                                        push(@mapped_strand,$strand);
+ }
+ else
+ {
+ last;
+ }
+
+ }
+ print elandfile $readname,"\t";
+ print elandfile "NA\t";
+ print elandfile scalar(@mapped_ids),":0:0\t";
+ for(my $i=0;$i<@mapped_ids;$i++)
+ {
+ print elandfile "/",$mapped_ids[$i];
+ print elandfile ":",$mapped_pos[$i]+1;
+ if($mapped_strand[$i] eq "+")
+ {
+ print elandfile "F0,";
+ }
+ else
+ {
+ print elandfile "R0,";
+ }
+
+ }
+ print elandfile "\n";
+
+ }
+ }
+}
+
+close(elandfile);
+close(bowtiefile);
+close(readsfile);
+
+exit;
+while(my $bowtieline=<bowtiefile>)
+{
+ my ($bowtiereadname, $strand, $mapped_id, $pos, $seq, $qt,$num, $mapinfo)=split("\t",$bowtieline);
+ while(my $readline=<readsfile>)
+ {
+ $readfilelinenum++;
+ if(($readline=~/^$detectformat/) && ($readfilelinenum%2 ==1))
+ #if($readline=~/^$detectformat/)
+ {
+ chomp($readline);
+ my $readname=substr($readline, 1, length($readline)-1);
+
+
+ if($readname ne $bowtiereadname)
+ {
+ print elandfile $readname,"\tNA\tNM\n";
+ next;
+ }
+ else
+ {
+ my @mapped_ids=();
+ my @mapped_pos=();
+ my @mapped_strand=();
+ push(@mapped_ids, $mapped_id);
+ push(@mapped_pos,$pos);
+ push(@mapped_strand,$strand);
+ while(1)
+ {
+ $bowtieline=<bowtiefile>;
+ my ($bowtiereadname, $strand, $mapped_id, $pos, $seq, $qt,$num, $mapinfo)=split("\t",$bowtieline);
+ if( $bowtiereadname eq $readname )
+ {
+ push(@mapped_ids, $mapped_id);
+ push(@mapped_pos,$pos);
+ push(@mapped_strand,$strand);
+ }
+ else
+ {
+ seek(bowtiefile, -1*length($bowtieline),1);
+ print elandfile $readname,"\t";
+ print elandfile "NA\t";
+ print elandfile scalar(@mapped_ids),":0:0\t";
+ for(my $i=0;$i<@mapped_ids;$i++)
+ {
+ print elandfile "/",$mapped_ids[$i];
+ print elandfile ":",$mapped_pos[$i]+1;
+ if($mapped_strand[$i] eq "+")
+ {
+ print elandfile "F0,";
+ }
+ else
+ {
+ print elandfile "R0,";
+ }
+
+ }
+ print elandfile "\n";
+ last;
+ }
+ }
+ last;
+
+ }
+ }
+ }
+}
+
+while(my $readline=<readsfile>)
+{
+ $readfilelinenum++;
+ if(($readline=~/^$detectformat/) && ($readfilelinenum%2 ==1))
+ #if($readline=~/^$detectformat/)
+        {
+             chomp($readline);
+             my $readname=substr($readline, 1, length($readline)-1);
+      print  elandfile $readname,"\tNA\tNM\n";
+ }
+}
+
+close(elandfile);
+close(bowtiefile);
+
+
+close(readsfile);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/calc_pval.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/calc_pval.R Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,29 @@
+args = commandArgs();
+input_file=args[4];
+av=as.numeric(args[5]);
+sd=as.numeric(args[6]);
+nu=as.numeric(args[7]);
+
+
+data=read.table(input_file);
+
+col=4;
+x=data[,col];
+pup=pnorm(x, mean=av, sd=sd, lower.tail = FALSE);
+adpup=p.adjust(pup,method="fdr");
+pdn=pnorm(x, mean=av, sd=sd, lower.tail = TRUE);
+adpdn=p.adjust(pdn,method="fdr");
+
+p=pup;
+
+size = length(x);
+
+for (i in 1:size)
+{
+ if(x[i]<0) 
+ {
+ p[i]=pdn[i];
+ }
+}
+write(p, file=paste(input_file,"p",sep="."),ncolumns=1);
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/downloaddb.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/downloaddb.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,54 @@
+use strict;
+use Cwd;
+
+my %flags=(
+ "hg18"=>0,
+ "mm9"=>0,
+ "rn4"=>0,
+);
+my $PROG = $0;
+my $PROG_ABS_PATH = Cwd::abs_path($PROG);
+$PROG_ABS_PATH = `dirname $PROG_ABS_PATH`;
+chomp($PROG_ABS_PATH);
+print "\n\tPrepare to download databases from CSHL...\n";
+
+print "\tWhich database(s) do you want to download?\n";
+print "\tChoose from\t ";
+foreach my $key (keys %flags)
+{
+ print $key,"/";
+}
+print "ALL \n\n\tseparated by blank...don't enter anything if you don't want to download!\n\nPlease enter:[NONE]";
+
+my @dbnames = split(/\s+/,<>);
+for (my $i=0;$i<@dbnames;$i++)
+{
+ if (uc($dbnames[$i]) eq "ALL" )
+ {
+ foreach my $key (keys %flags)
+ {
+ $flags{$key}=1;
+ }
+ last;
+ }
+
+ if( exists $flags{$dbnames[$i]})
+ {
+ $flags{$dbnames[$i]} = 1;
+ }
+ #system ("wget http://rulai.cshl.edu/splicetrap/db/")
+}
+
+foreach my $key (keys %flags)
+{
+ if ($flags{$key} ==1)
+ {
+ system ("wget http://rulai.cshl.edu/splicetrap/db/".$key.".tar.gz");
+ print "untar the database file for $key...please wait...";
+ system ("tar -ixzf $key.tar.gz");
+ system ("rm $key.tar.gz");
+ mkdir "$key/parallel";
+ print "creating files for parallel computing...\n";
+ system("bash $PROG_ABS_PATH/splitdb.sh $key/parallel")
+ }
+}
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/get.frag.size.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/get.frag.size.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,55 @@
+#for the results from paired end
+#The two inputs are the results from the two ends
+use strict;
+my $read_size = $ARGV[2];
+open(input1, $ARGV[0]);
+open(input2, $ARGV[1]);
+open(output, ">$ARGV[0].fragsize");
+#open(fusefile,">$ARGV[0].fuse");
+
+#my $LongMarker="L";
+#my $ShortMarker="S";
+
+while(my $line1=<input1>)
+{
+ my $line2=<input2>;
+ chomp($line1);
+ chomp($line2);
+ next if($line1=~/$\NM/ or $line2=~/$\NM/ or $line1=~/$\MT/ or $line2=~/$\MT/);
+ my @array1 = split("\t",$line1);
+ #my $read_size=length($array1[1]);
+ my @array2 = split("\t",$line2);
+ my $match1=$array1[3];
+ my $match2=$array2[3];
+# my $marker=$LongMarker.$ShortMarker;
+ my @sizes=();
+ #while($match1=~/\/(\S[^,]*\[[$marker]\])\S[^,]*:(\d*)[RF]/g)
+ while($match1=~/\/(\S[^,]*\[\w+\])\S[^,]*:(\d*)[RF]/g)
+ {
+ my $name=$1;
+ my $posa=$2;
+ #print $name,"\n";
+
+ if($match2=~/\Q$name\E\S[^,]*:(\d*)[RF]/)
+ {
+ #print "match\n";
+ my $posb=$1;
+ push @sizes, abs($posb-$posa)+$read_size;
+
+ }
+
+ }
+ my %saw;
+ @saw{@sizes}=();
+ my @keya= keys %saw;
+ #print scalar(@keya),"\n";
+ if(scalar(@keya)==1)
+ {
+ print output $keya[0],"\n";
+ }
+
+}
+#close(fusefile);
+close(output);
+close(input2);
+close(input1);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/get.hist.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/get.hist.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,49 @@
+use Getopt::Long;
+use strict;
+
+my $InputFileName=$ARGV[0];
+my $OutputFileName = $ARGV[0].".hist";
+#$OutputFileName=$ARGV[1] if $ARGV[1] ne "";
+my $width=0.01;
+my $verbose=1;
+my $col=2;
+my $start=0;
+my $end=1.000;
+
+
+GetOptions (
+        'w:f'=>\$width,
+ 'c:i'=>\$col,
+        'start:f'=>\$start,
+        'end:f'=>\$end,
+        'v'=>\$verbose
+);
+
+$width=$width*1;
+#print "IRM: #Generate hist with delta width of $width \n";
+#print "IRM: #data source from col $col\n";
+
+$col=$col-1;
+
+my @hist;
+my $totalnum=0;
+
+open(Input, $InputFileName);
+while(my $line=<Input>)
+{
+ next if($line=~/^#/);
+ chomp($line);
+ my @array=split(/\s/,$line);
+ $hist[int($array[$col]/$width)]++;
+ $totalnum++;
+
+}
+close(Input);
+
+open(OutputFile, ">$OutputFileName");
+print OutputFile  "#Width:$width\n";
+for(my $i=0;$i<@hist;$i++)
+{
+ print OutputFile $hist[$i]/$totalnum,"\n";
+}
+close(OutputFile);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/get_bed_fa_j.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/get_bed_fa_j.pl Thu Sep 07 15:06:58 2017 -0400
[
b'@@ -0,0 +1,330 @@\n+# Adapted from Chenghai Xue\'s script\n+\n+$starttime=time();\n+\n+$input_file_1 = $ARGV[0];\t# exon junction file\n+$input_file_2 = $ARGV[1];\t# genome file list\n+$output_file_1 = $ARGV[2];\t# exon junction bed (might be less than input_file_1\n+$output_file_2 = $ARGV[3];\t# exon junction fa\n+#$leftLen = $ARGV[4];\n+#$rightLen = $ARGV[5];\n+\r\n+open(IN_1, "$input_file_1") or die "can\'t open the input file : $!";\n+open(IN_2, "$input_file_2") or die "can\'t open the input file : $!";\n+open OUT_1, ">$output_file_1" or die "Can not open output_file : $!";\r\n+open OUT_2, ">$output_file_2" or die "Can not open output_file : $!";\r\n+\n+@chromList = (<IN_2>);\n+chomp(@chromList);\n+$len_chromList = @chromList;\n+print "BED2FA: in $input_file_2, found $len_chromList chromosomes\\n";\n+foreach $one (@chromList){\n+\tif($one =~ /\\/(chr.[^\\/]*?)\\.*fa$/i){\n+\t\t$chr_hash{$1} = $one;\n+\t\t#print $1,"\\n";\t\n+\t}\n+}\n+@key_chr_hash = keys(%chr_hash);\r\n+$len_key_chr_hash = @key_chr_hash;\r\n+@sort_key_chr_hash = sort_chromNo(@key_chr_hash);\n+$len_sort_key_chr_hash = @sort_key_chr_hash;\n+#for($i=0; $i<$len_sort_key_chr_hash; $i++){\r\n+#\tprint "$sort_key_chr_hash[$i]\t$chr_hash{$sort_key_chr_hash[$i]}\\n";\r\n+#}\n+\n+$num_1=0;\n+$num_2=0;\n+$num_count_chrom=0;\n+my ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts);\r\n+$current_chrom = "";\n+while(<IN_1>){\n+\t$num_1++;\r\n+\t$line = $_;\n+\tchomp $line;\n+\t#print $line,"\\n";\n+\t@cols = split ("\\t", $line);\n+        if(scalar(@cols)==12)\n+\t{\n+\t($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts) = @cols;\n+\t}\n+\tif(scalar(@cols)!=12)\n+\t{\n+\t\t($chrom, $chromStart, $chromEnd, $name, $score, $strand)=@cols;\n+\t\t$thickStart=$chromStart;\n+\t#\tprint $thickStart,"\\n";\n+\t\t$thickEnd = $chromEnd;\n+\t\t$blockCount=1;\n+\t\t$blockSizes=$chromEnd-$chromStart;\n+\t\t$blockStarts = 0;\t\n+\t}\n+\t$strand="+" if !$strand;\n+\t@a_blockSizes = split (/\\,/, $blockSizes);\n+\t@a_blockStarts = split (/\\,/, $blockStarts);\n+\tif($chrom ne $current_chrom){\n+\t\tif($num_1 != 1){\n+\t\t\tprint "$num_chr_1\t$num_chr_2\t$len_contigSeqStr\\n";\t\t\t\n+\t\t}\n+\t\tprint "BED2FA: $chrom:\t";\n+\t\t\n+\t\t$num_chr_1=0;\n+\t\t$num_chr_2=0;\t\t\n+\n+\t\tif(exists $chr_hash{$chrom}){\n+\t\t\t$num_count_chrom++;\n+\t\t\t$current_chrom = $chrom;\n+\t\t\t#print $current_chrom,"\\n";\n+#=pod\t\t\t\n+\t\t\t$chromFastaFile = $chr_hash{$chrom};\n+\t\t\t#print $chromFastaFile,"\\n";\n+\t\t\topen($fin, "<$chromFastaFile") or die "can\'t open the chrom file : $!";\n+\t\t\tlocal ($/) = undef;\n+\t\t\t$contigSeqStr = <$fin>;\n+\t\t\tclose ($fin);\n+\t\t\t#print $contigSeqStr,"mark\\t";\n+\t\t\t$contigSeqStr =~s/^\\>.*?\\n//g;\n+                        #print $contigSeqStr,"mark2\\t";\n+\n+\t\t\t$contigSeqStr =~s/\\s|\\n//g;\n+                        #print $contigSeqStr,"mark3\\n";\n+\n+\t\t\t$len_contigSeqStr = length $contigSeqStr;\n+#=cut\n+\t\t}\n+\t\telse{\n+\t\t\t$num_chr_1++;\n+\t\t\tnext;\n+\t\t}\n+\t}\n+\t$num_chr_1++;\n+\t\r\n+# modify from here................................\n+\tmy @Starts;\n+\tmy @Ends;\n+\tmy @JuncSeq;\n+\tmy $ssStrTag=1;\n+\tfor($i_wuj=0;$i_wuj<$blockCount;$i_wuj++)\n+\t{\n+\t\t$Starts[$i_wuj] = $chromStart + $a_blockStarts[$i_wuj];\n+\t\t$Ends[$i_wuj] = $Starts[$i_wuj] + $a_blockSizes[$i_wuj];\n+\t\t$JuncSeq[$i_wuj] = uc substr ($contigSeqStr,$Starts[$i_wuj], $a_blockSizes[$i_wuj]);\n+\t\tif($strand eq "-"){\n+\t\t  $JuncSeq[$i_wuj] = uc string_reverse_complement(lc $JuncSeq[$i_wuj]);\n+\t\t}\n+\t}\t \n+       # for($i_wuj=0;$i_wuj<$blockCount-1;$i_wuj++)\n+#\t{\n+#\t        $ssStr = uc substr ($contigSeqStr, $Ends[$i_wuj], 2) . substr ($contigSeqStr, $Starts[$i_wuj+1]  - 2, 2);\n+#\t        if($strand eq "-"){\n+ #               $ssStr = uc string_reverse_complement(lc $ssStr);\n+                #$ssStr = $rc_ssStr;\n+#\t        }\n+#\t\t$ssStrTag = 0 if ($ssStr ne "GTAG");\n+\t\t\n+ #       }\n+#\tif($ssStrTag ==1){\n+        if(1){\n+\t\t$num_2++;\n+\t\t$num_chr_2++;\n+\t\tprint OUT_1 "$line\\n";\n+\t\t#print OUT_2 ">$name\\|$chrom\\|$chromStart\\|$chromEnd\\|$strand\\|$ssStr\\|$num_2\\n$junctionSeqStrLeft'..b'ret;\r\n+}\n+\r\n+sub sort_chromNo{\n+\tlocal(@chrom) = @_;\n+\tlocal($len_key_chr_hash, $i, @sort_chr_hash);\n+\tlocal(@digit_random, @words_random, @digit_other_1, @digit_other_2, @words_other_1, @words_other_2, @digit, @words);\n+\tlocal(@sort_digit, @sort_words, @sort_digit_random, @sort_words_random, @sort_digit_other, @sort_words_other);\n+\tlocal($len_digit, $len_words, $len_digit_random, $len_words_random, $len_digit_other, $len_words_other, $term);\n+\t\n+\t$len_key_chr_hash = @chrom;\n+\t# sort via chr number for printing result\r\n+\tfor($i=0; $i<$len_key_chr_hash; $i++){\n+\t\tif($key_chr_hash[$i] =~ /chr(\\d+)\\_random/){\r\n+\t\t\tpush(@digit_random, $1);\r\n+\t\t}\r\n+\t\telsif($key_chr_hash[$i] =~ /chr(\\w+)\\_random/){\r\n+\t\t\tpush(@words_random, $1);\r\n+\t\t}\r\n+\t\telsif($key_chr_hash[$i] =~ /chr(\\d+)\\_([\\w\\d\\_]+)/){\r\n+\t\t\tpush(@digit_other_1, $1);\n+\t\t\tpush(@digit_other_2, $2);\r\n+\t\t}\r\n+\t\telsif($key_chr_hash[$i] =~ /chr(\\w+)\\_([\\w\\d\\_]+)/){\r\n+\t\t\tpush(@words_other_1, $1);\n+\t\t\tpush(@words_other_2, $2);\r\n+\t\t}\r\n+\t\telsif($key_chr_hash[$i] =~ /chr(\\d+)/){\r\n+\t\t\tpush(@digit, $1);\r\n+\t\t}\r\n+\t\telsif($key_chr_hash[$i] =~ /chr(\\w+)/){\r\n+\t\t\tpush(@words, $1);\r\n+\t\t}\r\n+\t\telse{\r\n+\t\t\tprint "BED2FA: There is unknown type of chromosomes: $key_chr_hash[$i]\\n";\r\n+\t\t}\r\n+\t}\r\n+\t@sort_digit = sort by_mostly_numeric @digit;\r\n+\t@sort_words = sort by_mostly_string @words;\n+\t@sort_digit_random = sort by_mostly_numeric @digit_random;\r\n+\t@sort_words_random = sort by_mostly_string @words_random;\n+\t@sort_digit_other = sort_2_array_number_string(\\@digit_other_1, \\@digit_other_2);\r\n+\t@sort_words_other = sort_2_array_string_string(\\@words_other_1, \\@words_other_2);\r\n+\t\r\n+\t$len_digit = @sort_digit;\r\n+\tfor($i=0; $i<$len_digit; $i++){\r\n+\t\t$term = "chr".$sort_digit[$i];\r\n+\t\tpush(@sort_chr_hash, $term);\r\n+\t}\r\n+\t$len_words = @sort_words;\r\n+\tfor($i=0; $i<$len_words; $i++){\r\n+\t\t$term = "chr".$sort_words[$i];\r\n+\t\tpush(@sort_chr_hash, $term);\r\n+\t}\n+\t$len_digit_random = @sort_digit_random;\r\n+\tfor($i=0; $i<$len_digit_random; $i++){\r\n+\t\t$term = "chr".$sort_digit_random[$i]."_random";\r\n+\t\tpush(@sort_chr_hash, $term);\r\n+\t}\r\n+\t$len_words_random = @sort_words_random;\r\n+\tfor($i=0; $i<$len_words_random; $i++){\r\n+\t\t$term = "chr".$sort_words_random[$i]."_random";\r\n+\t\tpush(@sort_chr_hash, $term);\r\n+\t}\t\r\n+\t$len_digit_other = @sort_digit_other;\r\n+\tfor($i=0; $i<$len_digit_other; $i=$i+2){\r\n+\t\t$term = "chr".$sort_digit_other[$i]."_".$sort_digit_other[$i+1];\r\n+\t\tpush(@sort_chr_hash, $term);\r\n+\t}\r\n+\t$len_words_other = @sort_words_other;\r\n+\tfor($i=0; $i<$len_words_other; $i=$i+2){\r\n+\t\t$term = "chr".$sort_words_other[$i]."_".$sort_words_other[$i+1];\r\n+\t\tpush(@sort_chr_hash, $term);\r\n+\t}\t\n+\t\n+\treturn @sort_chr_hash;\n+}\r\n+\n+sub sort_2_array_number_string{\r\n+\tlocal($a, $b) = @_;\r\n+\tlocal($len_a, $len_b, $i, %family, $one, $two);\r\n+\tlocal(@ret);\r\n+\t\r\n+\t$len_a = @$a;\r\n+\t$len_b = @$b;\r\n+\tif($len_a == $len_b){\r\n+\t\tfor($i=0; $i<$len_a; $i++){\r\n+\t\t\t$family{$$a[$i]}{$$b[$i]} = 0;\t\t\t\t\t\t\t\t\t\r\n+\t\t}\r\n+\t\tfor $one (sort by_mostly_numeric keys %family) {\r\n+\t\t\tfor $two (sort by_mostly_string keys %{ $family{$one} }) {\r\n+\t\t\t\t\tpush(@ret, $one);\r\n+\t\t\t\t\tpush(@ret, $two);\r\n+\t\t\t}\t\r\n+\t\t}\r\n+\t}\r\n+\telse{\t\t\r\n+\t\tprint "ERROR: Sort array is not same size\\n";\r\n+\t\tprint "a $len_a, b $len_b\\n";\r\n+\t}\t\r\n+\t\r\n+\treturn @ret;\r\n+}\n+\n+sub sort_2_array_string_string{\r\n+\tlocal($a, $b) = @_;\r\n+\tlocal($len_a, $len_b, $i, %family, $one, $two);\r\n+\tlocal(@ret);\r\n+\t\r\n+\t$len_a = @$a;\r\n+\t$len_b = @$b;\r\n+\tif($len_a == $len_b){\r\n+\t\tfor($i=0; $i<$len_a; $i++){\r\n+\t\t\t$family{$$a[$i]}{$$b[$i]} = 0;\t\t\t\t\t\t\t\t\t\r\n+\t\t}\r\n+\t\tfor $one (sort by_mostly_string keys %family) {\r\n+\t\t\tfor $two (sort by_mostly_string keys %{ $family{$one} }) {\r\n+\t\t\t\t\tpush(@ret, $one);\r\n+\t\t\t\t\tpush(@ret, $two);\r\n+\t\t\t}\t\r\n+\t\t}\r\n+\t}\r\n+\telse{\t\t\r\n+\t\tprint "ERROR: Sort array is not same size\\n";\r\n+\t\tprint "a $len_a, b $len_b\\n";\r\n+\t}\t\r\n+\t\r\n+\treturn @ret;\r\n+}\n+\n+sub by_mostly_numeric{\r\n+#\t( $a <=> $b ) || ( $a cmp $b );\r\n+\t( $a <=> $b );\r\n+}\r\n+\r\n+sub by_mostly_string{\r\n+#\t( $a <=> $b ) || ( $a cmp $b );\r\n+\t( $a cmp $b );\r\n+}\n+\r\n'
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/get_event_dist_fit.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/get_event_dist_fit.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,108 @@
+use Getopt::Long;
+use strict;
+
+use Cwd;
+my $PROG = $0;
+my $CUR_DIR = Cwd::abs_path(Cwd::cwd());
+my $PROG_ABS_PATH = Cwd::abs_path($PROG);
+my $SrcFolder=`dirname $PROG_ABS_PATH`;
+chomp($SrcFolder);
+
+#my $SrcFolder="/data/zhang/wuj/scripts/SpliceTrap.0.8/";
+my $InputFileName=$ARGV[0];
+my $OutputFileName = $ARGV[0].".hist";
+#$OutputFileName=$ARGV[1] if $ARGV[1] ne "";
+my $width=0.001;
+my $verbose=1;
+my $col=2;
+
+GetOptions (
+        'w:f'=>\$width,
+        'c:i'=>\$col,
+        'v'=>\$verbose
+);
+
+$width=$width*1;
+my $binnum=1/$width;
+$col=$col-1;
+
+my @CAratios;
+my @CSratios;
+my @ADratios;
+my @AAratios;
+my @AIratios;
+my @IRratios;
+
+open(Input, $InputFileName);
+while(my $line=<Input>)
+{
+        next if($line=~/^#/);
+        chomp($line);
+        my @array=split(/\s/,$line);
+ next if($array[$col]<=0.001 or $array[$col]>=0.999);
+#        push( @CAratios,$array[$col]) if($array[0]=~/^C[AS]/ or $array[0]=~/^ME/);
+ push( @CAratios,$array[$col]) if($array[0]=~/^CA/ or $array[0]=~/^ME/);
+ push( @CSratios,$array[$col]) if($array[0]=~/^CS/);
+ push( @ADratios,$array[$col]) if($array[0]=~/^AD/);
+ push( @AAratios,$array[$col]) if($array[0]=~/^AA/);
+ push( @AIratios,$array[$col]) if($array[0]=~/^AI/);
+ push( @IRratios,$array[$col]) if($array[0]=~/^IR/);
+
+}
+
+close(Input);
+
+open(tmpFile, ">$InputFileName.tmpca");
+for(my $i=0;$i<@CAratios;$i++)
+{
+ print tmpFile $CAratios[$i],"\n";
+}
+close(tmpFile);
+open(tmpFile, ">$InputFileName.tmpcs");
+for(my $i=0;$i<@CSratios;$i++)
+{
+        print tmpFile $CSratios[$i],"\n";
+}
+close(tmpFile);
+
+
+open(tmpFile, ">$InputFileName.tmpad");
+for(my $i=0;$i<@ADratios;$i++)
+{
+        print tmpFile $ADratios[$i],"\n";
+}
+close(tmpFile);
+
+open(tmpFile, ">$InputFileName.tmpaa");
+for(my $i=0;$i<@AAratios;$i++)
+{
+        print tmpFile $AAratios[$i],"\n";
+}
+close(tmpFile);
+
+open(tmpFile, ">$InputFileName.tmpai");
+for(my $i=0;$i<@AIratios;$i++)
+{
+        print tmpFile $AIratios[$i],"\n";
+}
+close(tmpFile);
+
+open(tmpFile, ">$InputFileName.tmpir");
+for(my $i=0;$i<@IRratios;$i++)
+{
+        print tmpFile $IRratios[$i],"\n";
+}
+close(tmpFile);
+
+system("R --slave --args $InputFileName.tmpca <$SrcFolder/beta_fit.R");
+system("R --slave --args $InputFileName.tmpad <$SrcFolder/beta_fit.R");
+system("R --slave --args $InputFileName.tmpaa <$SrcFolder/beta_fit.R");
+#system("R --slave --args $InputFileName.tmpai <$SrcFolder/R/beta_fit.R");
+system("R --slave --args $InputFileName.tmpir <$SrcFolder/beta_fit.R");
+system("R --slave --args $InputFileName.tmpcs <$SrcFolder/beta_fit.R");
+
+system("echo '#Width:$width' >$InputFileName.fit.hist");
+#system("paste $InputFileName.tmpca.fit $InputFileName.tmpad.fit $InputFileName.tmpaa.fit $InputFileName.tmpai.fit $InputFileName.tmpir.fit $InputFileName.tmpcs.fit >>$InputFileName.fit.hist");
+
+system("paste $InputFileName.tmpca.fit $InputFileName.tmpad.fit $InputFileName.tmpaa.fit $InputFileName.tmpir.fit $InputFileName.tmpcs.fit >>$InputFileName.fit.hist");
+#system("rm $InputFileName.tmpca* $InputFileName.tmpad* $InputFileName.tmpaa* $InputFileName.tmpai* $InputFileName.tmpir* ");
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/gtf2bed.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/gtf2bed.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,81 @@
+# rewrite on Sep 7th,2022
+
+#part of package SpliceTrap
+
+#Jie Wu
+use strict;
+
+my $inputfilename = $ARGV[0];
+
+# input file is a gtf file, 
+# "transcript_id" is required for each line and should not be ambiguous.
+# only the "exon" lines are used
+
+my %chr_hash; 
+my %strand_hash;
+my %tx_exons; #tx_exons{$tx_id){$start} = $size;
+
+my $linenum = 0;
+
+open(input, $inputfilename);
+
+while(my $line=<input>)
+{
+ $linenum++; 
+ my @a = split("\t",$line);
+ if ($a[2] eq "exon")
+ {
+ my $txid;
+ if($a[8]=~/transcript_id "(\S*?)"/)
+ {
+ $txid = $1;
+ }
+ else
+ {
+ die ("$inputfilename format error! No transcript_id in line $linenum \n");
+ }
+
+ if( exists $chr_hash{$txid} and $chr_hash{$txid} ne $a[0])
+ {
+ warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped \n");
+ next;
+ }
+ if( exists $strand_hash{$txid} and $strand_hash{$txid} ne $a[6])
+ {
+ warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped\n");
+ }
+ $chr_hash{$txid} = $a[0];
+ $strand_hash{$txid} = $a[6];
+ $tx_exons{$txid}{$a[3]} = $a[4] - $a[3] +1;
+
+ }
+
+}
+
+foreach my $txid (keys %chr_hash)
+{
+ my @starts;
+ my @sizes;
+ foreach my $start (sort {$a<=>$b} (keys %{$tx_exons{$txid}} ) )
+ {
+ push (@starts, $start);
+ push (@sizes, $tx_exons{$txid}{$start});
+ }
+ my $exon_num   = scalar(@sizes);
+ my $starts_str = "";
+ for(my $i = 0; $i < $exon_num; $i++)
+ {
+ $starts_str = $starts_str.($starts[$i] - $starts[0]).",";
+ if($i>0)
+ {
+ warn "$txid, intron size..".($starts[$i]-$starts[$i-1])."\n" if ($starts[$i]-$starts[$i-1]>1000000);
+ }
+ }
+ my $sizes_str  = join(",",@sizes);
+ my $end = $starts[$exon_num-1] + $sizes[$exon_num-1] -1;
+ print join("\t",$chr_hash{$txid}, $starts[0]-1, $end, $txid,"0",$strand_hash{$txid},$starts[0]-1, $end, "255,0,0",$exon_num,$sizes_str, $starts_str);
+        print "\n";
+}
+
+
+close(input);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/mapping_bowtie.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/mapping_bowtie.sh Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,89 @@
+#!/bin/bash
+#SrcFolder='/data/zhang/wuj/scripts/SpliceTrap.0.8'
+InputFileName=$1
+faorfq=$2
+DatabasePrefix=$3
+Outputfolder=$4
+SrcFolder=$5
+Threads=$6
+DatabaseFolder=$SrcFolder'/../db/'$DatabasePrefix'/btw/TXdb'
+TmpFolderName=`basename $1`
+#fasta or fastq
+
+cd $Outputfolder;
+#prepare the folder
+if [ -d $TmpFolderName.result ];then
+        echo "MAPPING: !!!Error, there is already a folder named "$TmpFolderName".result !"
+        echo "MAPPING: !!!change the name of that folder first in case I erase them..."
+        exit
+fi
+echo "MAPPING: Start mapping $InputFileName...Creating cache folder $TmpFolderName.result"
+mkdir  $TmpFolderName".result"
+mkdir $TmpFolderName".result"/cache
+
+cd $TmpFolderName".result"
+cd cache
+echo "MAPPING: Split to pieces ..."
+split -l 1000000 $InputFileName
+for name in x*
+do
+
+ if [ $faorfq == "fasta" ];then
+ add="-f"
+ fi
+# if [ $name != $InputFileName ];then
+ echo "bowtie -p $Threads -a -v 2 $DatabaseFolder $name $add >$name.btw; perl $SrcFolder/bowtie2eland.pl $name.btw $name $name.eland;rm $name.btw ;perl $SrcFolder/mark.mt.4eland.pl $name.eland >$name.nomt;rm $name.eland">>map.sh
+ echo $name >>checklist
+# fi
+done
+
+echo "MAPPING: submit scripts..."
+perl $SrcFolder/batchqsub.pl map.sh
+
+tasknum=`wc -l map.sh |tr -d "\n"`
+#checking..
+
+echo "MAPPING: mapping $InputFileName to TXdb done...start to check.."
+while [ 1 ]
+do
+ if [ -f mapcheck.sh ];then
+ rm mapcheck.sh
+ fi
+ while read checklist
+ do
+
+ name=`echo $checklist |tr -d "\n"`
+ echo "MAPPING: checking $name...."
+ readnum=`wc -l $name | cut -f1 -d" "`
+ if [ $faorfq == "fasta" ];then 
+ readnum=`echo "$readnum/2"|bc`
+ else
+ readnum=`echo "$readnum/4"|bc`
+ fi
+ if [ -f $name.nomt ];then
+ bowtienum=`wc -l $name.nomt | cut -f1 -d" "`
+ else
+ bowtienum=0
+ fi
+ if [ $bowtienum != $readnum ];then
+ echo "bowtie -p $Threads -a -v 2 $DatabaseFolder $name $add >$name.btw; perl $SrcFolder/bowtie2eland.pl $name.btw $name $name.eland;rm $name.btw ;perl $SrcFolder/mark.mt.4eland.pl $name.eland >$name.nomt;rm $name.eland">>mapcheck.sh
+
+ fi
+ done <checklist
+ if [ -f mapcheck.sh ];then
+ checktasknum=`wc -l mapcheck.sh |tr -d "\n"`
+ if [ $checktasknum == $tasknum ];then
+ echo "MAPPING: warning! none of the mapping tasks properly finished!"
+ fi
+ echo "MAPPING: resubmit TASKS...."
+ perl $SrcFolder/batchqsub.pl mapcheck.sh
+ else
+ break
+ fi
+done
+echo "MAPPING: Done.....merging files..."
+cat *.nomt >$Outputfolder/$TmpFolderName.nomt
+cd ../../
+rm $TmpFolderName.result -rf
+#/data/zhang/wuj/tools/bowtie-0.12.3/bowtie -a $DatabaseFolderTXdb -f
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/mapping_rmap.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/mapping_rmap.sh Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,80 @@
+#/data/zhang/wuj/tools/bowtie-0.12.3/bowtie -a --best /data/zhang/wuj/database/hg18/AS/TXdb.2/btw/TXdb -f -t s_1_sequence.txtparta >s_1.map &
+#SrcFolder='/data/zhang/wuj/scripts/SpliceTrap.0.8'
+InputFileName=$1
+faorfq=$2
+DatabasePrefix=$3
+Outputfolder=$4
+SrcFolder=$5
+DatabaseFolder=$SrcFolder'/../db/'$DatabasePrefix'/TXdb.fasta'
+TmpFolderName=`basename $1`
+
+cd $Outputfolder;
+#prepare the folder
+if [ -d $TmpFolderName.result ];then
+        echo "MAPPING: !!!Error, there is already a folder named "$TmpFolderName".result !"
+        echo "MAPPING: !!!change the name of that folder first in case I erase them..."
+        exit
+fi
+echo "MAPPING: Start to map $InputFileName....Creating cache folder $TmpFolderName.result"
+mkdir  $TmpFolderName".result"
+cd $TmpFolderName".result"
+mkdir cache stat
+cd cache
+echo "MAPPING: Split file..."
+split -l 1000000 $InputFileName
+echo "MAPPING: generating shell scirpts...."
+for name in x*
+do
+
+ echo "rmap -M 100 -m 2 -c $DatabaseFolder -o $name.rmap $name; perl $SrcFolder/rmap2eland.pl $name.rmap $name $name.eland;rm $name.rmap ;perl $SrcFolder/mark.mt.4eland.pl $name.eland >$name.nomt;rm $name.eland">>map.sh
+ echo $name >>checklist
+done
+tasknum=`wc -l map.sh |tr -d "\n"`
+perl $SrcFolder/batchqsub.pl map.sh
+
+echo "MAPPING: map $InputFileName to TXdb done...start to check.."
+while [ 1 ]
+do
+        if [ -f mapcheck.sh ];then
+                rm mapcheck.sh
+        fi
+        while read checklist
+        do
+
+                name=`echo $checklist |tr -d "\n"`
+                echo "MAPPING: checking $name...."
+                readnum=`wc -l $name | cut -f1 -d" "`
+                if [ $faorfq == "fasta" ];then
+                        readnum=`echo "$readnum/2"|bc`
+                else
+                        readnum=`echo "$readnum/4"|bc`
+                fi
+                if [ -f $name.nomt ];then
+                        rmapnum=`wc -l $name.nomt | cut -f1 -d" "`
+                else
+                        rmapnum=0
+                fi
+                if [ $rmapnum != $readnum ];then
+ echo "rmap -M 100 -m 2 -c $DatabaseFolder -o $name.rmap $name; perl $SrcFolder/rmap2eland.pl $name.rmap $name $name.eland;rm $name.rmap ;perl $SrcFolder/mark.mt.4eland.pl $name.eland >$name.nomt;rm $name.eland">>mapcheck.sh
+
+                fi
+        done <checklist
+        if [ -f mapcheck.sh ];then
+                checktasknum=`wc -l mapcheck.sh |tr -d "\n"`
+                if [ $checktasknum == $tasknum ];then
+                        echo "MAPPING: warning! none of the mapping tasks properly finished!"
+                fi
+                echo "MAPPING: resubmiting TASKS...."
+                perl $SrcFolder/batchqsub.pl mapcheck.sh
+        else
+                break
+        fi
+done
+echo "MAPPING: Mapping is really done....merging files..."
+
+
+cat *.nomt >$Outputfolder/$TmpFolderName.nomt
+cd ../../
+
+rm $TmpFolderName.result -rf
+
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/mark.mt.4eland.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/mark.mt.4eland.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,56 @@
+# this file is to convert mult mapped reads to nm reads by simply marked it as NM reads. 
+# its for the convience of inclusion ratio computation, if one read can be mapped to mult positions in the genome, then it will be marked as NM
+# later, can be used to add information for dealing with this mult reads, for example, the coverage in the region
+
+use strict;
+my $inputfilename=$ARGV[0];
+my $LongMarker="L";
+my $ShortMarker="S";
+
+
+open(input, $inputfilename);
+while(my $line=<input>)
+{
+ #print "new line\n";
+ chomp($line);
+ my @array = split("\t",$line);
+ my $match=$array[3];
+ if( $array[2] eq "NM" or $match eq "")
+ {
+ print $line,"\n";
+ next;
+ }
+
+ my $marker=$LongMarker.$ShortMarker;
+ my @genome_pos;
+ #while($match1=~/\/(\S[^,]*\[[$marker]\])\S[^,]*:(\d*)[RF]/g)
+ #this array is used to store the mapped position for this read
+ my @chr;
+ my @start;
+ my @end;
+ while($match=~/(chr\S[^\|]*)\|(\d*)\|(\d*)\|/g)
+ {
+ push @chr, $1;
+ push @start, $2;
+ push @end, $3;
+ }
+ @chr=sort(@chr);
+ if (scalar(@chr)<=1)
+ {
+                print $line,"\n";
+                next;
+        }
+
+ @start=sort(@start);
+ @end=sort(@end);
+ if($chr[0] ne $chr[scalar(@chr)-1] or $start[scalar(@chr)-1]-$start[0]>100000)
+ {
+ print $line, "\tMT\n";
+ }
+ else
+ {
+ print $line,"\n";
+
+ }
+}
+close(input);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/rmap2eland.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/rmap2eland.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,103 @@
+use strict;
+
+my $rmapfilename=$ARGV[0];
+my $readsfilename=$ARGV[1];
+my $elandfilename=$ARGV[2];
+
+my $detectformat=`head -c 1 $readsfilename`;
+
+#system("grep \"$detectformat\" $readsfilename |sort >$readsfilename.sort");
+system("awk 'NR%2==1' $readsfilename |sort >$readsfilename.sort");
+system("sort -k4,4 $rmapfilename >$rmapfilename.sort");
+
+
+open(readsfile, $readsfilename.".sort");
+
+
+
+#$looplinenumbers=2 if ($detectformat eq ">");
+open(rmapfile, $rmapfilename.".sort");
+open(elandfile, ">".$elandfilename);
+
+while(my $rmapline=<rmapfile>)
+{
+ chomp($rmapline);
+ my ($mapped_id, $start, $end, $rmapreadname, $mismatch, $strand)=split("\t",$rmapline);
+ while(my $readline=<readsfile>)
+ {
+ if($readline=~/^$detectformat/)
+ {
+ chomp($readline);
+ my $readname=substr($readline, 1, length($readline)-1);
+
+
+ if($readname ne $rmapreadname)
+ {
+ print elandfile $readname,"\tNA\tNM\n";
+ next;
+ }
+ else
+ {
+ my @mapped_ids=();
+ my @mapped_pos=();
+ my @mapped_strand=();
+ push(@mapped_ids, $mapped_id);
+ push(@mapped_pos,$start);
+ push(@mapped_strand,$strand);
+ while(1)
+ {
+ $rmapline=<rmapfile>;
+ chomp($rmapline);
+ ($mapped_id, $start, $end, $rmapreadname, $mismatch, $strand)=split("\t",$rmapline);
+ if( $rmapreadname eq $readname )
+ {
+ push(@mapped_ids, $mapped_id);
+ push(@mapped_pos,$start);
+ push(@mapped_strand,$strand);
+ }
+ else
+ {
+ seek(rmapfile, -1*length($rmapline)-1,1);
+ print elandfile $readname,"\t";
+ print elandfile "NA\t";
+ print elandfile scalar(@mapped_ids),":0:0\t";
+ for(my $i=0;$i<@mapped_ids;$i++)
+ {
+ print elandfile "/",$mapped_ids[$i];
+ print elandfile ":",$mapped_pos[$i]+1;
+ if($mapped_strand[$i] eq "+")
+ {
+ print elandfile "F0,";
+ }
+ else
+ {
+ print elandfile "R0,";
+ }
+
+ }
+ print elandfile "\n";
+ last;
+ }
+ }
+ last;
+
+ }
+ }
+ }
+}
+
+while(my $readline=<readsfile>)
+{
+        if($readline=~/^$detectformat/)
+        {
+             chomp($readline);
+             my $readname=substr($readline, 1, length($readline)-1);
+             print  elandfile $readname,"\tNA\tNM\n";
+        }
+}
+
+close(elandfile);
+close(rmapfile);
+
+
+close(readsfile);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/scan_nomt.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/scan_nomt.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,98 @@
+use strict;
+use FileHandle;
+
+
+my $File1Name=$ARGV[0];
+my $File2Name=$ARGV[1];
+
+my %FileHandle1;
+my %FileHandle2;
+my %chrlist;
+
+open(File1,$File1Name);
+open(File2,$File2Name);
+
+while(my $line1=<File1>)
+{
+ my $line2=<File2>;
+ chomp($line1);
+ chomp($line2);
+ my $read1status=substr($line1,length($line1)-2,2);
+ my $read2status=substr($line2,length($line2)-2,2);
+ #next if( ($read1status eq "NM" or $read1status eq "MT") and ($read2status eq "NM" or $read2status eq "MT");
+ my @array1=split("\t",$line1);
+ my @array2=split("\t",$line2);
+ my $chr1="";
+ my $chr2="";
+ if(scalar(@array1) eq 4)
+ {
+ if($array1[3]=~/(chr\S*?)\|/)
+ {$chr1=$1;}
+ }
+ if(scalar(@array2) eq 4)
+ {
+         if($array2[3]=~/(chr\S*?)\|/)
+         {$chr2=$1;}
+ }
+ my $chr=$chr1;
+ if ($chr eq "")
+ {
+ next if($chr2 eq "");
+ $chr=$chr2;
+ }
+ else
+ {
+  next if($chr2 ne "" and $chr2 ne $chr);
+
+ }
+ next if $chr eq "";
+ if(exists $chrlist{$chr})
+ {
+ my $fout1= $FileHandle1{$chr};
+ my $fout2= $FileHandle2{$chr};
+ if($read1status eq "MT")
+ {
+ print $fout1 $array1[0],"\t",$array1[1],"\tMT\n";
+ }
+ else
+ {
+ print $fout1 $line1,"\n";
+ }
+ if($read2status eq "MT")
+                {
+                        print $fout2 $array2[0],"\t",$array2[1],"\tMT\n";
+                }
+                else
+                {
+                        print $fout2 $line2,"\n";
+                }
+
+
+
+ }
+ else
+ {
+ $chrlist{$chr}=1;
+ my $fout1= new FileHandle;
+ open($fout1, ">".$File1Name.".".$chr);
+ $FileHandle1{$chr}=$fout1;
+
+ my $fout2= new FileHandle;
+                open($fout2, ">".$File2Name.".".$chr);
+                $FileHandle2{$chr}=$fout2;
+
+ }
+
+}
+
+foreach my $fout1 (keys %FileHandle1)
+{
+ close($fout1);
+}
+foreach my $fout2 (keys %FileHandle2)
+{
+        close($fout2);
+}
+
+close(File1);
+close(File2);
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/scanbed2txdb.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/scanbed2txdb.pl Thu Sep 07 15:06:58 2017 -0400
[
b'@@ -0,0 +1,490 @@\n+#argv0: input transcript bed file\n+#argv1: output filename, will be in AS format\n+\n+use strict;\n+my $AnnoFileName = $ARGV[0];\n+my $outputFileName = $ARGV[1];\n+\n+if($AnnoFileName eq "" or $outputFileName eq "")\n+{\n+\tprint "TXDBGEN: Please specify your input files\\n";\n+\texit;\n+}\n+\n+my $cachefolder = `basename $outputFileName`;\n+chomp($cachefolder);\n+my $cachefolder = $cachefolder.".cache";\n+my $AnnoFileBase = `basename $AnnoFileName`; \n+chomp($AnnoFileBase);\n+\n+if(! -e $cachefolder)\n+{\n+\tmkdir $cachefolder or die "CHECK: cannot mkdir $cachefolder\\n";\n+\tprint "TXDBGEN: mkdir $cachefolder\\n";\n+}\n+\n+my $CacheAnnoFileName = $cachefolder."/".$AnnoFileBase.".sort";\n+#sort the annotation file\n+\n+print "TXDBGEN: sort $AnnoFileName \\n";\n+\n+system("sort -k6,6 -k1,1 -k2,2n -k3,3n $AnnoFileName >$CacheAnnoFileName");\n+\n+$AnnoFileName = $CacheAnnoFileName;\n+\n+#read the annotations into hashes\n+\n+open(AnnoFile, $CacheAnnoFileName) or die "can not open",$CacheAnnoFileName;\n+#split the genes into contigs\n+\n+my $contigid = 0;\n+my $end_tmp = 0;\n+my $chr_tmp = "chr";\n+my $strand_tmp="NA";\n+\n+my $TXnumtmp=0;\n+\n+my %eventlist =();\n+#my %eventlist_af = ();\n+#my %eventlist_al = ();\n+my %evidences = ();\n+\n+open(my $output,">$outputFileName");\n+\n+open(my $output2, ">$outputFileName.evi");\n+while(my $line =<AnnoFile>)\n+{\n+\tchomp($line);\n+\tmy @a = split("\\t",$line);\n+\tmy $chr = $a[0];\n+\tmy $start = $a[1];\n+\tmy $end = $a[2];\n+\tmy $name = $a[3];\n+\tmy @sizes = split(",",$a[10]);\n+\tmy $strand = $a[5];\n+\tmy @start_shifts = split(",",$a[11]);\n+\t#my $chrstr = $chr.$strand;\n+\tmy @starts;\n+\tmy @ends;\n+\tfor(my $i=0;$i<@start_shifts;$i++)\n+        {\n+                $starts[$i]=$start_shifts[$i]+$start;\n+                $ends[$i] = $starts[$i]+$sizes[$i];\n+        }\n+\tif($start >$end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand)\n+        {\n+                $contigid++;\n+                #$ctgmultisonum++ if $TXnumtmp>1;\n+                $TXnumtmp=0;\n+\t\tmy $annos = scanevents(\\%eventlist, "inner") ;\n+\t\t#my $annos_af = scanevents(\\%eventlist_af, "af");\n+\t\t#my $annos_al = scanevents(\\%eventlist_al, "al");\n+\t\t\n+\t\tmy $stdout = select ($output);\n+\t\tmy $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);\n+\t\t#printanno($annos_af,$chr_tmp,$strand_tmp) if(scalar(%$annos_af)>0);\n+\t\t#printanno($annos_al,$chr_tmp,$strand_tmp) if(scalar(%$annos_al)>0);\n+\t\tselect($stdout);\n+\t\t$stdout = select ($output2);\n+\t\t#print cross information\n+\t\tforeach my $connect_str  (keys %$eventids) \n+\t\t{\n+\t\t\tprint $eventids->{$connect_str},"\\t";\n+\t\t\tforeach my $transcriptid  (keys %{$evidences{$connect_str}}) \n+\t\t\t{\n+\t\t\t\tprint $transcriptid,",";\n+\t\t\t}\n+\t\t\tprint "\\n";\n+\t\t}\n+\t\tselect($stdout);\n+\t\tprint "TXDBGEN: Contig ID $contigid at $chr $strand...\\n" if ( $contigid%1000 == 0);\n+\t\t%eventlist = ();\n+\t\t#%eventlist_al = ();\n+\t\t#%eventlist_af = ();\n+                #print CacheContigFile "#ctg$contigid\\n";\n+\n+        }\n+\t$TXnumtmp++;\n+        $end_tmp = $end if ($end > $end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand);\n+        $chr_tmp = $chr;\n+\t\n+\t$strand_tmp = $strand;\n+\t\n+\t#scan connections, 2 and 3 exons for CA/CS/AF\n+#\tif(scalar(@starts)>2)\n+#\t{\n+#\t\tmy $connectionstr_af =$starts[0]."-".\n+#\t\t\t\t$ends[0].",".\n+#\t\t\t\t$starts[1]."-".\n+#\t\t\t\t$ends[1]."," ;\n+#\t\n+#\t\t$eventlist_af{$connectionstr_af} = $starts[0];\n+#\t\tmy $connectionstr_al = $starts[scalar(@starts)-2]."-".\n+#\t\t\t\t$ends[scalar(@starts)-2].",".\n+#\t\t\t\t$starts[scalar(@starts)-1]."-".\n+#\t\t\t\t$ends[scalar(@starts)-1]."," ;\n+#\t\t$eventlist_al{$connectionstr_al} = $starts[scalar(@starts)-2];\n+#\t}\n+\t\t#didn\'t consider direction yet\n+        #add 1 exon for IR\n+\tfor(my $n=1;$n<4;$n++)\n+        {\n+\t\tfor(my $i=0;$i<scalar(@starts)-$n+1;$i++)\n+                {\n+\t\t\tmy $connectionstr = "";\n+\t\t\tfor(my $j=$i;$j<$i+$n;$j++)\n+                        {\n+                                $connectionstr = $connectionstr .\n+                                        $starts[$j]."-".\n+                                        $en'..b'\t\tprint "0,";\n+\t\t\tprint $a[2]-$a[0],",";\n+\t\t\tprint $a[4]-$a[0],"\\n";\n+\t\t\t\n+\t\t\tprint $chr,"\\t",$a[0],"\\t",$a[5],"\\t";\n+                        print $id.".".$num,"[S]\\t";\n+\t\t\tmy $connectstr_ca= $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";\n+\t\t\t$eventids{$connectstr_ca} = $id.".".$num."[S]";\n+                        print "0\\t";\n+                        print $strand,"\\t";\n+                        print $a[0],"\\t",$a[5],"\\t";\n+                        print "255,0,0\\t";\n+                        print "2\\t";    \n+                        print $a[1]-$a[0],",";\n+                        #print $a[3]-$a[2],",";\n+                        print $a[5]-$a[4],"\\t";\n+                        print "0,";\n+                        #print $a[2]-$a[0],",";\n+                        print $a[4]-$a[0],"\\n";\n+\t\t\t\n+\t\t}\n+\t\tif($annos->{$key} eq "ir")\n+\t\t{\n+\t\t\tmy @a=split(/[,-]/,$key);\n+\n+\t\t\tmy $chrid=substr($chr,3,length($chr)-3);\n+\t\t\tmy $id="IR-IR-$chrid"."-".$a[1]."-".$a[2];\n+\t\t\tmy $num=0;\n+\t\t\t$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};\n+\t\t\t$nums_per_isoform{$id}++;\n+\n+\t\t\tprint $chr,"\\t",$a[0],"\\t",$a[3],"\\t";\n+\t\t\tprint $id.".".$num,"[L]\\t";\n+\t\t\tprint "0\\t";\n+\t\t\tprint $strand,"\\t";\n+\t\t\tprint $a[0],"\\t",$a[3],"\\t";\n+\t\t\t$eventids{$a[0]."-".$a[3].","} = $id.".".$num."[L]";\n+\t\t\tprint "255,0,0\\t";\n+\t\t\tprint "3\\t";\n+\t\t\tprint $a[1]-$a[0],",";\n+\t\t\tprint $a[2]-$a[1],",";\n+\t\t\tprint $a[3]-$a[2],"\\t";\n+\t\t\tprint "0,";\n+\t\t\tprint $a[1]-$a[0],",";\n+\t\t\tprint $a[2]-$a[0],"\\n";\n+\t\t\t\n+\t\t\tprint $chr,"\\t",$a[0],"\\t",$a[3],"\\t";\n+                        print $id.".".$num,"[S]\\t";\n+\t\t\t$eventids{$key} = $id.".".$num."[S]";\n+                        print "0\\t";\n+                        print $strand,"\\t";\n+                        print $a[0],"\\t",$a[3],"\\t";\n+                        print "255,0,0\\t";\n+                        print "2\\t";    \n+                        print $a[1]-$a[0],",";\n+                        #print $a[3]-$a[2],",";\n+                        print $a[3]-$a[2],"\\t";\n+                        print "0,";\n+                        #print $a[2]-$a[0],",";\n+                        print $a[2]-$a[0],"\\n";\n+\t\t\n+\t\t}\n+\t\tif($annos->{$key} eq "ss")\n+\t\t{\n+\t\t\tmy @a=split(/[,-]/,$key);\n+\t\t\tmy $chrid=substr($chr,3,length($chr)-3);\n+\t\t\tmy $type="AA";\n+\t\t\tif( ($strand eq "+" && $a[1]==$a[2]) or ($strand eq "-" && $a[3]==$a[4]))\n+\t\t\t{\n+\t\t\t\t$type="AD";\n+\t\t\t}\n+\t\t\tmy $id="$type-$type-$chrid"."-".$a[2]."-".$a[3];\n+\t\t\tmy $connect_str_L = "";\n+\t\t\tmy $connect_str_S = "";\n+\t\t\tif($a[1]==$a[2])\n+\t\t\t{\n+\t\t\t\t$connect_str_L = $a[0]."-".$a[3].",".$a[4]."-".$a[5].",";\n+\t\t\t\t$connect_str_S = $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";\n+\t\t\t}\n+\t\t\telse\n+\t\t\t{\n+\t\t\t\t$connect_str_L = $a[0]."-".$a[1].",".$a[2]."-".$a[5].",";\n+\t\t\t\t$connect_str_S = $a[0]."-".$a[1].",".$a[3]."-".$a[5].",";\n+\t\t\t\t\n+\t\t\t}\n+\t\t\tmy $num=0;\n+\t\t\t$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};\n+\t\t\t$nums_per_isoform{$id}++;\n+\t\t\tprint $chr,"\\t",$a[0],"\\t",$a[5],"\\t";\n+\t\t\tprint $id.".".$num,"[L]\\t";\n+\t\t\t$eventids {$connect_str_L} = $id.".".$num."[L]";\n+\t\t\tprint "0\\t";\n+\t\t\tprint $strand,"\\t";\n+\t\t\tprint $a[0],"\\t",$a[5],"\\t";\n+\t\t\tprint "255,0,0\\t";\n+\t\t\tprint "3\\t";\n+\t\t\tprint $a[1]-$a[0],",";\n+\t\t\tprint $a[3]-$a[2],",";\n+\t\t\tprint $a[5]-$a[4],"\\t";\n+\t\t\tprint "0,";\n+\t\t\tprint $a[2]-$a[0],",";\n+\t\t\tprint $a[4]-$a[0],"\\n";\n+\t\t\t\n+\t\t\tprint $chr,"\\t",$a[0],"\\t",$a[5],"\\t";\n+                        print $id.".".$num,"[S]\\t";\n+\t\t\t$eventids {$connect_str_S} = $id.".".$num."[S]";\n+                        print "0\\t";\n+                        print $strand,"\\t";\n+                        print $a[0],"\\t",$a[5],"\\t";\n+                        print "255,0,0\\t";\n+                        print "2\\t";    \n+                        print $a[1]-$a[0],",";\n+                        #print $a[3]-$a[2],",";\n+                        print $a[5]-$a[4],"\\t";\n+                        print "0,";\n+                        #print $a[2]-$a[0],",";\n+                        print $a[4]-$a[0],"\\n";\n+\t\n+\t\n+\t\t}\n+\n+\t}\n+\treturn(\\%eventids);\n+\t\n+}\n+\n'
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/splitdb.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/splitdb.sh Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,11 @@
+workingfolder=$1;
+cd $workingfolder
+cut -f1 ../TXdb.bed |uniq >chr.list;
+while read line ;
+do 
+ chr=`echo $line |tr -d "\n"`;
+ grep -w $chr ../TXdb.bed >$chr.bed;
+ split -3000 $chr.bed $chr.;
+ echo $chr....;
+ rm $chr.bed;
+done <chr.list;
b
diff -r d4ca551ca300 -r adc0f7765d85 bin/vslz.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/vslz.pl Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,46 @@
+use strict;
+exit;
+
+my %config=do "$ENV{HOME}/.SpliceTrap.pl.ini";
+my $SrcFolder=$config{SrcFolder};
+
+my $BedFileName=$SrcFolder."/db/TXdb.1101.bed";
+my $RatioFileName=$ARGV[0];
+my $BedFileOutName=$ARGV[1];
+my %Ratios;
+
+open(RatioFile, $RatioFileName);
+
+while(my $RatioLine=<RatioFile>)
+{
+ chomp($RatioLine);
+ my @array=split("\t",$RatioLine);
+ $Ratios{$array[0]}=$array[2] if $array[14] eq "passed";
+}
+
+close(RatioFile);
+
+open(BedFile, $BedFileName);
+open(BedFileOut,">".$BedFileOutName.".bed");
+print BedFileOut "track name=$BedFileOutName discription=$BedFileOutName useScore=1\n";
+while(my $BedLine=<BedFile>)
+{
+ my @array=split("\t",$BedLine);
+ $array[3]=~/^(\S*)\[([LS])\]/;
+ my $id=$1;
+ my $LS=$2;
+ #print $LS,"\n";
+ next if not exists $Ratios{$id};
+ $array[4]=$Ratios{$id};
+ $array[4]=1-$Ratios{$id} if( $LS eq 'S');
+
+ $array[4]=sprintf("%.0f",$array[4]*1000);
+ for (my $i=0;$i<@array; $i++)
+ {
+ print BedFileOut $array[$i];
+ print BedFileOut "\t" if $i<scalar(@array)-1;
+ }
+ #print BedFileOut "\n";
+}
+close(BedFileOut);
+close(BedFile);
b
diff -r d4ca551ca300 -r adc0f7765d85 cutoffs/cutoff.pair.06.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cutoffs/cutoff.pair.06.txt Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,111 @@
+9 2.7 0.60603248295953 0.171888000000001
+18 2 0.60524889781465 0.183333
+27 2 0.614125387586727 0.176226000000001
+36 1.9 0.607174152514097 0.174107
+45 1.8 0.600015884070526 0.175151
+54 1.5 0.606211216622966 0.181161
+63 1.5 0.630096537351149 0.17829
+72 1.4 0.600041179609881 0.181678
+81 1.4 0.604419014379505 0.180618
+90 1.4 0.605503082593424 0.179083
+99 1.3 0.615709740786329 0.177374
+108 1.3 0.60153811996046 0.178839
+117 1.3 0.624394647716218 0.172525
+126 1.3 0.607071122577313 0.179287
+135 1.2 0.604935986506864 0.180048
+144 1.2 0.606377388150568 0.179824
+153 1.1 0.609872307866415 0.172894
+162 1.1 0.605221663116349 0.176559
+171 1.1 0.60084962792178 0.177017
+180 1.1 0.606042806460085 0.179842
+189 1.1 0.639661884576477 0.165058000000001
+198 1 0.63233306074533 0.172839
+207 1 0.620992090280475 0.17554
+216 1 0.617967295839559 0.177413000000001
+225 1 0.643160025381914 0.165310000000001
+234 1 0.664093921751576 0.161004
+243 1 0.617444847253655 0.179608
+252 0.9 0.600141496641319 0.18301
+261 0.9 0.61458063483465 0.175533
+270 0.9 0.631646650541943 0.175805
+279 0.9 0.633124568165035 0.169468
+288 0.9 0.611511089439885 0.172566
+297 0.9 0.620072975883583 0.176992
+306 0.8 0.632771460485054 0.177489
+315 0.8 0.616480687227451 0.177778
+324 0.8 0.627081395066885 0.173799
+333 0.8 0.637882935696159 0.166883
+342 0.8 0.601440520784007 0.181724
+351 0.8 0.6308436890493 0.169283000000001
+360 0.8 0.639014375873075 0.164053
+369 0.8 0.608995842436592 0.182319
+378 0.7 0.61599095431047 0.179065
+387 0.7 0.632637490417973 0.173393000000001
+396 0.7 0.611914615604433 0.174309000000001
+405 0.7 0.636212590337212 0.168285
+414 0.7 0.601768900444485 0.17904
+423 0.7 0.652306181407458 0.165425
+432 0.7 0.61284769661545 0.17678
+441 0.7 0.620417630501693 0.179685000000001
+450 0.7 0.630731905615349 0.172766000000001
+459 0.7 0.610688367962893 0.174217
+468 0.7 0.614116817389574 0.169074
+477 0.7 0.627626017337352 0.170694000000001
+486 0.7 0.625196910435048 0.173867
+495 0.7 0.624173913851294 0.170272
+504 0.7 0.600827986205662 0.176072
+513 0.7 0.60691053900984 0.173314
+522 0.7 0.637574555072294 0.164769
+531 0.7 0.623224687213309 0.173131
+540 0.7 0.606854298464756 0.17057
+549 0.7 0.602771105882753 0.176564
+558 0.7 0.635304573393458 0.165681
+567 0.7 0.614272507442932 0.178017
+576 0.7 0.647638749954493 0.161989
+585 0.7 0.635927720935647 0.166479000000001
+594 0.7 0.654872976893549 0.165962
+603 0.6 0.623281215201779 0.169602
+612 0.6 0.620036715664558 0.174000000000001
+621 0.6 0.6207915344063 0.173405000000001
+630 0.6 0.605666404221127 0.172404000000001
+639 0.6 0.605282217899342 0.171117
+648 0.6 0.605873079749684 0.173455
+657 0.6 0.622135854785987 0.171086
+666 0.6 0.609650459959578 0.180212
+675 0.6 0.615727466486383 0.175395
+684 0.6 0.650390040594351 0.157856
+693 0.6 0.600004956285639 0.17354
+702 0.6 0.619480037234101 0.169091000000001
+711 0.6 0.620537197134915 0.170687
+720 0.6 0.63097256862101 0.168571
+729 0.6 0.616862968480703 0.176372
+738 0.6 0.619853798956006 0.169892
+747 0.6 0.668527865891001 0.157351
+756 0.6 0.609299339958315 0.173282
+765 0.6 0.606997082083795 0.176418
+774 0.6 0.60385659737608 0.178646
+783 0.6 0.618166146904953 0.170539000000001
+792 0.6 0.655214353317497 0.162477
+801 0.6 0.667723279232349 0.161535
+810 0.6 0.641501812796978 0.165034000000001
+819 0.6 0.627258070106693 0.167323
+828 0.6 0.611937213679756 0.173702
+837 0.6 0.637164577326402 0.165999
+846 0.6 0.609589936966381 0.16902
+855 0.6 0.605378844036658 0.177104
+864 0.6 0.603894805290932 0.174897
+873 0.6 0.61196823740213 0.177185000000001
+882 0.6 0.645164364920278 0.169894
+891 0.6 0.616946196689952 0.164449
+900 0.6 0.620119902403488 0.173101
+909 0.6 0.612352323569787 0.175357000000001
+918 0.6 0.604523310198474 0.181906
+927 0.6 0.639238406673874 0.161621
+936 0.6 0.626807582940753 0.177092000000001
+945 0.6 0.649699083584984 0.166011
+954 0.6 0.630080161768358 0.175451
+963 0.6 0.634570582339133 0.174267
+972 0.6 0.613974438568147 0.171149000000001
+981 0.6 0.646832147519767 0.16271
+990 0.6 0.611137442619996 0.170677
+999 0.6 0.609074843250805 0.177417
b
diff -r d4ca551ca300 -r adc0f7765d85 cutoffs/cutoff.pair.07.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cutoffs/cutoff.pair.07.txt Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,111 @@
+9 3.8 0.723195172634673 0.136373
+18 3.1 0.704962064416613 0.145611
+27 2.7 0.707557510009516 0.148332
+36 2.7 0.727407566746925 0.139424000000001
+45 2.6 0.700849445263879 0.141565
+54 2.5 0.722912449885175 0.135534
+63 2.2 0.700978069072888 0.148155
+72 2.1 0.701953684982546 0.145638
+81 2.1 0.714519324839634 0.144058
+90 2 0.706725312179025 0.143797
+99 1.8 0.711963141532287 0.146397
+108 1.8 0.735577668185521 0.136275
+117 1.7 0.702810428805111 0.149303
+126 1.7 0.720543078508623 0.142071
+135 1.7 0.70149933958699 0.145076
+144 1.7 0.71795624649094 0.139588
+153 1.7 0.704835091689678 0.148916
+162 1.5 0.701545731183728 0.149502
+171 1.5 0.724190117580076 0.139706
+180 1.5 0.713150501847426 0.146353
+189 1.4 0.701931396757465 0.150412
+198 1.4 0.701219389501589 0.150832
+207 1.4 0.709880139643964 0.146954
+216 1.4 0.745895944323654 0.133521
+225 1.4 0.708094624059256 0.144051000000001
+234 1.4 0.714653417236785 0.141035
+243 1.4 0.702278770668715 0.146489
+252 1.4 0.714249389046789 0.140401
+261 1.4 0.720916794007688 0.140816
+270 1.4 0.704112631476039 0.140986
+279 1.4 0.725725706846333 0.140364
+288 1.4 0.713439724527154 0.142329
+297 1.3 0.703874456990055 0.145868
+306 1.3 0.714083967774953 0.138665
+315 1.3 0.717635749171277 0.139297
+324 1.3 0.70800987989496 0.14435
+333 1.3 0.703608941539679 0.143911
+342 1.3 0.712538846663198 0.142044
+351 1.3 0.736517626110649 0.137995
+360 1.3 0.752400150484099 0.133463
+369 1.3 0.700408117809905 0.149221
+378 1.2 0.710876753570226 0.145758
+387 1.2 0.719606098956021 0.141686
+396 1.1 0.701573537304199 0.149306
+405 1.1 0.711449711776699 0.146484
+414 1.1 0.7219174559929 0.140383
+423 1.1 0.711374621895577 0.147498
+432 1.1 0.706470648418195 0.146516
+441 1.1 0.702119897887331 0.144641
+450 1.1 0.710888241224795 0.14668
+459 1.1 0.700236191312255 0.145443
+468 1.1 0.709289110346942 0.145098000000001
+477 1.1 0.723682944143134 0.137715000000001
+486 1.1 0.711909258694556 0.148433000000001
+495 1.1 0.706050645629693 0.146412
+504 1.1 0.71026622444805 0.142836
+513 1.1 0.721840069030837 0.140123
+522 1.1 0.721616423306577 0.139145
+531 1.1 0.706599106603714 0.144744
+540 1.1 0.732906503371939 0.138953
+549 1.1 0.710747551039788 0.146028
+558 1.1 0.72403698505923 0.141907
+567 1.1 0.703858944181491 0.142942
+576 1.1 0.732227703846385 0.137018
+585 1.1 0.706081331511759 0.145895
+594 1.1 0.72851699215126 0.140481
+603 1.1 0.735776902337646 0.13646
+612 1 0.703269143581204 0.150933
+621 1 0.705868277579243 0.149988
+630 1 0.735304036557552 0.133891
+639 1 0.701919162079755 0.140775
+648 1 0.714774599695102 0.142638
+657 1 0.702484973766457 0.145888
+666 1 0.714275579611685 0.144289
+675 1 0.704751859660089 0.148070000000001
+684 1 0.725303040977116 0.139712
+693 1 0.7387858141582 0.134658
+702 1 0.737133543332787 0.135015
+711 1 0.71646444290315 0.140297
+720 1 0.735722464691124 0.136141
+729 1 0.709409780786399 0.144924000000001
+738 1 0.732557952972944 0.137912
+747 1 0.70280408850191 0.148749
+756 1 0.736014898114798 0.13601
+765 1 0.719075344267219 0.141568000000001
+774 1 0.705605602403474 0.142286
+783 1 0.714188898670159 0.143287
+792 1 0.708351298492815 0.14399
+801 1 0.701618595782959 0.140845
+810 1 0.70444245319677 0.142743000000001
+819 1 0.701265538602256 0.146322
+828 1 0.731290352078885 0.136012
+837 1 0.717812366249433 0.137717
+846 1 0.724767043040464 0.142316
+855 1 0.742408530369704 0.132985
+864 1 0.715927851943484 0.141312
+873 1 0.70674171896429 0.145039
+882 0.9 0.705311244948852 0.148678
+891 0.9 0.725745115884173 0.136097
+900 0.9 0.717545411117844 0.145251
+909 0.9 0.71139239340832 0.147716
+918 0.9 0.724110435852362 0.14723
+927 0.9 0.730304270252922 0.142741000000001
+936 0.9 0.702790994935148 0.14686
+945 0.9 0.714312901014806 0.142984
+954 0.9 0.718970705842182 0.14072
+963 0.9 0.702918434736422 0.146811
+972 0.9 0.763309367972586 0.135241
+981 0.9 0.70933162192829 0.14406
+990 0.9 0.714425781696018 0.142589
+999 0.9 0.707307109948989 0.146265
b
diff -r d4ca551ca300 -r adc0f7765d85 cutoffs/cutoff.pair.08.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cutoffs/cutoff.pair.08.txt Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,111 @@
+9 5.7 0.807699596880158 0.11075
+18 4.8 0.812102488456895 0.109521
+27 4.8 0.807059148683326 0.109122
+36 4.5 0.810359129334262 0.106039
+45 4.1 0.800899562591127 0.109428
+54 4 0.819734419515034 0.104148
+63 3.6 0.801137818633087 0.112097
+72 3.5 0.805611266779768 0.107837
+81 3.2 0.804721543883597 0.110678
+90 3.2 0.81977543427128 0.105428
+99 3.2 0.80949445276013 0.109698
+108 3.1 0.800113619307504 0.115339
+117 2.9 0.80163357132874 0.111922
+126 2.9 0.807430653105459 0.110639
+135 2.8 0.809101610526483 0.109881
+144 2.8 0.808015366939668 0.109303
+153 2.8 0.815481435245632 0.106845
+162 2.7 0.813972734460064 0.108256
+171 2.7 0.803878583908181 0.111767
+180 2.7 0.814908823498375 0.110845
+189 2.5 0.803934003900257 0.112334
+198 2.5 0.819507987133127 0.106736
+207 2.5 0.816975135145848 0.109487
+216 2.5 0.801969702289987 0.11426
+225 2.5 0.814854242652841 0.106264
+234 2.5 0.809009743289239 0.110027000000001
+243 2.5 0.816859180637723 0.10951
+252 2.5 0.825923196681225 0.104742
+261 2.3 0.805863672217281 0.108068
+270 2.3 0.812114384187481 0.110547
+279 2.3 0.811109163497932 0.109461
+288 2.3 0.80680260883464 0.104169
+297 2.3 0.800054845624205 0.110627
+306 2 0.80196269021579 0.117711
+315 2 0.809945512786952 0.109737
+324 2 0.819364610024312 0.110094
+333 2 0.812712294607599 0.111396
+342 2 0.801568817408624 0.113051
+351 2 0.804296495631151 0.112833
+360 2 0.809648638295373 0.106319
+369 2 0.810216908330331 0.110967
+378 2 0.80081744573067 0.112898
+387 2 0.809614612996421 0.114576
+396 2 0.800940767696918 0.109701
+405 1.8 0.800397828290365 0.115299
+414 1.8 0.80375463158553 0.116013
+423 1.8 0.80204726507529 0.110069
+432 1.8 0.829673313493101 0.109584
+441 1.8 0.801512586472861 0.110591
+450 1.8 0.801317568038407 0.11341
+459 1.8 0.806266605408301 0.11489
+468 1.8 0.804368073911341 0.112083
+477 1.8 0.80034140400705 0.113126
+486 1.8 0.803037870942011 0.112527
+495 1.8 0.816147055378533 0.113451
+504 1.8 0.804009255355535 0.113167
+513 1.8 0.809536527236661 0.111121
+522 1.8 0.810250462485448 0.113045
+531 1.8 0.813388275601197 0.117925
+540 1.8 0.810529197259932 0.111664
+549 1.8 0.80286846045568 0.114872
+558 1.8 0.805231767462103 0.112439
+567 1.8 0.803715104541846 0.112076
+576 1.8 0.800856311237744 0.113416
+585 1.8 0.815485369320364 0.113187
+594 1.8 0.810404807750137 0.111576
+603 1.8 0.80944688874663 0.114048
+612 1.8 0.813671662890222 0.107955
+621 1.8 0.805581198188585 0.111274
+630 1.5 0.801229670148542 0.121577
+639 1.5 0.812201198793304 0.109719
+648 1.5 0.80667869232878 0.111841
+657 1.5 0.817613487850624 0.110405
+666 1.5 0.816866335035283 0.107114
+675 1.5 0.8101804833249 0.117018
+684 1.5 0.818509280416726 0.115303
+693 1.5 0.80107589244594 0.116714
+702 1.5 0.829601652947073 0.107173
+711 1.5 0.820953668506561 0.108043
+720 1.5 0.811426464962464 0.114567
+729 1.5 0.800211543269215 0.1163
+738 1.5 0.814971683053337 0.111887
+747 1.5 0.810647742818646 0.111346
+756 1.5 0.823589356371534 0.1075
+765 1.5 0.808183381146845 0.115074
+774 1.5 0.804024248189691 0.11739
+783 1.5 0.809214446523965 0.114048
+792 1.5 0.816922307151774 0.116292
+801 1.5 0.817083370511117 0.107909
+810 1.5 0.803218112416446 0.122658
+819 1.5 0.801792365687339 0.115227
+828 1.5 0.824077216658734 0.106899
+837 1.5 0.808010406353855 0.115016
+846 1.5 0.805503181301868 0.111932
+855 1.5 0.810848938809874 0.114185
+864 1.5 0.80077383728855 0.11448
+873 1.5 0.804510616030174 0.115034
+882 1.5 0.800413194808912 0.119586
+891 1.5 0.820672192506787 0.110138
+900 1.5 0.810326379209945 0.112103
+909 1.5 0.811096513155687 0.117781
+918 1.5 0.804732736125644 0.117636
+927 1.5 0.807950118492599 0.113333
+936 1.5 0.807915776288532 0.113945
+945 1.5 0.832853748697962 0.107864
+954 1.5 0.819637006281129 0.112646
+963 1.5 0.807519443743277 0.110865
+972 1.5 0.816853380970702 0.113831
+981 1.5 0.802684401572777 0.116302
+990 1.5 0.809446019377896 0.112676
+999 1.5 0.804655052306582 0.112474
b
diff -r d4ca551ca300 -r adc0f7765d85 refGenes.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/refGenes.bed Thu Sep 07 15:06:58 2017 -0400
b
b'@@ -0,0 +1,66259 @@\n+chr1\t67092175\t67134971\tNM_001276351\t0\t-\t67093004\t67127240\t0\t8\t1429,187,70,113,158,92,86,42,\t0,3059,4076,23176,33576,34990,38966,42754,\n+chr1\t67092175\t67134971\tNM_001276352\t0\t-\t67093579\t67127240\t0\t9\t1429,70,145,68,113,158,92,86,42,\t0,4076,11062,19401,23176,33576,34990,38966,42754,\n+chr1\t67092175\t67134971\tNR_075077\t0\t-\t67134971\t67134971\t0\t10\t1429,70,145,68,143,113,158,92,86,42,\t0,4076,11062,19401,21438,23176,33576,34990,38966,42754,\n+chr1\t201283451\t201332993\tNM_000299\t0\t+\t201283702\t201328836\t0\t15\t453,104,395,145,208,178,63,115,156,177,154,187,85,107,2920,\t0,10490,29714,33101,34120,35166,36364,36815,38526,39561,40976,41489,42302,45310,46622,\n+chr1\t201283451\t201332993\tNM_001005337\t0\t+\t201283702\t201328836\t0\t14\t453,104,395,145,208,178,115,156,177,154,187,85,107,2920,\t0,10490,29714,33101,34120,35166,36815,38526,39561,40976,41489,42302,45310,46622,\n+chr1\t8352403\t8423687\tNM_001042682\t0\t-\t8355086\t8364133\t0\t13\t2717,181,147,721,223,1379,114,162,200,93,163,81,127,\t0,3015,3696,5792,7360,7708,9359,10279,11652,12342,13408,70323,71157,\n+chr1\t8352403\t8817640\tNM_001042681\t0\t-\t8355086\t8656297\t0\t23\t2717,181,147,721,223,1379,114,162,200,93,163,81,99,100,125,49,105,97,106,126,71,469,481,\t0,3015,3696,5792,7360,7708,9359,10279,11652,12342,13408,70323,113521,142659,145001,156223,188810,204071,205014,262157,271906,303569,464756,\n+chr1\t8352403\t8817640\tNM_012102\t0\t-\t8355086\t8656297\t0\t24\t2717,181,147,721,223,1379,114,162,200,93,163,81,99,100,125,49,105,97,106,126,71,469,185,481,\t0,3015,3696,5792,7360,7708,9359,10279,11652,12342,13408,70323,113521,142659,145001,156223,188810,204071,205014,262157,271906,303569,440000,464756,\n+chr1\t33513998\t34165274\tNM_001281956\t0\t-\t33519517\t34165097\t0\t71\t2572,213,139,88,113,162,63,180,112,74,174,174,180,177,183,174,189,174,249,174,195,186,189,147,189,114,81,146,178,189,210,117,70,119,105,97,125,204,96,114,117,195,188,139,192,203,127,192,157,170,189,216,117,189,188,139,195,327,183,113,104,122,125,88,78,113,208,195,113,217,364,\t0,5466,5813,7464,9308,10883,13197,19051,19797,23023,23437,26526,27131,28721,32038,36178,43735,45301,53594,55375,57533,58507,63297,66754,69643,72505,73089,86866,88370,91283,97042,100505,102907,103500,108168,109371,110520,111052,112487,119423,121215,122361,132649,138324,143947,148891,178931,184754,186518,195090,200588,202287,210198,210517,211350,212548,225141,229281,258570,274601,278424,296744,305714,306470,311698,332885,404095,421761,518595,574978,650912,\n+chr1\t33513998\t34165842\tNM_052896\t0\t-\t33519517\t34165813\t0\t70\t2572,213,139,88,113,162,63,180,112,74,174,174,180,177,183,174,174,174,195,186,189,147,189,114,81,146,178,189,126,210,117,70,119,105,97,125,204,96,114,117,195,188,139,192,203,127,192,157,170,189,216,117,189,188,139,195,327,183,113,104,122,125,88,78,113,208,195,113,217,96,\t0,5466,5813,7464,9308,10883,13197,19051,19797,23023,23437,26526,27131,28721,32038,36178,53594,55375,57533,58507,63297,66754,69643,72505,73089,86866,88370,91283,91864,97042,100505,102907,103500,108168,109371,110520,111052,112487,119423,121215,122361,132649,138324,143947,148891,178931,184754,186518,195090,200588,202287,210198,210517,211350,212548,225141,229281,258570,274601,278424,296744,305714,306470,311698,332885,404095,421761,518595,574978,651748,\n+chr1\t41847188\t42035925\tNR_038261\t0\t-\t42035925\t42035925\t0\t4\t1257,219,112,119,\t0,1682,71224,188618,\n+chr1\t75202130\t75611114\tNM_001130058\t0\t-\t75203726\t75541447\t0\t24\t1703,85,89,71,74,104,95,263,88,93,100,132,113,84,124,61,126,85,85,74,49,39,82,75,\t0,9337,11574,11788,12474,13623,15735,16359,17126,17669,20230,25595,31855,34856,36382,39870,40755,49079,72827,98481,137451,194452,339304,408909,\n+chr1\t75202132\t75611116\tNM_001320285\t0\t-\t75203726\t75300660\t0\t26\t1701,85,89,71,74,104,95,263,88,93,100,132,113,84,124,61,126,85,85,74,91,49,39,67,82,77,\t0,9335,11572,11786,12472,13621,15733,16357,17124,17667,20228,25593,31853,34854,36380,39868,40753,49077,72825,98479,109482,137449,194450,196222,339302,408907,\n+chr1\t75202132\t75611116\tNM_001320287\t0\t-\t75203726\t75300660\t0\t25\t1701,85,'..b'0123\t230123\t0\t8\t106,146,111,89,40,166,208,100,\t0,3710,5076,6459,7823,9904,12166,12494,\n+chr15_KI270727v1_random\t241092\t241174\tNR_049886\t0\t+\t241174\t241174\t0\t1\t82,\t0,\n+chr15_KI270727v1_random\t241092\t241174\tNR_049895\t0\t+\t241174\t241174\t0\t1\t82,\t0,\n+chr15_KI270727v1_random\t241092\t241174\tNR_128721\t0\t+\t241174\t241174\t0\t1\t82,\t0,\n+chr15_KI270727v1_random\t372321\t373405\tNM_001004719\t0\t+\t372419\t373361\t0\t1\t1084,\t0,\n+chr16_KI270728v1_random\t17231\t19833\tNM_001099687\t0\t+\t17263\t18809\t0\t2\t394,1037,\t0,1565,\n+chr16_KI270728v1_random\t17231\t19833\tNM_016212\t0\t+\t17263\t18809\t0\t2\t394,1037,\t0,1565,\n+chr16_KI270728v1_random\t17231\t19833\tNR_110886\t0\t+\t19833\t19833\t0\t4\t394,167,289,1037,\t0,499,1064,1565,\n+chr16_KI270728v1_random\t17231\t19833\tNR_110897\t0\t+\t19833\t19833\t0\t4\t394,167,289,1037,\t0,499,1064,1565,\n+chr16_KI270728v1_random\t17231\t19833\tNR_110910\t0\t+\t19833\t19833\t0\t3\t394,289,1037,\t0,1064,1565,\n+chr16_KI270728v1_random\t17231\t19833\tNR_110911\t0\t+\t19833\t19833\t0\t3\t394,289,1037,\t0,1064,1565,\n+chr16_KI270728v1_random\t17233\t19273\tNM_001205259\t0\t+\t17263\t18809\t0\t2\t392,477,\t0,1563,\n+chr16_KI270728v1_random\t17233\t19273\tNR_110914\t0\t+\t19273\t19273\t0\t3\t392,289,477,\t0,1062,1563,\n+chr16_KI270728v1_random\t17250\t19837\tNM_001330061\t0\t+\t17263\t18809\t0\t2\t375,1041,\t0,1546,\n+chr16_KI270728v1_random\t17251\t19833\tNM_001330066\t0\t+\t17263\t18809\t0\t2\t374,1037,\t0,1545,\n+chr16_KI270728v1_random\t933855\t936466\tNR_110898\t0\t-\t936466\t936466\t0\t3\t1039,290,268,\t0,1251,2343,\n+chr16_KI270728v1_random\t1001629\t1037618\tNR_135178\t0\t+\t1037618\t1037618\t0\t25\t105,103,322,130,110,172,175,173,104,151,211,133,128,196,134,396,152,192,188,193,183,182,148,197,233,\t0,1009,5706,6177,6753,7361,8561,11932,12489,13586,16444,16928,17168,17388,20254,20921,21434,22284,28770,30290,31190,34103,35109,35455,35756,\n+chr16_KI270728v1_random\t1331814\t1346848\tNR_130771\t0\t-\t1346848\t1346848\t0\t3\t5057,93,295,\t0,6640,14739,\n+chr16_KI270728v1_random\t1331814\t1346848\tNR_130772\t0\t-\t1346848\t1346848\t0\t2\t5057,295,\t0,14739,\n+chr17_GL000205v2_random\t54856\t57966\tNR_003682\t0\t-\t57966\t57966\t0\t1\t3110,\t0,\n+chr22_KI270731v1_random\t69141\t86923\tNR_003267\t0\t+\t86923\t86923\t0\t13\t189,171,135,81,193,158,150,137,188,128,113,114,280,\t0,732,4094,4362,9648,10233,11930,12603,16240,16660,16890,17084,17502,\n+chr22_KI270733v1_random\t122272\t135645\tNR_046235\t0\t+\t135645\t135645\t0\t1\t13373,\t0,\n+chr22_KI270733v1_random\t130203\t135280\tNR_003287\t0\t+\t135280\t135280\t0\t1\t5077,\t0,\n+chr22_KI270733v1_random\t121580\t121672\tNR_106782\t0\t+\t121672\t121672\t0\t1\t92,\t0,\n+chr22_KI270733v1_random\t121580\t121672\tNR_128715\t0\t+\t121672\t121672\t0\t1\t92,\t0,\n+chr22_KI270733v1_random\t121580\t121672\tNR_128716\t0\t+\t121672\t121672\t0\t1\t92,\t0,\n+chr22_KI270733v1_random\t121580\t121672\tNR_128717\t0\t+\t121672\t121672\t0\t1\t92,\t0,\n+chr22_KI270733v1_random\t125128\t125189\tNR_037458\t0\t+\t125189\t125189\t0\t1\t61,\t0,\n+chr22_KI270733v1_random\t125128\t125189\tNR_128714\t0\t+\t125189\t125189\t0\t1\t61,\t0,\n+chr22_KI270733v1_random\t125930\t127799\tNR_003286\t0\t+\t127799\t127799\t0\t1\t1869,\t0,\n+chr22_KI270733v1_random\t128876\t129032\tNR_003285\t0\t+\t129032\t129032\t0\t1\t156,\t0,\n+chr22_KI270733v1_random\t170214\t170275\tNR_037458\t0\t+\t170275\t170275\t0\t1\t61,\t0,\n+chr22_KI270733v1_random\t170214\t170275\tNR_128714\t0\t+\t170275\t170275\t0\t1\t61,\t0,\n+chr22_KI270733v1_random\t171011\t172880\tNR_003286\t0\t+\t172880\t172880\t0\t1\t1869,\t0,\n+chr22_KI270733v1_random\t173955\t174111\tNR_003285\t0\t+\t174111\t174111\t0\t1\t156,\t0,\n+chr22_KI270734v1_random\t72453\t74335\tNR_136575\t0\t+\t74335\t74335\t0\t1\t1882,\t0,\n+chr22_KI270734v1_random\t90958\t98408\tNR_136574\t0\t-\t98408\t98408\t0\t4\t429,155,73,884,\t0,5076,5960,6566,\n+chr22_KI270734v1_random\t131493\t137393\tNM_005675\t0\t+\t131645\t136994\t0\t5\t262,161,101,141,549,\t0,342,3949,4665,5351,\n+chr22_KI270734v1_random\t138078\t161750\tNM_001195226\t0\t-\t138479\t156446\t0\t14\t589,89,99,176,147,93,82,80,117,65,150,35,209,62,\t0,664,4115,5535,6670,6925,8561,9545,10037,10335,12271,12908,18210,23610,\n+chr22_KI270734v1_random\t138078\t161852\tNM_016335\t0\t-\t138479\t161586\t0\t15\t589,89,99,176,147,93,82,80,117,65,150,35,209,313,164,\t0,664,4115,5535,6670,6925,8561,9545,10037,10335,12271,12908,18210,23235,23610,\n'
b
diff -r d4ca551ca300 -r adc0f7765d85 splice_trap.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/splice_trap.xml Thu Sep 07 15:06:58 2017 -0400
[
@@ -0,0 +1,35 @@
+<tool id="splice_trap" name="SpliceTrap" version="1.0.0">
+    <description>A statistic tool for quantifying exon inclusion ratios in paired-end RNA-seq data, with broad applications for the study of alternative splicing.
+    </description>
+    <requirements>
+        <requirement type="package" version="1.2.1.1">bowtie</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        perl $__tool_directory__/SpliceTrap.pl -p 8 -l $__tool_directory__ -d hg38v3 -1 "$input1" -2 "$input2" -s "$read_size" "$output1" "$output2"
+    ]]></command>
+    <inputs>
+        <param type="data" name="input1" format="fastq" />
+        <param type="data" name="input2" format="fastq" />
+        <param name='read_size' type='integer' value='50' label="Read size"  />
+    </inputs>
+    <outputs>
+        <data name="output1" format="txt" />
+        <data name="output2" format="txt" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="input1.fastq"/>
+            <param name="input2" value="input2.fastq"/>
+            <output name="output1" file="output1.txt"/>
+            <output name="output2" file="output2.txt"/>
+        </test>
+    </tests>
+    <help> 
+        **SpliceTrap**
+    </help>
+    <citations>
+        <citation type="bibtex">
+            http://rulai.cshl.edu/splicetrap
+        </citation>
+    </citations>
+</tool>
b
diff -r d4ca551ca300 -r adc0f7765d85 src/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Makefile Thu Sep 07 15:06:58 2017 -0400
b
@@ -0,0 +1,3 @@
+all:
+ g++ -O2 splicetrap.estimate.cpp -o Pair_estimate_c
+ mv Pair_estimate_c ../bin
b
diff -r d4ca551ca300 -r adc0f7765d85 src/splicetrap.estimate.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/splicetrap.estimate.cpp Thu Sep 07 15:06:58 2017 -0400
[
b'@@ -0,0 +1,854 @@\n+//Author: Jie Wu@CSHL\n+//TO replace the original Perl code\n+\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <iostream>\n+#include <fstream>\n+#include <string.h>\n+#include <unistd.h>\n+#include <sstream>\n+#include <map>\n+#include <vector>\n+#include <math.h>\n+#include <time.h>\n+#include <limits>\n+#include <iomanip>\n+\n+using namespace std;\n+\n+int MAX_LINE_LEN = 1024*1024;\n+\n+void printusage()\n+{\n+    cout<< "\\tUsage:"<<endl;\n+    cout<<"\\t-s\\tread_size"<<endl;\n+    cout<<"\\t-b\\tIRM file"<<endl;\n+    cout<<"\\t-f\\tFZM file"<<endl;\n+    cout<<"\\t-1\\tmapping result 1"<<endl;\n+    cout<<"\\t-2\\tmapping result 2"<<endl;\n+    cout<<"\\t-d\\tTXdb database file"<<endl;\n+    cout<<"\\t-o\\toutput prefix"<<endl;\n+    \n+}\n+\n+struct ReadPairInfo\n+{\n+\tbool isPaired;\n+\tvector<int> IsoFlag; //flag as 1 if mapped to the isoform\n+\tvector<int> Size; //fragment size in the isoform\n+\tvector<int> Pos;\n+};\n+\n+class TXdb_entry{\n+\tpublic:\n+\t string chrid;\n+\t //int start;\n+\t //int end;\n+\t string id;\n+\t bool iscomplete;\n+\t int start;\n+\t int end;\n+\t int exonstarts[3];\n+\t char strand;\n+\t int exonsize[3];\n+\t int len1;\n+\t int len2;\n+\t vector< ReadPairInfo > pairs;\n+\t int snum1; //single-end reads\n+\t int snum2;\n+\t int snum12;\n+\t float ir, bir;\n+\t int enum1; //exon body reads\n+\t int enum2;\n+\t int enum3;\n+\t \n+\t int jnum12;\n+\t int jnum23;\n+\t int jnum13;\n+\n+\t int totalreadnum;\n+\t void estimate(vector<double> &FZM, map<string, vector<double> > &IRM, int &read_size)\n+\t {\n+\t\t float Maxe=0;\n+\t\t float Max = -numeric_limits<float>::max();\n+\t\t float BMaxe=0;\n+\t\t float BMax=-numeric_limits<float>::max();\n+\t\t for(float e1=0.001; e1<1;e1=e1+0.001)\n+\t\t {\n+\t\t\t float efix1 = e1*len1/\n+\t\t\t               (e1*exonsize[1]+len2);\n+\t\t\t float efix2 = 1-efix1;\n+\t\t\t float LL = snum12*log( efix1/(len1-read_size+1)\n+\t\t\t          + efix2/(len2-read_size+1) )\n+\t\t\t          + snum1*log(efix1/(len1-read_size+1)) \n+\t\t\t\t  + snum2*log(efix2/(len2-read_size+1));\n+\t\t\t\t  //cout<<LL<<endl;\n+\t\t\t\t  //cout<<efix1<<"\\t"  <<pairs.size()<<"\\t"  <<len2<<"\\t"\t<<endl;\n+\t\t         //int num1=0, num2=0,num12=0;\n+\t\t         for(int i=0;i<pairs.size();i++)\n+\t\t\t  {\n+                            if(pairs[i].IsoFlag[0] == 1)\n+\t\t\t    {\n+\t\t\t\t    if(pairs[i].IsoFlag[1] == 0)\n+\t\t\t\t    {\n+\t\t\t\t\t   // num1++;\n+\t\t\t\t\t    float tmp=FZM[pairs[i].Size[0]]*efix1/(len1-pairs[i].Size[0]+1);\n+\t\t\t\t\t    if (tmp == 0)\n+\t\t\t\t\t    {\n+\t\t\t\t\t    \t LL = LL - 308;\n+\t\t\t\t\t    }\n+\t\t\t\t\t    else\n+\t\t\t\t\t    {\n+\t\t\t\t\t\t LL = LL + log(tmp);\n+\t\t\t\t\t    }\n+\t\t\t\t    }\n+\t\t\t\t    else\n+\t\t\t\t    {\n+\t\t\t\t\t   // num12++;\n+\t\t\t\t\t    float tmp=(FZM[pairs[i].Size[0]]*efix1/(len1-pairs[i].Size[0]+1)) \n+\t\t\t\t\t          + (FZM[pairs[i].Size[1]]*efix2/(len2-pairs[i].Size[1]+1));\n+\t\t\t\t  \t    if(tmp ==0)\n+\t\t\t\t\t    {\n+\t\t\t\t\t    LL = LL -308;\n+\t\t\t\t\t    }\n+\t\t\t\t\t    else\n+\t\t\t\t\t    {\n+\t\t\t\t\t\t    LL = LL + log(tmp);\n+\t\t\t\t\t    }\n+\t\t\t\t    }\n+\t\t\t    }\n+\t\t\t    else\n+\t\t\t    {\n+\t\t\t\t    //num2++;\n+\t\t\t\t    float tmp = FZM[pairs[i].Size[1]]*efix2/(len2-pairs[i].Size[1]+1);\n+\t\t\t\t    if(tmp ==0)\n+\t\t\t\t    {\n+\t\t\t\t    LL= LL - 308; \n+\t\t\t\t    }\n+\t\t\t\t    else\n+\t\t\t\t    {\n+\t\t\t\t\t    LL = LL +log(tmp);\n+\t\t\t\t    }\n+\t\t\t    }\n+\t\t\t  }\n+\t\t\t  //cout<<num1<<"\\t"<<num2<<"\\t"<<num12<<"\\n";\n+\t\t\t  //cout<<LL<<endl;\n+\t\t\t  \n+\t\t\t  if(IRM["CA"].size() > 0)\n+\t\t\t  {\n+\t\t\t\t  float BLL;\n+\t\t\t\t  string eventtype =id.substr(0,2);\n+\t\t\t\t  if(!eventtype.compare("ME"))\n+\t\t\t\t  {\n+\t\t\t\t\t  eventtype = "CA";\n+\t\t\t\t  }\n+\t\t\t\t  BLL = LL + log(IRM[eventtype][int(e1/0.001)]);\n+\t\t\t\t  if(BLL > BMax)\n+\t\t\t\t  { \n+\t\t\t\t\t  BMax = BLL;\n+\t\t\t\t\t  BMaxe = e1;\n+\t\t\t\t  }\n+\t\t\t  }\n+\t\t\t\t  //cout<<LL<<endl;\n+\t\t\t  if(LL > Max)\n+\t\t\t  {\n+\t\t\t\t  Max= LL;\n+\t\t\t\t  Maxe = e1;\n+\n+\t\t\t  }\n+\t\t\t\t  \n+\t\t }\n+\t\t ir= Maxe;\n+\t\t bir= BMaxe;\n+\t }\n+\t\t\n+\t TXdb_entry(char *line)\n+\t {\n+\t\t size_t found;\n+\t\t string linestr(line);\n+\t\t found = linestr.find("[L]");\n+\t\t if(found != string::npos)\n+\t\t {\n+\t\t\t iscomplete=1;\n+\t\t\n+\t\t \tchar *token;\n+\n+\t\t \ttoken = strtok(line,"\\t");\n+\t\t \tif(token==NULL)\n+\t\t \t{\n+\t\t\t\t cout<<"error1"<<endl;\n+\t\t\t\t exit(0);\n+\t\t\t }\n+\t\t\t else\n+\t\t\t {\n+\t\t\t\t c'..b'gle_read_num<<endl;\n+      cout<<"Paired reads used:\\t"<<total_pair_read_num<<endl;\n+\n+      log_file<<"#total_read_num:\\t"<<total_read_num*2<<endl;\n+      log_file<<"#total_single_read_num:\\t"<<total_single_read_num<<endl;\n+      log_file<<"#total_pair_read_num:\\t"<<total_pair_read_num<<endl;\n+      \n+      //Calculate ratios and output\n+      cout<<"Now doing estimations and output to "<<ratio_file_name<<"\\t"<<num_file_name<<endl;\n+      ofstream ratio_file(ratio_file_name.c_str());\n+      ofstream num_file(num_file_name.c_str());\n+      map<string, TXdb_entry>::iterator txdb_it = TXdb_entries.begin();\n+      for(;txdb_it!=TXdb_entries.end();txdb_it++)\n+      {\n+\t      string eventid = txdb_it->first;\n+\t      //cout<< eventid <<endl; \n+\t      txdb_it->second.estimate(FZM,IRM,read_size);\n+/*\t      ratio_file<< eventid<<"\\t"\n+\t      \t\t<<setprecision(4)<<fixed\n+\t                <<txdb_it->second.ir<<"\\t"\n+\t\t\t<<txdb_it->second.bir<<"\\t"\n+\t\t\t<<"NA\\tNA\\tNA\\t"\n+\t\t\t<<txdb_it->second.jnum12<<"\\t"\n+\t\t\t<<txdb_it->second.jnum23<<"\\t"\n+\t\t\t<<txdb_it->second.jnum13<<"\\t"\n+\t\t\t<<(1.0*read_size*( txdb_it->second.enum1+txdb_it->second.jnum12 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[0] )<<"\\t"\n+\t\t\t<<(1.0*read_size*( txdb_it->second.enum2+txdb_it->second.jnum12 +txdb_it->second.jnum23 )/txdb_it->second.exonsize[1] )<<"\\t"\n+\t\t\t<<(1.0*read_size*( txdb_it->second.enum3+txdb_it->second.jnum23 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[2] )<<"\\t"\n+\t\t\t<<txdb_it->second.exonsize[0]<<"\\t"\n+\t\t\t<<txdb_it->second.exonsize[1]<<"\\t"\n+\t\t\t<<txdb_it->second.exonsize[2]<<"\\t"\n+\t\t<<(1.0*read_size*txdb_it->second.totalreadnum/txdb_it->second.len1)<<endl;\n+\t\t*/\n+\t      ratio_file<<eventid<<"\\t"\n+\t      \t\t<<setprecision(4)<<fixed\n+\t\t\t<<txdb_it->second.ir<<"\\t"  //1\n+\t\t\t<<txdb_it->second.bir<<"\\t" //2\n+\t\t\t<<txdb_it->second.chrid<<"\\t" //3\n+\t\t\t<<txdb_it->second.start<<"," //\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonsize[0]<<"\\t" //4\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonstarts[1]<<","\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonstarts[1]+txdb_it->second.exonsize[1]<<"\\t"//5\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonstarts[2]<<","\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonstarts[2]+txdb_it->second.exonsize[2]<<"\\t"//6\n+\t\t\t<<txdb_it->second.strand<<"\\t"//7\n+\t\t\t<<txdb_it->second.jnum12<<"\\t"//8\n+\t\t\t<<txdb_it->second.jnum23<<"\\t"//9\n+\t\t\t<<txdb_it->second.jnum13<<"\\t"//10\n+\t\t\t<<(1.0*read_size*( txdb_it->second.enum1+txdb_it->second.jnum12 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[0] )<<"\\t"//11\n+\t\t\t<<(1.0*read_size*( txdb_it->second.enum2+txdb_it->second.jnum12 +txdb_it->second.jnum23 )/txdb_it->second.exonsize[1] )<<"\\t"//12\n+\t\t\t<<(1.0*read_size*( txdb_it->second.enum3+txdb_it->second.jnum23 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[2] )<<"\\t"//13`\n+\t\t\t<<(1.0*read_size*txdb_it->second.totalreadnum/txdb_it->second.len1)<<"\\t"//14\n+\t\t\t<<txdb_it->second.exonsize[0]<<"\\t"//15\n+\t\t\t<<txdb_it->second.exonsize[1]<<"\\t"//16\n+\t\t\t<<txdb_it->second.exonsize[2]<<endl;//17\n+\n+\t      num_file<< eventid<<"\\t"\n+\t\t\t<<txdb_it->second.chrid<<"\\t"\n+\t\t\t<<txdb_it->second.start<<","\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonsize[0]<<"\\t"\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonstarts[1]<<","\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonstarts[1]+txdb_it->second.exonsize[1]<<"\\t"\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonstarts[2]<<","\n+\t\t\t<<txdb_it->second.start+txdb_it->second.exonstarts[2]+txdb_it->second.exonsize[2]<<"\\t"\n+\t\t\t<<txdb_it->second.strand<<"\\t"\n+                <<txdb_it->second.enum1<<"\\t"\n+\t\t\t<<txdb_it->second.enum2<<"\\t"\n+\t\t\t<<txdb_it->second.enum3<<"\\t"\n+\t\t\t<<txdb_it->second.jnum12<<"\\t"\n+\t\t\t<<txdb_it->second.jnum23<<"\\t"\n+\t\t\t<<txdb_it->second.jnum13<<endl;\n+      }\n+      log_file.close();\n+      ratio_file.close();\n+      num_file.close();\n+      clock_finish = clock();\n+      cout<<"Done! time used:"<<(double)(clock_finish-clock_start)/CLOCKS_PER_SEC<<" seconds"<<endl;\n+\n+      \n+}\n'
b
diff -r d4ca551ca300 -r adc0f7765d85 test-data/input1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input1.fastq Thu Sep 07 15:06:58 2017 -0400
b
b"@@ -0,0 +1,200484 @@\n+@ERR030881.107 HWI-BRUNOP16X_0001:2:1:13663:1096#0/1\n+ATCTTTTGTGGCTACAGTAAGTTCAATCTGAAGTCAAAACCAACCAATTT\n++\n+5.544,444344555CC?CAEF@EEFFFFFFFFFFFFFFFFFEFFFEFFF\n+@ERR030881.311 HWI-BRUNOP16X_0001:2:1:18330:1130#0/1\n+TCCATACATAGGCCTCGGGGTGGGGGAGTCAGAAGCCCCCAGACCCTGTG\n++\n+GFFFGFFBFCHHHHHHHHHHIHEEE@@@=GHGHHHHHHHHHHHHHHHHHH\n+@ERR030881.1487 HWI-BRUNOP16X_0001:2:1:4144:1420#0/1\n+GTATAACGCTAGACACAGCGGAGCTCGGGATTGGCTAAACTCCCATAGTA\n++\n+55*'+&&5'55('''888:8FFFFFFFFFF4/1;/4./++FFFFF=5:E#\n+@ERR030881.9549 HWI-BRUNOP16X_0001:2:1:1453:3458#0/1\n+AACGGATCCATTGTTTCGAGAACGTGATCGCCCTCATCTACCTAGCCTCA\n++\n+D<@DDA@A:AHHHHHHHHHHHHHHIHHHHHHHHHHHHHHHHHBHHHHHHH\n+@ERR030881.13497 HWI-BRUNOP16X_0001:2:1:16344:4145#0/1\n+GCTAATCCGACTTCTCGCCATCATCCTCCTGGTGGGTGTCACCATCGTGC\n++\n+F@FFFGGFGFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHEHHHHHHHHH\n+@ERR030881.14070 HWI-BRUNOP16X_0001:2:1:4377:4232#0/1\n+TGGACAGTTGCTCCTGGCTCCAGAACCTGTCTTGCAAGGGACAGTGGGGT\n++\n+A:AA@HHHHHHHHHHHHHHHHHHHIHHHHHHHHHHGF=GFHHHH@@?AA*\n+@ERR030881.16375 HWI-BRUNOP16X_0001:2:1:2265:4573#0/1\n+ATTAGGAAACATGGAATTTTTTTAAAGGTTTTTCTTGTATCTTTTTTTTT\n++\n+@<><CHHHHHHHHHHHHHHHHHGGHHHHHHHHHHGGGHHHHHHHHGGGGG\n+@ERR030881.18437 HWI-BRUNOP16X_0001:2:1:13904:4828#0/1\n+CAATAGCCAGATGGTTGGTGGGGCAGCCAGGCAGGGAGGACCCAGGGCTG\n++\n+555544555544555;AAAAFFBBEEEE;=FCB9F===<<FFFFEFFEEE\n+@ERR030881.18768 HWI-BRUNOP16X_0001:2:1:15563:4868#0/1\n+GTGCCAAATTGTCACATTCGAGCTTGAGGCTGTGGTACTGAGCTTGCAGT\n++\n+D>BFD@@?>>54454?FFGFGGGGGGGGGGGGGEGGGGGGGGGEGGGGGG\n+@ERR030881.20718 HWI-BRUNOP16X_0001:2:1:12184:5115#0/1\n+CCCGGCCTAACTTTCATTTAATTTCAATGAATTTTCTTTTTTTTTTTTTT\n++\n+56455==@=>HHHHHHHHHGHHHHHHHHGH=HHHHHHEEEECEEEEEEEE\n+@ERR030881.22833 HWI-BRUNOP16X_0001:2:1:13089:5358#0/1\n+GGAGAAGGGGCGAGGGAAGAAGACCTTTGCTATCCCAGATACCAGGACTG\n++\n+55544145444/444GFDFG9A@@@DD>.F@><<=FDD@AGG>GGEGGEG\n+@ERR030881.23643 HWI-BRUNOP16X_0001:2:1:7921:5452#0/1\n+CGGCCCCCTGCTAATCCGACTTCTCGCCATCATCCTCCTGGTGGGTGTCA\n++\n+FBDFFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHDHHHH\n+@ERR030881.28299 HWI-BRUNOP16X_0001:2:1:6428:5960#0/1\n+ATGAGAAGGAGCCATCAGGACCTTATGAAAGCGACGAAGACAAGAGTGAT\n++\n+55554DDFFFBBFFFHHGHHHHHHHHHHHHHHHHHHDHH8HHHHHHHHFH\n+@ERR030881.28475 HWI-BRUNOP16X_0001:2:1:14780:5977#0/1\n+CGAAAACCAACTCTTTACCTAACTTTGCATGGTGCTTAGTCAAGGACTCC\n++\n+555,4&4551FFFFFBF3BDFFFFFFEFFFFBEFFFFFFDFFFFFFFFF=\n+@ERR030881.29253 HWI-BRUNOP16X_0001:2:1:1570:6070#0/1\n+GGAATGTTTAGCACAAGACACAGCGGAGCTCGGGATTGGCTAAACTCCCA\n++\n+HGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.30545 HWI-BRUNOP16X_0001:2:1:4103:6216#0/1\n+CAACTCTTTACCTAACTTTGCATGGTGCTTAGTCAAGGACTCCTGCGACC\n++\n+54-55A@A@@HHHHHFFGGE555558<=;=55555AAAA?HHHHH>8@@>\n+@ERR030881.32582 HWI-BRUNOP16X_0001:2:1:12474:6471#0/1\n+CTTGCCTCACATGTCAGGGCAGGTATCCACCTAACCAGGCTGCAGGGGAG\n++\n+555555544444544HHHHGHHHHHHHHHHHHHHHHHHHHHH5@HFFF*F\n+@ERR030881.33730 HWI-BRUNOP16X_0001:2:1:14154:6628#0/1\n+CCAGCCTTGATACAGCATTTTCCACTTCTCTCTGTAGAGATCAGACGATT\n++\n+55555555(5@>@=:@=8.@04554CCCCC.441445444-555445555\n+@ERR030881.35226 HWI-BRUNOP16X_0001:2:1:3903:6867#0/1\n+CAGCATCCTGCTTAGGGCCCTGGAAACTGGGGAAATAGGTAGCCAGGTGG\n++\n+55555A@AAAGGEGGGGGGGGGGGGGGGGGGGCGGGFEGFGGGGFGGCGG\n+@ERR030881.38182 HWI-BRUNOP16X_0001:2:1:17495:7451#0/1\n+CACCATCGTGCCCGTTCTTGTCTTCCTTGGAGAGGTGGGCCTGGGAACCC\n++\n+5544455,0545445FFFEEFFFFFFFFFEEBC;D6<5-?FFFFFFFFFF\n+@ERR030881.41234 HWI-BRUNOP16X_0001:2:1:14816:8065#0/1\n+CTCTCCTCTAACCCTCCAGGCCTTAGCTTGCCTCACATGTCAGGGCAGGT\n++\n+55,34)4-53HHEHHGGGGG7DC?@GG;BGGEGGGGGGGGGGGGGGGGGA\n+@ERR030881.55301 HWI-BRUNOP16X_0001:2:1:7892:11256#0/1\n+CAAAAATGTAGCTGCCCTGACCTGGTCTCCCCTGACCCTTCCACGGGGCT\n++\n+56624545442525554455FFECECGEDGFF8DF###############\n+@ERR030881.57346 HWI-BRUNOP16X_0001:2:1:20039:11573#0/1\n+GACAGATGATGTCCAAGCCCCTACATGCCCCAGACCCCAGGGCACGGCTG\n++\n+##################################################\n+@ERR030881.57608 HWI-BRUNOP16X_0001:2:1:16788:11614#0/1\n+ATCTCGTAGTACATCACATAGTGACGCTGCATCTCTGACTTCTCACTGGC\n++\n+5653445555HHHHHHHHHH9;@=@HHHHHHDHHHHHHHHHHHHHHHHDH\n+@ERR030881.58998 HWI-BRUNOP16X_0001:2:1:14252:11816#0/1\n+CACCATTTGACCCTGAGCCAG"..b':6601:197274#0/1\n+CGGCCCCCTGCTAATCCGACTTCTCGCCATCATCCTCCTGGTGGGTGTCA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHH\n+@ERR030881.74446016 HWI-BRUNOP16X_0001:2:68:6384:197508#0/1\n+TGTGTCTTGTGCTAAACATTCCTTTCTCTCCGTGCCTCTGTCTCCCCTCT\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74446277 HWI-BRUNOP16X_0001:2:68:20062:197534#0/1\n+CAGCCCTCTCACCCTGGTACTGCATGCACGCAATGCTAGCTGCCCCTTTC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHEHHHGHIIHHHHHAHHHHHHHHHGH\n+@ERR030881.74446743 HWI-BRUNOP16X_0001:2:68:3752:197585#0/1\n+CTGGGACCCAGGCAGCTGCCACCTTGTCACCATGAGAGAATTTGGGGAGT\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHG\n+@ERR030881.74446915 HWI-BRUNOP16X_0001:2:68:8353:197599#0/1\n+GGACTGTCCACCAGGTCCCGACGGGCAGGAATGCAGATGGGTACCTTTCC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHFHHHHHHHE\n+@ERR030881.74447547 HWI-BRUNOP16X_0001:2:68:9591:197654#0/1\n+GCCAGTGGTGGGCATGCGGCTGCGGAGCACGTCCTGAGCTGTGGGGACGT\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHBDDBB@9@AAHHGHHHHHGHHDHHH\n+@ERR030881.74449534 HWI-BRUNOP16X_0001:2:68:1488:197840#0/1\n+CTACTCCTTCCGCAGCAGGGAGGTGTGCAGAGCCGTGCTCAGCTTCCTCT\n++\n+HHHHHHHHHHHHHHHHHHHHHHH8HAGFGGFHHHFGGHHHHHGHHHIHGH\n+@ERR030881.74453424 HWI-BRUNOP16X_0001:2:68:5325:198191#0/1\n+GTCCTGCCCTACCTCTCCCAAGAGCACCAGCAGCAGGTCTTGGGAGCCAT\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74454854 HWI-BRUNOP16X_0001:2:68:18716:198301#0/1\n+GCCGGGGCTGCTGCGCTTCGCGAGGTCTTGCTCCCTTGGGACCTGGTCTC\n++\n+55555?>?>>5555444555444442=5<=55444C=6C>2555551544\n+@ERR030881.74455894 HWI-BRUNOP16X_0001:2:68:18831:198398#0/1\n+CTGGGACCTGCGGGAGGGCCGCCAGCTGCAGCAGCATGACTTCAGCTCCC\n++\n+HHHHHHHHHHHHHHHHHGEHHHHHHHCHHHFHHHHHEFGDFHHHEHBFHH\n+@ERR030881.74457151 HWI-BRUNOP16X_0001:2:68:9093:198528#0/1\n+AAACAAAACATTTTCCTTTGGGTTTTTTTTTTTCTTTCTTTTTTCTCCGC\n++\n+HHHHHHGGGHHHHHHHHHHHHHHHHHHHGGGGGBGGGBHHGGGGGGGHHH\n+@ERR030881.74458067 HWI-BRUNOP16X_0001:2:68:15716:198600#0/1\n+GTTCCAACCACCGCCGGGGAGGGAGAGGGCCCCTGTCCCTGCAGGGCCGC\n++\n+ADAD?DEFBEHHHHHCCDGDHCEEHCGBGAHHHHHCDCGD5555424554\n+@ERR030881.74460390 HWI-BRUNOP16X_0001:2:68:15056:198815#0/1\n+CCTGGAACTGCCTGACCATAGTCTGATTCTGCAGGTCCCAGACCACAATG\n++\n+?ACDC?DDGG=DDD>55554GGFFADDDA==<==>D=DAD5445544445\n+@ERR030881.74460430 HWI-BRUNOP16X_0001:2:68:19789:198814#0/1\n+CACAAATCCCGTTCAGCCTTTTGACGATCTCAGCCTGTTTGTGCATCTCG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74460883 HWI-BRUNOP16X_0001:2:68:19795:198864#0/1\n+CTGCCTGGCACGCACCCGGTGGCTGCACCATCCACACGCAAGACTGCAAC\n++\n+HHHHHHHHHHHHHHHHHDHHHHHGHHFHHHHHHHHHHHFHHHFHGHFHHH\n+@ERR030881.74463349 HWI-BRUNOP16X_0001:2:68:7211:199081#0/1\n+CGGGGAGGTTGGGAGGGGGGACAGAGGGGAGACAGAGGCACGGAGAGAAA\n++\n+HHHHHHHGEHHHHHHHGGGGEHGHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74463429 HWI-BRUNOP16X_0001:2:68:16435:199090#0/1\n+CGGGCTCCTCGCACCTACCCCAGCAACTCAAATTCACCACCTCGGACTCC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHEHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74466171 HWI-BRUNOP16X_0001:2:68:1844:199339#0/1\n+ATTTTTTTAAAGGTTTTTCTTGTATCTTTTTTTTTTTTTTTTTTTTTTTT\n++\n+HHHHHHHGGHHHHGHHHGHC83=;><=@=<CCCCCCCCCCCCCCCCCCCC\n+@ERR030881.74466232 HWI-BRUNOP16X_0001:2:68:10444:199339#0/1\n+CCTGGGTCGCCCACCCTCACCCTGCTCCTCCCAGCTCAGCTAAGCTCGTC\n++\n+HHHHHHHHHHHHHHHGGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74466444 HWI-BRUNOP16X_0001:2:68:18815:199349#0/1\n+GTTTAGCACAAGACACAGCGGAGCTCGGGATTGGCTAAACTCCCATAGTA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHIH\n+@ERR030881.74468879 HWI-BRUNOP16X_0001:2:68:9428:199583#0/1\n+CACCAACCAGCCGCGGGCCGCGCAGCTGGTGGACAAGGACAGCACCTTCC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHGHH\n+@ERR030881.74470889 HWI-BRUNOP16X_0001:2:68:4971:199775#0/1\n+CAGAGCTTAGCGGGGGGCTGAGCTGGTGTCTTTGAACCTCTAGTCCCAGG\n++\n+HHHHHHHHHHHHHHHCGGGHEHHFHHEHHHHHHHHHHHEHHHHHFHHHHH\n+@ERR030881.74471439 HWI-BRUNOP16X_0001:2:68:16981:199816#0/1\n+TGTGTGCCCCATTTCTCCATATAGTCTTCCTCAGGCAGGTCCTAGGTCCC\n++\n+??DDDEDECC<=@><CCC@?<<<=@EGGGGG?GGGGCGCE>@@6=55554\n+@ERR030881.74471978 HWI-BRUNOP16X_0001:2:68:9605:199866#0/1\n+CCCAGGTCCTGCCCTACCTCTCCCAAGAGCACCAGCAGCAGGTCTTGGGA\n++\n+HHHHHHHHHGHHHHHHHHGHHHHHHHHHHHHFHHHHHHHHHHGHHHHAHE\n'
b
diff -r d4ca551ca300 -r adc0f7765d85 test-data/input2.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input2.fastq Thu Sep 07 15:06:58 2017 -0400
b
b"@@ -0,0 +1,200484 @@\n+@ERR030881.107 HWI-BRUNOP16X_0001:2:1:13663:1096#0/2\n+CGGATTTCAGCTACTGCAAGCTCAGTACCACAGCCTCAAGCTCGAATGTG\n++\n+HH;HHHHHGHHHHHHHHHHGHDHEHHHHHEHHHHBHHFHHHHHHHHHD0F\n+@ERR030881.311 HWI-BRUNOP16X_0001:2:1:18330:1130#0/2\n+GAGTGCGAGGGAAGTCAGGGGAGGATCGCGAGGGAAGCCAGGGGAGGATC\n++\n+HHHHHBF8G>&4555GGGGGHHGGEHHHHHHHHH=HHHHHHHHHHHGB9H\n+@ERR030881.1487 HWI-BRUNOP16X_0001:2:1:4144:1420#0/2\n+AACCGGGGGACGGGCCGGGGCTGCTGCGCTTCGCGAGGTCTTGCTCCCTT\n++\n+@FEEH>==9=05544FGFGFHHHBHHHFHF>AAAAHHHHHHHEHHHHHHH\n+@ERR030881.9549 HWI-BRUNOP16X_0001:2:1:1453:3458#0/2\n+TCAGCATGCTTCTTAGGGCCCTGGAAACTGGGGAAATAGGTAGCCAGGTG\n++\n+5515555/5515444FFHHHHHHHHHHHHHHHHHHHHHHHEHHHHGHH@H\n+@ERR030881.13497 HWI-BRUNOP16X_0001:2:1:16344:4145#0/2\n+GGCCAAGCAGGTCACCGCTCCCGAGCTGAACTCTATCATCCGACAGCAGC\n++\n+HHHHFGHHHGFAFFFHHFHHHHH/HHHHGHHEHHEHGFHHDGF=AA=@@8\n+@ERR030881.14070 HWI-BRUNOP16X_0001:2:1:4377:4232#0/2\n+TGGAGTCCTTCATGCCCAGGTCTGGAACCCAGGTTCTGACCCCAGGGCCC\n++\n+FDFFFEGGGGHHHHGHHHHH>AAA8GGGGGHHHGHHHHHHHHHHHGFHHH\n+@ERR030881.16375 HWI-BRUNOP16X_0001:2:1:2265:4573#0/2\n+GGCCAGCCGGGCTCCAGAGGGGTCAGGGCGCGACGAGAACCAACTCTTTA\n++\n+FDFFBDFDDBAAADDGHGHHHHBHHHHHGHGHHHHHHHHHHHHHHHHHFH\n+@ERR030881.18437 HWI-BRUNOP16X_0001:2:1:13904:4828#0/2\n+GGGCTCTCCCTCTGTATCGCCTGGGGAGGCTGCTGAGGTGACTTTTTGGA\n++\n+A?DDABFBFFHGHEHHHHHHHHHIHHDHCC55555BFFCD;:9=;=@=><\n+@ERR030881.18768 HWI-BRUNOP16X_0001:2:1:15563:4868#0/2\n+CACAGTAGGCGTTCTATAAATGTGTCACAAGAATGGCTTCCCTCAGGAAG\n++\n+55444;@=@>HHHDHHHHHFFGHHHHHHHHHIHHHFH=HHBB?<D#####\n+@ERR030881.20718 HWI-BRUNOP16X_0001:2:1:12184:5115#0/2\n+GCCTGGGCAACATAGCGAAACCACATCTCTACAAAAAAATCCTCCAAAAT\n++\n+HGIEHHHHGHF=@FF8A>>@HFHH=HHHHHHHIHHHGGGGH@@HHGGGEG\n+@ERR030881.22833 HWI-BRUNOP16X_0001:2:1:13089:5358#0/2\n+AGCCACTGCCTTTCTGCTCAGATGCTGGCACCTCCGCCCCCGGGGCTGCC\n++\n+EHHHFF?GFDGFFB???DDAD<FC<55555FFGGG<?>>61/5444-555\n+@ERR030881.23643 HWI-BRUNOP16X_0001:2:1:7921:5452#0/2\n+CGAGCTGAACTCTATCATCCGACAGCAGCTCCAAGCCCACCAGCTGTCCC\n++\n+HHHHHHHHHHHHHHHGHHHHGGHHHHHHHHHHHHHHHHHHHHHHHHDHHH\n+@ERR030881.28299 HWI-BRUNOP16X_0001:2:1:6428:5960#0/2\n+GGAGTCACAGGATTTGGAGGCAGGAGTGCTGGCGGGAAGGGCATTCAGGA\n++\n+HHHHHHHFEH?=DDDHIFHHEHEDE?DAADH@FHHC'@CE##########\n+@ERR030881.28475 HWI-BRUNOP16X_0001:2:1:14780:5977#0/2\n+CTCGGAAGGCAAGGCACATCTTGTGGTAGAAAATTTCGTGCAAATTAGGA\n++\n+HHHHHGGH=IADDADHHGHH444-4A?A?AGHGHHFGFG@/5544HDHEE\n+@ERR030881.29253 HWI-BRUNOP16X_0001:2:1:1570:6070#0/2\n+CTTCGCGAGGTCTTGCTCCCTTGGGACCTGGTCTCCCATCTGACCCTCCA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.30545 HWI-BRUNOP16X_0001:2:1:4103:6216#0/2\n+GTTTAAAGGTGATACTTATTCTCGGAAGGCAAGGCACATCTTGTGGGAGA\n++\n+EF;GG4445544544FF@FFEHFHFFHGHH####################\n+@ERR030881.32582 HWI-BRUNOP16X_0001:2:1:12474:6471#0/2\n+GGGACAGGGAGGTTGGGAGGGGGGACAGAGGGGAGACAGAGGCACGGAGA\n++\n+FF8FFBFFFFFDF@FCD>CFF@@F:HEHEHHHHBHHHHHF==<>5?DDA;\n+@ERR030881.33730 HWI-BRUNOP16X_0001:2:1:14154:6628#0/2\n+GTGAGGGTGGGCGACCCAGGATTCCCCCTCCCCTTCCCAAATAAAGATGA\n++\n+BEFDB44(4411445DA?ADHHHHIFDDC>:::5@DDDC?HHHDEBFFB>\n+@ERR030881.35226 HWI-BRUNOP16X_0001:2:1:3903:6867#0/2\n+CAGAGCGTAAGAAATGGATCCATTGTTCCGAGAACGTGATCGCCCTCATC\n++\n+HH@HHHFDHHHHHHHFHHHGHGHHHHHHHGGHHHHHFHHAHHHHHGHHGH\n+@ERR030881.38182 HWI-BRUNOP16X_0001:2:1:17495:7451#0/2\n+CCTCTCCCGAGCTGAACTCTATCATCCGACAGCAGCTCCAAGCCCACCAG\n++\n+GG/GGHHHHHHHHHHHHHHHHHHHDHHHHHFDHHHHHH@HHEHHHHHHHH\n+@ERR030881.41234 HWI-BRUNOP16X_0001:2:1:14816:8065#0/2\n+GGCAGGTTGGGAGGGGGGACAGAGGGGAGACAGAGGCACGGAGAGAAAGG\n++\n+FFGHH55,5514441>><<BHHEHFF?9F4FFFBFHHHHHHHHGHHFF4H\n+@ERR030881.55301 HWI-BRUNOP16X_0001:2:1:7892:11256#0/2\n+CTTCGCAAATTTGTCCCAGGGATGGATCGCCTGTGCTGCCTTCGCCCGCC\n++\n+D@5AA4453451444GGGFDHH@GEA;DDD=:=+:D@DFDEDHHB#####\n+@ERR030881.57346 HWI-BRUNOP16X_0001:2:1:20039:11573#0/2\n+CCTGTCCAGAGTCTGAGGGGGGAGGCCAGGCCCTGCCTTGGGGTCTGAGG\n++\n+##################################################\n+@ERR030881.57608 HWI-BRUNOP16X_0001:2:1:16788:11614#0/2\n+GGGGGGCGCCGCAGCTGCGCGGCCGCTCCCTCCTAGCCGGCCCTTGAGGG\n++\n+HHHHHHHEGHIHHHHHHDHF@@<A?FFE@FGGGAG4====HHHHHHHEHB\n+@ERR030881.58998 HWI-BRUNOP16X_0001:2:1:14252:11816#0/2\n+CTGAATCCCTTGCCCAGAGGA"..b':6601:197274#0/2\n+CCGCTCCCGAGCTGAACTCTATCATCCGACAGCAGCTCCAAGCCCACCAG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGH\n+@ERR030881.74446016 HWI-BRUNOP16X_0001:2:68:6384:197508#0/2\n+GGCCCTGCCCTTGACCCCACTACCCGTGGGGCTGCAGCCGCCTTCGCTGC\n++\n+HHHHHHHHHHHHHHHHHHIHHGHHH>A??@FHHHFHHFHDHH=HHB>4FF\n+@ERR030881.74446277 HWI-BRUNOP16X_0001:2:68:20062:197534#0/2\n+CTTTATTTGGGAAGGGGAGGGGGAATCCTGGGTCGCCCACCCTCACCCTG\n++\n+HHHHHHHGGHHHHHHHHFHGGGGGHHHHHHHGHHHHFHHEH9BHEDD###\n+@ERR030881.74446743 HWI-BRUNOP16X_0001:2:68:3752:197585#0/2\n+CGGCCGGCTGCATCCCACACCAGCCTGAGCCCCAGACGGTCAGTCAGTGC\n++\n+HHHHHHHHHHHHIHHHHHHHHHHHHHHHHHHHHHHHHHHEHHHHHHHHHH\n+@ERR030881.74446915 HWI-BRUNOP16X_0001:2:68:8353:197599#0/2\n+CGAGGGGTCCAGAGTGGAGAGAGCCCCGAGCAGGAGTGCATCTCCCTCGC\n++\n+HHHHHHHFHHHHHHHFHHHHHHHHHHHHHHHGHHHHHHHHHHIHFHHHGH\n+@ERR030881.74447547 HWI-BRUNOP16X_0001:2:68:9591:197654#0/2\n+GGCTGCAGATTCCATTCAGCAGGCCCGAGAGCAAGCACCACGCTAGCCTG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHG\n+@ERR030881.74449534 HWI-BRUNOP16X_0001:2:68:1488:197840#0/2\n+CAAGACTGCAACTTCAGATGCTCCGCACGCTGGAGATGCTGGACAGGGGC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFHHHHHHHEHEHE\n+@ERR030881.74453424 HWI-BRUNOP16X_0001:2:68:5325:198191#0/2\n+CTTCCTTGGAGAGGTGGGCCTGGGAACCCAGCGCGGACAGCGAGAGGAGG\n++\n+HHGHHHHHHHHHHHHHHHHHHGHHGHHHHHHHHHHHHHHHHHHHEHHHHG\n+@ERR030881.74454854 HWI-BRUNOP16X_0001:2:68:18716:198301#0/2\n+GGAATGTTTAGCACAAGACACAGCGGAGCTCGGGATTGGCTAAACTCCCA\n++\n+HF@GHD?>DA=<>;=444444245444445>>@>;BECBF@?A<>@AAA8\n+@ERR030881.74455894 HWI-BRUNOP16X_0001:2:68:18831:198398#0/2\n+GGACTGAGGACGACTCCTTGGACTGGAAAATGCTGGCCCCGTACGGCGTC\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHDHHHHHHIH\n+@ERR030881.74457151 HWI-BRUNOP16X_0001:2:68:9093:198528#0/2\n+GGAACCTTCTCCGGATTGGGTTCATGAGCATTTTTGTGGGTGTGTATGTG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGGHHHHHHHHHHFH\n+@ERR030881.74458067 HWI-BRUNOP16X_0001:2:68:15716:198600#0/2\n+GGCCGTCTTTGACCTGCTCCTGGCTGTTGGCATTGCTGCCTACCCTGGCA\n++\n+55555<@@@@===<655244A??DAC:C?#####################\n+@ERR030881.74460390 HWI-BRUNOP16X_0001:2:68:15056:198815#0/2\n+GGTGAGGCCAGCACCTTGTCCATTTGGGACCTGGCGGCGCCCACCCCCCG\n++\n+5-5449=;==BFFBFDBFDDC>?>>D?DDDHHHHHBFFC@44244<<<<<\n+@ERR030881.74460430 HWI-BRUNOP16X_0001:2:68:19789:198814#0/2\n+ATGATGTTTCCACAAAGCAGGCATTCGGGCTCCTCGCACCTACCCCAGCA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74460883 HWI-BRUNOP16X_0001:2:68:19795:198864#0/2\n+TGCTGCGGGTGTCTCCGGCTGGGCATGCGGGGGCCCGGGGACTGCCTGGC\n++\n+HHHHHHHHHHHHGHHHHFDEBDDBB5552*DDBBFHHHHH@FDF######\n+@ERR030881.74463349 HWI-BRUNOP16X_0001:2:68:7211:199081#0/2\n+CTGGTCTCCCATCTGACCCTCCAGGCCTTAGCTTGCCTCACATGTCAGGG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFH\n+@ERR030881.74463429 HWI-BRUNOP16X_0001:2:68:16435:199090#0/2\n+GGACCTGGGCACAAATCCCGTTCAGCCTTTTGACGATCTCAGCCTGTTTG\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n+@ERR030881.74466171 HWI-BRUNOP16X_0001:2:68:1844:199339#0/2\n+GGTGGGGGTCGTGGAGTGGGGGAGGGAGGCCAGCCGGGCTCCAGAGGGGT\n++\n+HHHHHHGGGHGBGEFHHHFHHG9GGC;HHEHHHCHFG@FFAA;=9DD;C7\n+@ERR030881.74466232 HWI-BRUNOP16X_0001:2:68:10444:199339#0/2\n+CCGTTTTGAACATGTGTAACCGACAGTCTGCCTGGGCCACAGCCCTCTCA\n++\n+HHHHHHHHHHHHHHHHHHHHHHHHIHHHHHGHDHGHHHBHCFFFFHHHHH\n+@ERR030881.74466444 HWI-BRUNOP16X_0001:2:68:18815:199349#0/2\n+GGAAGGGCCGGGGCTGCTGCGCTTCGCGAGGTCTTGCTCCCTTGGGACCT\n++\n+HHHHIHIHHHHHGHHHHHHHFHDHFGHHHHHEHEHHHHHHHHHHHHGHHH\n+@ERR030881.74468879 HWI-BRUNOP16X_0001:2:68:9428:199583#0/2\n+GGAGGCTGAAGTGCTGGACAGCCACGTAGGCCATGCCGAGGTAGGCAGCA\n++\n+HFHHHHHHHHHHIHGHHHHHHHHHHHHHHHHHEHHHHGGHH?FHHHHHGH\n+@ERR030881.74470889 HWI-BRUNOP16X_0001:2:68:4971:199775#0/2\n+GACATATTTGAGAGACACTGGGGAGACAGAATCGACCTGACCTTGCTGAC\n++\n+HHHHHHHHHHHHHHHHHHHHHHH@HHHEHHHFHHHHHGHAHFBEHHGFBG\n+@ERR030881.74471439 HWI-BRUNOP16X_0001:2:68:16981:199816#0/2\n+GTGACACTGCATTGCTGCTGCCAGCACCCCTTGTTAGGGTTTGTAATTGC\n++\n+F8HHHFGGG8DC>A>ADD1?##############################\n+@ERR030881.74471978 HWI-BRUNOP16X_0001:2:68:9605:199866#0/2\n+CTTGTCTTCCTTGGAGAGGTGGGCCTGGGAACCCAGCGCGGACAGCGAGA\n++\n+HHHHHHIHHHHHDHHGHHHGHHHHHHHHHHHHFGHHHHFHHDHHHCHHHH\n'
b
diff -r d4ca551ca300 -r adc0f7765d85 test-data/output1.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output1.txt Thu Sep 07 15:06:58 2017 -0400
b
b'@@ -0,0 +1,9901 @@\n+na\t0.0010\tchr10\t100260965,100261028\t100261028,100261044\t100261978,100262063\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t100273277,100275490\t100275490,100275493\t100280123,100280228\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t100286096,100286213\t100286213,100286332\t100286604,100286712\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t100329210,100329324\t100329324,100329346\t100329868,100330486\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t100523728,100523929\t100523929,100524139\t100526398,100526554\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t100827008,100827095\t100827542,100827561\t100827561,100829941\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t100981167,100981229\t100983304,100984489\t100984489,100985616\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101062541,101063103\t101064205,101064260\t101064260,101064421\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101134174,101134376\t101136667,101136690\t101136690,101137789\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101600737,101600862\t101601169,101601202\t101601202,101601336\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101605139,101605212\t101608834,101608866\t101608866,101608937\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101601202,101601336\t101608834,101608870\t101608870,101608937\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101774731,101774879\t101774879,101774912\t101775129,101775216\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101774731,101774879\t101774879,101774912\t101775739,101775776\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101828626,101828696\t101828696,101828717\t101829074,101829199\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t101829843,101829897\t101829897,101829942\t101831071,101831167\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t102137849,102138038\t102138618,102138719\t102138719,102138765\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t102148371,102148521\t102148627,102148633\t102148633,102148694\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t102157018,102157074\t102157185,102157188\t102157188,102157328\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t102358505,102358729\t102359266,102359269\t102359269,102359435\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t102402047,102402159\t102402251,102402254\t102402254,102402529\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t102416384,102416631\t102416631,102417121\t102418700,102418759\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t102656105,102656137\t102656244,102656269\t102656269,102656385\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t103451082,103451182\t103451182,103451285\t103452155,103452405\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t106573662,106577285\t106577285,106577555\t106579368,106579474\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t106573662,106577285\t106577285,106577555\t106579090,106579251\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t110875965,110876070\t110881232,110881274\t110881274,110881535\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t112182793,112182887\t112182887,112182891\t112183692,112183779\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t112443514,112443594\t112443594,112443606\t112445169,112445650\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t112446987,112447467\t112448377,112448381\t112448381,112448489\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t113146010,113146097\t113150982,113150997\t113150997,113151123\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t11320856,11320898\t11321188,11321206\t11321206,11321386\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t11314138,11314258\t11321188,11321206\t11321206,11321386\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t113588823,113589079\t113589079,113589082\t113589665,113589797\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t113679668,113679978\t113691963,113691996\t113691996,113692070\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t113697493,113697603\t113721000,113721031\t113721031,113721168\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t114129890,114129971\t114129971,114130669\t114131146,114131336\t-\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.0010\tchr10\t114447879,114448'..b'1387,94571429\t94573959,94574187\t94574553,94574736\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94573959,94574187\t94574553,94574736\t94576661,94576709\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94573959,94574187\t94574553,94574736\t94576661,94576805\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94574553,94574736\t94576661,94576709\t94588228,94588390\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94574553,94574736\t94576661,94576709\t94581325,94581522\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94574553,94574736\t94576661,94576709\t94577081,94577096\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94574553,94574736\t94576661,94576805\t94581325,94581522\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94574553,94574736\t94576661,94576805\t94588228,94588390\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94574553,94574736\t94576661,94576805\t94582962,94583059\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94576661,94576709\t94581325,94581522\t94582962,94583059\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94577081,94577096\t94581325,94581522\t94582962,94583059\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94581325,94581522\t94582962,94583059\t94588228,94588390\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94581325,94581522\t94582962,94583059\t94583987,94584125\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94582962,94583059\t94588228,94588390\t94590412,94590552\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94583987,94584125\t94588228,94588390\t94590412,94590552\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94576661,94576805\t94588228,94588390\t94590412,94590552\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94576661,94576709\t94588228,94588390\t94590412,94590552\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94588228,94588390\t94590412,94590552\t94590637,94590776\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94590412,94590552\t94590637,94590776\t94592228,94592312\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94590637,94590776\t94592228,94592312\t94592394,94592514\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94592228,94592312\t94592394,94592514\t94593498,94593615\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94592394,94592514\t94593498,94593615\t94594694,94594854\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94593498,94593615\t94594694,94594854\t94596859,94596956\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94594694,94594854\t94596859,94596956\t94597034,94597111\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94596859,94596956\t94597034,94597111\t94601527,94602099\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94683493,94683987\t94687769,94687932\t94688124,94688274\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94687769,94687932\t94688124,94688274\t94694916,94695077\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94688124,94688274\t94694916,94695077\t94706783,94706960\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94688124,94688274\t94694916,94695077\t94720395,94720537\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94706783,94706960\t94720395,94720537\t94724345,94724533\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94694916,94695077\t94720395,94720537\t94724345,94724533\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94720395,94720537\t94724345,94724533\t94733296,94733438\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94724345,94724533\t94733296,94733438\t94735262,94736191\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94762680,94762873\t94775057,94775220\t94775389,94775539\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94775057,94775220\t94775389,94775539\t94780498,94780659\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94775389,94775539\t94780498,94780659\t94781820,94781997\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94780498,94780659\t94781820,94781997\t94820495,94820637\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94781820,94781997\t94820495,94820637\t94842836,94843024\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+na\t0.9990\tchr10\t94820495,94820637\t94842836,94843024\t94849916,94850058\t+\t0.0000\t0.0000\t0.0000\t0.0000\n+\t0.9990\tchr10\t94842836,94843024\t94849916,94850058\t94852732,94853206\t+\t0.0000\t\t\t\n'
b
diff -r d4ca551ca300 -r adc0f7765d85 test-data/output2.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output2.txt Thu Sep 07 15:06:58 2017 -0400
b
b'@@ -0,0 +1,9901 @@\n+AA-AA-10-100261028-100261044.0\t0.0010\t0.0010\tchr10\t100260965,100261028\t100261028,100261044\t100261978,100262063\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t63\t16\t85\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-100275490-100275493.0\t0.0010\t0.0010\tchr10\t100273277,100275490\t100275490,100275493\t100280123,100280228\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t2213\t3\t105\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-100286213-100286332.0\t0.0010\t0.0010\tchr10\t100286096,100286213\t100286213,100286332\t100286604,100286712\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t117\t119\t108\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-100329324-100329346.0\t0.0010\t0.0010\tchr10\t100329210,100329324\t100329324,100329346\t100329868,100330486\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t114\t22\t618\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-100523929-100524139.0\t0.0010\t0.0010\tchr10\t100523728,100523929\t100523929,100524139\t100526398,100526554\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t201\t210\t156\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-100827542-100827561.0\t0.0010\t0.0010\tchr10\t100827008,100827095\t100827542,100827561\t100827561,100829941\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t87\t19\t2380\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-100983304-100984489.0\t0.0010\t0.0010\tchr10\t100981167,100981229\t100983304,100984489\t100984489,100985616\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t62\t1185\t1127\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101064205-101064260.0\t0.0010\t0.0010\tchr10\t101062541,101063103\t101064205,101064260\t101064260,101064421\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t562\t55\t161\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101136667-101136690.0\t0.0010\t0.0010\tchr10\t101134174,101134376\t101136667,101136690\t101136690,101137789\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t202\t23\t1099\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101601169-101601202.0\t0.0010\t0.0010\tchr10\t101600737,101600862\t101601169,101601202\t101601202,101601336\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t125\t33\t134\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101608834-101608866.0\t0.0010\t0.0010\tchr10\t101605139,101605212\t101608834,101608866\t101608866,101608937\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t73\t32\t71\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101608834-101608870.0\t0.0010\t0.0010\tchr10\t101601202,101601336\t101608834,101608870\t101608870,101608937\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t134\t36\t67\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101774879-101774912.0\t0.0010\t0.0010\tchr10\t101774731,101774879\t101774879,101774912\t101775129,101775216\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t148\t33\t87\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101774879-101774912.1\t0.0010\t0.0010\tchr10\t101774731,101774879\t101774879,101774912\t101775739,101775776\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t148\t33\t37\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101828696-101828717.0\t0.0010\t0.0010\tchr10\t101828626,101828696\t101828696,101828717\t101829074,101829199\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t70\t21\t125\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-101829897-101829942.0\t0.0010\t0.0010\tchr10\t101829843,101829897\t101829897,101829942\t101831071,101831167\t-\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t54\t45\t96\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-102138618-102138719.0\t0.0010\t0.0010\tchr10\t102137849,102138038\t102138618,102138719\t102138719,102138765\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t189\t101\t46\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-102148627-102148633.0\t0.0010\t0.0010\tchr10\t102148371,102148521\t102148627,102148633\t102148633,102148694\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t150\t6\t61\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-102157185-102157188.0\t0.0010\t0.0010\tchr10\t102157018,102157074\t102157185,102157188\t102157188,102157328\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t56\t3\t140\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-102359266-102359269.0\t0.0010\t0.0010\tchr10\t102358505,102358729\t102359266,102359269\t102359269,102359435\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t224\t3\t166\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-102402251-102402254.0\t0.0010\t0.0010\tchr10\t102402047,102402159\t102402251,102402254\t102402254,102402529\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t112\t3\t275\texon1=no\texon2=no\texon3=no\tna\n+AA-AA-10-102416631-10'..b'40\t139\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94590637-94590776.0\t0.0010\t0.9990\tchr10\t94590412,94590552\t94590637,94590776\t94592228,94592312\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t140\t139\t84\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94592228-94592312.0\t0.0010\t0.9990\tchr10\t94590637,94590776\t94592228,94592312\t94592394,94592514\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t139\t84\t120\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94592394-94592514.0\t0.0010\t0.9990\tchr10\t94592228,94592312\t94592394,94592514\t94593498,94593615\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t84\t120\t117\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94593498-94593615.0\t0.0010\t0.9990\tchr10\t94592394,94592514\t94593498,94593615\t94594694,94594854\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t120\t117\t160\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94594694-94594854.0\t0.0010\t0.9990\tchr10\t94593498,94593615\t94594694,94594854\t94596859,94596956\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t117\t160\t97\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94596859-94596956.0\t0.0010\t0.9990\tchr10\t94594694,94594854\t94596859,94596956\t94597034,94597111\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t160\t97\t77\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94597034-94597111.0\t0.0010\t0.9990\tchr10\t94596859,94596956\t94597034,94597111\t94601527,94602099\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t97\t77\t572\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94687769-94687932.0\t0.0010\t0.9990\tchr10\t94683493,94683987\t94687769,94687932\t94688124,94688274\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t494\t163\t150\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94688124-94688274.0\t0.0010\t0.9990\tchr10\t94687769,94687932\t94688124,94688274\t94694916,94695077\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t163\t150\t161\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94694916-94695077.0\t0.0010\t0.9990\tchr10\t94688124,94688274\t94694916,94695077\t94706783,94706960\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t150\t161\t177\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94694916-94695077.1\t0.0010\t0.9990\tchr10\t94688124,94688274\t94694916,94695077\t94720395,94720537\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t150\t161\t142\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94720395-94720537.0\t0.0010\t0.9990\tchr10\t94706783,94706960\t94720395,94720537\t94724345,94724533\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t177\t142\t188\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94720395-94720537.1\t0.0010\t0.9990\tchr10\t94694916,94695077\t94720395,94720537\t94724345,94724533\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t161\t142\t188\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94724345-94724533.0\t0.0010\t0.9990\tchr10\t94720395,94720537\t94724345,94724533\t94733296,94733438\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t142\t188\t142\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94733296-94733438.0\t0.0010\t0.9990\tchr10\t94724345,94724533\t94733296,94733438\t94735262,94736191\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t188\t142\t929\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94775057-94775220.0\t0.0010\t0.9990\tchr10\t94762680,94762873\t94775057,94775220\t94775389,94775539\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t193\t163\t150\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94775389-94775539.0\t0.0010\t0.9990\tchr10\t94775057,94775220\t94775389,94775539\t94780498,94780659\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t163\t150\t161\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94780498-94780659.0\t0.0010\t0.9990\tchr10\t94775389,94775539\t94780498,94780659\t94781820,94781997\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t150\t161\t177\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94781820-94781997.0\t0.0010\t0.9990\tchr10\t94780498,94780659\t94781820,94781997\t94820495,94820637\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t161\t177\t142\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94820495-94820637.0\t0.0010\t0.9990\tchr10\t94781820,94781997\t94820495,94820637\t94842836,94843024\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t177\t142\t188\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94842836-94843024.0\t0.0010\t0.9990\tchr10\t94820495,94820637\t94842836,94843024\t94849916,94850058\t+\t0\t0\t0\t0.0000\t0.0000\t0.0000\t0.0000\t142\t188\t142\texon1=no\texon2=no\texon3=no\tna\n+CA-CS-10-94849916-94850058.0\t0.0010\t0.9990\tchr10\t94842836,94843024\t94849916,94850058\t94852732,94853206\t+\t0\t0\t0\t0.0000\t\n\\ No newline at end of file\n'