diff SpliceTrap.pl @ 1:adc0f7765d85 draft

planemo upload
author bioitcore
date Thu, 07 Sep 2017 15:06:58 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SpliceTrap.pl	Thu Sep 07 15:06:58 2017 -0400
@@ -0,0 +1,285 @@
+#!/usr/bin/perl
+# Author: wuj@cshl.edu
+# Modified: Baekdoo Kim (baegi7942@gmail.com)
+use strict;
+use Getopt::Long;
+use Data::Dumper;
+####################
+use Cwd;
+my $PROG = $0;
+my $CUR_DIR = Cwd::abs_path(Cwd::cwd());
+my $PROG_ABS_PATH = Cwd::abs_path($PROG);
+#my $SrcFolder=`dirname $PROG_ABS_PATH`;
+#chomp($SrcFolder);
+#my %config=do "$ENV{HOME}/.SpliceTrap.pl.ini";
+#my $SrcFolder=$config{SrcFolder};
+
+my @programs = ('R','echo','cat','bash','perl','ln','mkdir','paste','grep','sort','basename','awk','wc','mv','cd','rm','split','head' );
+foreach my $program (@programs)
+{
+        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+
+}
+
+####################
+my $SrcFolder="";
+my $MapSoftware="bowtie";
+my $DatabasePrefix="hg38";
+my $ReadFileFormat="";
+my $ReadFile1Name="";
+my $ReadFile2Name="";
+my $CutoffLevel="M";
+my $Outputfolder=$CUR_DIR;
+my $OutputPrefix="Result";
+#my $CutoffOnly=0;
+my $ReadSize=36;
+my $JunctionCut=5;
+my $onGalaxy_raw="";
+my $onGalaxy_txt="";
+my $BowtieThreads=1;
+my $noIRMstr="";
+my $noIRM = 0;
+
+my $num_args = $#ARGV;
+$onGalaxy_raw = $ARGV[$num_args-1];
+$onGalaxy_txt = $ARGV[$num_args];
+
+GetOptions (
+	"l:s"=>\$SrcFolder,
+        "m:s"=>\$MapSoftware,
+        "d:s"=>\$DatabasePrefix,
+#       "f:s"=>\$ReadFileFormat,
+        "1:s"=>\$ReadFile1Name,
+        "2:s"=>\$ReadFile2Name,
+        "c:s"=>\$CutoffLevel,
+        "outdir:s"=>\$Outputfolder,
+        "o:s"=>\$OutputPrefix,
+        "j:i"=>\$JunctionCut,
+        "s:i"=>\$ReadSize,
+        "p:i"=>\$BowtieThreads,
+        "noIRM|noirm"=>\$noIRM
+#       "local:s"=>\$local,
+#       "rerun"=>\$CutoffOnly
+);
+#-O for galaxy output
+
+
+my $InputParaDes="      Usage of the script:
+	-l	Base Location (required)
+        -m      Mapping software: [bowtie]/rmap
+        -d      Database prefix: [hg18]/mm9/rn4/userdefined
+        -1      Read File 1
+        -2      Read File 2
+        -c      Cutoff Level:H/[M]/L
+                Means High, Middle or Low
+        -j      Junction reads requirement per junction for each exon-isoform [5]
+        -o      Output prefix {Result}
+        -s      Read Size [36]
+        --outdir Output folder [./]
+        -p      Bowtie parameter, threads number, only use this when you don't use qsub [1]
+        --noIRM Skip the IRM correction step
+        
+        This is a quick help, please refer to the README file for details.
+";
+
+
+if($SrcFolder eq "") {
+	print "[CHECK] - Please provide the location of the script (option '-l')\n\n";
+	exit;
+}
+
+if($ReadFile2Name eq "")
+{
+        $ReadFile2Name = $ReadFile1Name;
+        #trigger singled end mode
+}
+
+if($ReadFile1Name eq "" or $ReadFile2Name eq "" )
+{
+        print $InputParaDes;
+        exit;
+}
+
+if($BowtieThreads < 1)
+{
+        print $InputParaDes;
+        exit;
+}
+
+if (! -e "$SrcFolder/db/$DatabasePrefix/parallel")
+{
+        print "CHECK: Error, the database you specified is not properly installed.\n";
+        #print $InputParaDes;
+        exit;
+
+}
+
+if($CutoffLevel ne "H" and  $CutoffLevel ne "M" and  $CutoffLevel ne "L")
+{
+        print $InputParaDes;
+        exit;
+}
+
+
+$ReadFile1Name = Cwd::abs_path($ReadFile1Name);
+$ReadFile2Name = Cwd::abs_path($ReadFile2Name);
+
+#check the files
+open(check,$ReadFile1Name) or die ("CHECK: Error when opening $ReadFile1Name\n");
+my $checkoneline = <check>;
+if(substr($checkoneline,0,1) eq ">")
+{
+        $ReadFileFormat = "fasta";
+}
+elsif(substr($checkoneline,0,1) eq "@")
+{
+        $ReadFileFormat = "fastq";
+}
+else
+{
+        die("CHECK: ERROR:Please check $ReadFile1Name\n");
+}
+close(check);
+
+open(check,$ReadFile2Name) or die ("CHECK: Error when opening $ReadFile2Name\n");
+my $checkoneline = <check>;
+if(substr($checkoneline,0,1) eq ">")
+{
+        die("CHECK: $ReadFile2Name has a different format as $ReadFile1Name\n") if ($ReadFileFormat ne "fasta");
+}
+elsif(substr($checkoneline,0,1) eq "@")
+{
+        die("CHECK: $ReadFile2Name has a different format as $ReadFile1Name\n") if ($ReadFileFormat ne "fastq");
+}
+else
+{
+        die("CHECK: ERROR:Please check $ReadFile2Name\n");
+}
+close(check);
+
+$Outputfolder= Cwd::abs_path($Outputfolder);
+if($Outputfolder eq "/tmp")
+{
+        while(-e $Outputfolder)
+        {
+                my $random_foldername = random_sessid();
+                $Outputfolder = "/tmp/".$random_foldername;
+        }
+}
+
+
+if(! -e $Outputfolder)
+{
+        mkdir $Outputfolder or die "CHECK: cannot mkdir $Outputfolder\n";
+}
+if(! -d $Outputfolder)
+{
+        die "CHECK: $Outputfolder is not a folder\n";
+}
+
+if($MapSoftware eq "bowtie")
+{
+        print "CHECK: whether bowtie installed and in PATH\n";
+        my $bowtiechecker=`bowtie --version`;
+        if($bowtiechecker ne "")
+        {
+                print "CHECK: bowtie found, information below:\n";
+                print $bowtiechecker,"\n";
+        }
+        else
+        {
+                die "CHECK: No bowtie found in PATH, EXIT!\n";
+        }
+}
+elsif($MapSoftware eq "rmap")
+{
+        print "CHECK: checking rmap...\n";
+        if(system("type rmap &>/dev/null") ==0 )
+        {
+                print "CHECK: rmap found, continue\n";
+        }
+        else
+        {
+                die "CHECK: No rmap found in PATH, EXIT!\n";
+        }
+}
+else
+{
+        die "CHECK: option -m only takes rmap or bowtie as inputs\n";
+}
+
+if($ReadSize == 0)
+{
+        die "CHECK: Please check option -s Read size\n";
+}
+
+if($noIRM)
+{
+        $noIRMstr= "noirm";
+}
+
+#write more checks later
+print "PARAMETERS:\tMapping software:  ",$MapSoftware,"\n";
+print "PARAMETERS:\tDatabase prefix:   ",$DatabasePrefix,"\n";
+print "PARAMETERS:\tRead end 1:        ",$ReadFile1Name,"\n";
+print "PARAMETERS:\tRead end 2:        ",$ReadFile2Name,"\n" if($ReadFile2Name ne $ReadFile1Name);
+print "PARAMETERS:\tGalaxy_raw:        ",$onGalaxy_raw,"\n"; #if($onGalaxy_raw ne "");
+print "PARAMETERS:\tGalaxy_txt:        ",$onGalaxy_txt,"\n"; #if($onGalaxy_txt ne "");
+print "PARAMETERS:\tCutoff level:      ",$CutoffLevel,"\n";
+print "PARAMETERS:\tJunction reads.min:",$JunctionCut,"\n";
+print "PARAMETERS:\tOutput folder:     ",$Outputfolder,"\n";
+print "PARAMETERS:\tOutput prefix:     ",$OutputPrefix,"\n";
+print "PARAMETERS:\tRead size:         ",$ReadSize,"\n";
+print "PARAMETERS:\tBowtie threads #:  ",$BowtieThreads,"\n";
+print "PARAMETERS:\tNo IRM.\n" if ($noIRM);
+
+if($MapSoftware eq "bowtie")
+{
+        print "=================STAGE 1 MAPPING===================\n";
+        system("bash $SrcFolder/bin/mapping_bowtie.sh $ReadFile1Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder $BowtieThreads");
+        system("bash $SrcFolder/bin/mapping_bowtie.sh $ReadFile2Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder $BowtieThreads") if($ReadFile2Name ne $ReadFile1Name);
+        print "=================STAGE 2 ESTIMATION================\n";
+        #  ratio, log, nums
+        system("bash $SrcFolder/bin/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize $DatabasePrefix $Outputfolder $SrcFolder $noIRMstr") ;
+        print "=================STAGE 3 CUTOFF====================\n";
+        #   raw
+        system("bash $SrcFolder/bin/apply_cutoff.sh $OutputPrefix $CutoffLevel $Outputfolder $JunctionCut $SrcFolder $noIRMstr");
+
+
+}
+
+if($MapSoftware eq "rmap")
+{
+        print "=================STAGE 1 MAPPING===================\n";
+
+        system("bash $SrcFolder/bin/mapping_rmap.sh $ReadFile1Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder") ;
+        system("bash $SrcFolder/bin/mapping_rmap.sh $ReadFile2Name $ReadFileFormat $DatabasePrefix $Outputfolder $SrcFolder") if($ReadFile2Name ne $ReadFile1Name);
+        print "=================STAGE 2 ESTIMATION================\n";
+
+        system("bash $SrcFolder/bin/batch_para_cov10p_fit.sh $ReadFile1Name $ReadFile2Name $OutputPrefix $ReadSize $DatabasePrefix $Outputfolder $SrcFolder $noIRMstr") ;
+        print "=================STAGE 3 CUTOFF====================\n";
+        system("bash $SrcFolder/bin/apply_cutoff.sh $OutputPrefix $CutoffLevel $Outputfolder $JunctionCut $SrcFolder $noIRMstr");
+
+
+}
+
+#print "============ALL DONE, OUTPUTFILE:$OutputPrefix.txt\n";
+
+if($onGalaxy_raw ne "" && $onGalaxy_txt ne "")
+{
+        print "OUTPUTFILE:$OutputPrefix.raw\n";
+        system("grep -v na $Outputfolder/$OutputPrefix.raw >$onGalaxy_raw");
+        print "OUTPUTFILE:$OutputPrefix.txt\n";
+        system("grep -v na $Outputfolder/$OutputPrefix.txt >$onGalaxy_txt");
+}
+
+print "============Clean up\n";
+system("rm -r $Outputfolder/$OutputPrefix.*");
+
+sub random_sessid {
+        #my @chars = (0..9,a..z,A..Z);
+        my @chars = ('a'..'z','A'..'Z');
+        my $len = 10;
+        my $string = join '', map {$chars[rand(@chars)]} (1..$len);
+        return $string;
+}