diff bin/SpliceChange.pl @ 1:adc0f7765d85 draft

planemo upload
author bioitcore
date Thu, 07 Sep 2017 15:06:58 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/SpliceChange.pl	Thu Sep 07 15:06:58 2017 -0400
@@ -0,0 +1,174 @@
+# compare two outputs from SpliceTrap
+use strict;
+
+# the information needed
+# inclusion ratio	input file
+# filtered out or not	input file
+# minimal inclusion ratio	at least 0.1 for one condition
+# minimal splicing changes	parameter
+# orignial pipeline written by Martin Akerman
+# re-organized and re-written by Jie Wu
+
+use FileHandle;
+
+use Getopt::Long;
+
+my @programs = ('grep','mkdir','R','paste','awk','sort');
+foreach my $program (@programs)
+{
+        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
+}
+
+
+my $InputFileName1 = "";
+my $InputFileName2 = "";
+my $OutputFileName = "";
+my $minchange = 0.3;
+my $mininc = 0.1;
+my $noIRM = 0;
+
+
+GetOptions (
+	"1:s"=>\$InputFileName1,
+	"2:s"=>\$InputFileName2,
+	"o:s"=>\$OutputFileName,
+	"noIRM|noirm"=>\$noIRM,
+	"m:f"=>\$mininc,
+	"c:f"=>\$minchange
+);
+
+my $InputParaDes="	Usage of the script:
+	-1	input file 1, output from SpliceTrap, *.raw file in the output folder 
+	-2	input file 2. see above.
+	-o	output file prefix.
+	-c	minimal change required, [default:0.3]
+	-m	minimal inclusion ratio for at least one condition. [defualt:0.1]
+	--noIRM	Use the unadjusted inclusion ratios (before IRM correction)
+";
+
+if($InputFileName1 eq "" or $InputFileName2 eq "" or $OutputFileName eq "")
+{
+	print $InputParaDes;
+	exit;	
+}
+
+
+
+if(-d "$OutputFileName.cache" )
+{
+	print "Aborted! output cache folder exists: $OutputFileName.cache \n";
+	exit;
+}
+else
+{
+	system("mkdir $OutputFileName.cache");
+}
+
+#
+my %ir1; # records ir from file1
+my %ir2; # records ir from file2
+# only records trios above the cutoffs
+
+open(input1, $InputFileName1) or die "$InputFileName1 open error!\n";
+while(my $line=<input1>)
+{
+	chomp($line);
+	my @a = split("\t", $line);
+	if($a[21] ne "na")
+	{
+		if($noIRM)
+		{
+			$ir1{$a[0]} = $a[1];
+		}
+		else
+		{
+			$ir1{$a[0]} = $a[2];
+		}
+	}
+}
+print scalar(keys (%ir1) )," records loaded from $InputFileName1\n";
+close(input1);
+
+open(input2, $InputFileName2) or die "$InputFileName2 open error!\n";
+while(my $line=<input2>)
+{
+        chomp($line);
+        my @a = split("\t", $line);
+        if($a[21] ne "na")
+        {
+		if($noIRM)
+		{
+			$ir2{$a[0]} = $a[1];
+		}
+		else
+		{
+                	$ir2{$a[0]} = $a[2];
+		}
+        }
+}
+print scalar(keys (%ir2) )," records loaded from $InputFileName2\n";
+
+
+close(input2);
+
+
+##
+my %mean;
+my %sd;
+
+my %num;
+
+my %filehandles;
+ 
+my @types = ("CA", "IR", "AD","AA");
+
+foreach my $type (@types)
+{
+	my $fh = new FileHandle;
+	open($fh, ">$OutputFileName.cache/$type") or die "Cannot open $OutputFileName.cache/$type\n";
+	$filehandles{$type} = $fh;
+}
+
+
+foreach my $key (keys %ir1)
+{
+	if(exists $ir2{$key})
+	{
+		if(($ir1{$key} + $ir2{$key}) > 0)
+		{
+			#find the type
+			my $type = substr($key, 0, 2);
+			$type = "CA" if $type eq "CS";
+			$num{$type}++;
+			
+			my $change = ($ir2{$key} - $ir1{$key})/ ($ir1{$key} + $ir2{$key});
+			$mean{$type} = $mean{$type} + $change;
+			$sd{$type} = $change*$change + $sd{$type};
+
+			$change = sprintf("%.4f",$change);
+
+			my $fout =  $filehandles{$type};
+			print $fout $key,"\t",$ir1{$key},"\t",$ir2{$key},"\t",$change,"\n";
+		}
+	}
+}
+
+foreach my $type (keys %filehandles)
+{
+	close($filehandles{$type});
+	if($num{$type} == 0)
+	{
+		warn  "no AS events passed filters for both files\n";
+		next;
+	}
+	$mean{$type} = $mean{$type}/$num{$type};
+	$sd{$type} = sqrt($sd{$type}/$num{$type});
+	system("R  --slave --args  $OutputFileName.cache/$type $mean{$type} $sd{$type} $num{$type} <$SrcFolder/calc_pval.R");
+	system("paste $OutputFileName.cache/$type $OutputFileName.cache/$type.p |awk '(\$2>$mininc||\$3>$mininc)&&(\$4>$minchange||\$4<-$minchange)' |sort -k4nr >$OutputFileName.$type.report");
+	print "$num{$type} $type events processed...\n";
+	#print $mean{$type},"\t",  $sd{$type} ,"\t",$num{$type},"\n";
+
+}
+system("rm $OutputFileName.cache -rf");
+
+