# compare two outputs from SpliceTrap
use strict;

# the information needed
# inclusion ratio	input file
# filtered out or not	input file
# minimal inclusion ratio	at least 0.1 for one condition
# minimal splicing changes	parameter
# orignial pipeline written by Martin Akerman
# re-organized and re-written by Jie Wu

use FileHandle;

use Getopt::Long;

my @programs = ('grep','mkdir','R','paste','awk','sort');
foreach my $program (@programs)
{
        die ("CHECK: $program not found\n") if(system("hash $program >/dev/null"));
}


my $InputFileName1 = "";
my $InputFileName2 = "";
my $OutputFileName = "";
my $minchange = 0.3;
my $mininc = 0.1;
my $noIRM = 0;


GetOptions (
	"1:s"=>\$InputFileName1,
	"2:s"=>\$InputFileName2,
	"o:s"=>\$OutputFileName,
	"noIRM|noirm"=>\$noIRM,
	"m:f"=>\$mininc,
	"c:f"=>\$minchange
);

my $InputParaDes="	Usage of the script:
	-1	input file 1, output from SpliceTrap, *.raw file in the output folder 
	-2	input file 2. see above.
	-o	output file prefix.
	-c	minimal change required, [default:0.3]
	-m	minimal inclusion ratio for at least one condition. [defualt:0.1]
	--noIRM	Use the unadjusted inclusion ratios (before IRM correction)
";

if($InputFileName1 eq "" or $InputFileName2 eq "" or $OutputFileName eq "")
{
	print $InputParaDes;
	exit;	
}



if(-d "$OutputFileName.cache" )
{
	print "Aborted! output cache folder exists: $OutputFileName.cache \n";
	exit;
}
else
{
	system("mkdir $OutputFileName.cache");
}

#
my %ir1; # records ir from file1
my %ir2; # records ir from file2
# only records trios above the cutoffs

open(input1, $InputFileName1) or die "$InputFileName1 open error!\n";
while(my $line=<input1>)
{
	chomp($line);
	my @a = split("\t", $line);
	if($a[21] ne "na")
	{
		if($noIRM)
		{
			$ir1{$a[0]} = $a[1];
		}
		else
		{
			$ir1{$a[0]} = $a[2];
		}
	}
}
print scalar(keys (%ir1) )," records loaded from $InputFileName1\n";
close(input1);

open(input2, $InputFileName2) or die "$InputFileName2 open error!\n";
while(my $line=<input2>)
{
        chomp($line);
        my @a = split("\t", $line);
        if($a[21] ne "na")
        {
		if($noIRM)
		{
			$ir2{$a[0]} = $a[1];
		}
		else
		{
                	$ir2{$a[0]} = $a[2];
		}
        }
}
print scalar(keys (%ir2) )," records loaded from $InputFileName2\n";


close(input2);


##
my %mean;
my %sd;

my %num;

my %filehandles;
 
my @types = ("CA", "IR", "AD","AA");

foreach my $type (@types)
{
	my $fh = new FileHandle;
	open($fh, ">$OutputFileName.cache/$type") or die "Cannot open $OutputFileName.cache/$type\n";
	$filehandles{$type} = $fh;
}


foreach my $key (keys %ir1)
{
	if(exists $ir2{$key})
	{
		if(($ir1{$key} + $ir2{$key}) > 0)
		{
			#find the type
			my $type = substr($key, 0, 2);
			$type = "CA" if $type eq "CS";
			$num{$type}++;
			
			my $change = ($ir2{$key} - $ir1{$key})/ ($ir1{$key} + $ir2{$key});
			$mean{$type} = $mean{$type} + $change;
			$sd{$type} = $change*$change + $sd{$type};

			$change = sprintf("%.4f",$change);

			my $fout =  $filehandles{$type};
			print $fout $key,"\t",$ir1{$key},"\t",$ir2{$key},"\t",$change,"\n";
		}
	}
}

foreach my $type (keys %filehandles)
{
	close($filehandles{$type});
	if($num{$type} == 0)
	{
		warn  "no AS events passed filters for both files\n";
		next;
	}
	$mean{$type} = $mean{$type}/$num{$type};
	$sd{$type} = sqrt($sd{$type}/$num{$type});
	system("R  --slave --args  $OutputFileName.cache/$type $mean{$type} $sd{$type} $num{$type} <$SrcFolder/calc_pval.R");
	system("paste $OutputFileName.cache/$type $OutputFileName.cache/$type.p |awk '(\$2>$mininc||\$3>$mininc)&&(\$4>$minchange||\$4<-$minchange)' |sort -k4nr >$OutputFileName.$type.report");
	print "$num{$type} $type events processed...\n";
	#print $mean{$type},"\t",  $sd{$type} ,"\t",$num{$type},"\n";

}
system("rm $OutputFileName.cache -rf");


