changeset 26:859999cb135b draft

revised routines. Better handling of collapsing, isoforms and mutiple mapppings. Barplots revised to visualise average base coverage in exon instead of total number of reads in exon
author geert-vandeweyer
date Wed, 29 Nov 2017 08:12:27 -0500
parents 6cb012c8497a
children 576bfc1586f9
files CoverageReport.pl
diffstat 1 files changed, 195 insertions(+), 70 deletions(-) [+]
line wrap: on
line diff
--- a/CoverageReport.pl	Thu Feb 12 09:54:03 2015 -0500
+++ b/CoverageReport.pl	Wed Nov 29 08:12:27 2017 -0500
@@ -26,18 +26,20 @@
 # m : (m)inimal Coverage threshold
 # f : fraction of average as threshold
 # n : sample (n)ame.
- 
+# T : collapse overlapping Target regions. 
 
 getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ;
 
 # make output directory in (tmp) working dir
 our $wd = "/tmp/Coverage.".int(rand(1000));
+
 while (-d $wd) {
 	$wd = "/tmp/Coverage.".int(rand(1000));
 
 }
 system("mkdir $wd");
-
+$wd = "/tmp/Coverage.993";
+print "wd : $wd\n";
 ## variables
 our %commandsrun = ();
 
@@ -89,38 +91,103 @@
 	my $tmptargets = "$wd/collapsedtargets.bed";
 	system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed");
 	system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets");
-	$opts{'t'} = $tmptargets;
+	open IN, $tmptargets;
+	open OUT, ">$wd/collapsed.targets.renamed.bed";
+	# we assume that overlapping fragments come from isoforms, not from different genes.
+	my %counters = ();
+	my @genes = ();
+	while (<IN>) {
+		chomp;
+		my @p = split(/\t/,$_);
+		my @g = split(/,/,$p[3]);
+		$g[0] =~ m/(\S+)\(.*/;
+		my $gene = $1;
+		if (!defined($counters{$gene})) {
+			push(@genes,$gene);
+			$counters{$gene}{'lines'} = ();
+			$counters{$gene}{'orient'} = $p[5];
+		}
+		$p[3] = $gene."(COLLAPSED)";
+		push(@{$counters{$gene}{'lines'}},\@p);
+	}
+	close IN;
+	foreach my $gene (@genes) {
+		if ($counters{$gene}{'orient'} eq '-') {
+			my $idx = scalar(@{$counters{$gene}{'lines'}}) + 1;
+			foreach my $line (@{$counters{$gene}{'lines'}}) {
+				$idx--;
+				$line->[3] .= "|Region_$idx";
+				print OUT join("\t",@$line)."\n";
+			}
+		}
+		else {
+			my $idx = 0;
+                        foreach my $line (@{$counters{$gene}{'lines'}}) {
+                                $idx++;
+                                $line->[3] .= "|Region_$idx";
+                                print OUT join("\t",@$line)."\n";
+                        }
+		}
+	}
+	close OUT;
+	$opts{'t'} = "$wd/collapsed.targets.renamed.bed";
 }
-
-# 1. Global Summary => default
-&GlobalSummary($opts{'b'}, $opts{'t'});
+# 1. Coverage per exon
+# included in 2.
 
 # 2. Coverage per position
 &SubRegionCoverage($opts{'b'}, $opts{'t'});
 our %filehash;
+our $tcov;
 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) {
-	system("mkdir $wd/SplitFiles");
+	system("mkdir -p $wd/SplitFiles");
+	system("rm $wd/SplitFiles/*");
 	## get position coverages
 	## split input files
 	open IN, "$wd/Targets.Position.Coverage";
+	open BCOVSUM, ">$wd/Results/".$opts{'n'}.".Average_Region_Coverage.txt";
 	my $fileidx = 0;
 	my $currreg = '';
+	my $elength = 0;
+	my $esum = 0;
+	my $eline = "";
+	my %out = ();
 	while (<IN>) {
 		my $line = $_;
 		chomp($line);
 		my @p = split(/\t/,$line);
-		my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3];
+		my $reg = $p[0].'-'.$p[1].'-'.$p[2]. ": $p[3]";
 		my $ex = $p[3];
+		my $epos = $p[1];
+		# average exon coverage calculation.
+		if (!defined($out{$ex})) {
+			$out{$ex} = ();
+		}
+		if (!defined($out{$ex}{$p[0]})) {
+			$out{$ex}{$p[0]} = ();
+		}
+		# needs to be transcript specific ($ex) and position specific ($epos) to handle both isoforms and PAR/multiple mapping situations.
+		if (!defined($out{$ex}{$p[0]}{$epos})) {
+			$out{$ex}{$p[0]}{$epos}{'r'} = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]";
+			$out{$ex}{$p[0]}{$epos}{'c'} = ();
+
+		}
+		push(@{$out{$ex}{$p[0]}{$epos}{'c'}},$p[-1]);
+		# splitting files 
 		if ($reg ne $currreg) {
 			## new exon open new outfile
 			if ($currreg ne '') {
+				print BCOVSUM "$eline\t".($esum/$elength)."\n";
 				## filehandle is open. close it
 				close OUT; 
 			}
+			$eline = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; #\t$p[6]\t$p[7]\t$p[8]";
+			$esum = 0;
+			$elength = 0;
 			if (!exists($filehash{$reg})) {
 				$fileidx++;
 				$filehash{$reg}{'idx'} = $fileidx;
-				$filehash{$reg}{'exon'} = $ex;
+				$filehash{$reg}{'exon'} = $reg;
 				open OUT, ">> $wd/SplitFiles/File_$fileidx.txt";
 				$currreg = $reg;
 			}
@@ -131,30 +198,49 @@
 		}
 		## print the line to the open filehandle. 	
 		print OUT "$line\n";
+		$esum += $p[-1];
+		$elength++;
 	}
 	close OUT;
 	close IN;
+	if ($esum > 0) {
+		print BCOVSUM "$eline\t".($esum/$elength)."\n";
+	}
+	close BCOVSUM;
+	open OUT, ">$wd/avg.tcov.txt";
+
+	foreach my $tr_ex (sort {$a cmp $b} keys(%out)) {
+	   foreach my $chr (sort {$a cmp $b} keys(%{$out{$tr_ex}})) {	
+		foreach(sort {$a <=> $b} keys(%{$out{$tr_ex}{$chr}})) {
+			my ($avg,$nr,$nrcov) = GetStats(\@{$out{$tr_ex}{$chr}{$_}{'c'}});
+			my $frac = 0;
+			if ($nr > 0) {
+				$frac = ($nrcov / $nr);
+			}
+			print OUT $out{$tr_ex}{$chr}{$_}{'r'}."\t".$avg."\t".$nrcov."\t".$nr."\t".$frac."\n";
+		}
+	   }
+	}
+	close OUT;
+	$tcov = "$wd/avg.tcov.txt";
 
 }
-
 ## sort output files according to targets file
-if (exists($opts{'r'}) ) {
-	my %hash = ();
-	open IN, "$wd/Targets.Global.Coverage";
-	while (<IN>) {
-		my @p = split(/\t/,$_) ;
-		$hash{$p[3]} = $_;
-	}
-	close IN;
-	open OUT, ">$wd/Targets.Global.Coverage";
-	open IN, $opts{'t'};
-	while (<IN>) {
-		my @p = split(/\t/,$_) ;
-		print OUT $hash{$p[3]};
-	}
-	close IN;
-	close OUT;
+my %hash = ();
+open IN, $tcov;
+while (<IN>) {
+	my @p = split(/\t/,$_) ;
+	$hash{$p[3].':'.$p[1]} = $_;
 }
+close IN;
+open OUT, ">$tcov";
+open IN, $opts{'t'};
+while (<IN>) {
+	my @p = split(/\t/,$_) ;
+	print OUT $hash{$p[3].':'.$p[1]};
+}
+close IN;
+close OUT;
 
 
 ####################################
@@ -211,24 +297,28 @@
 $totalmapped = $s[2];
 $totalmapped =~ s/^(\d+)(\s.+)/$1/;
 # count columns
-my $head = `head -n 1 $wd/Targets.Global.Coverage`;
+my $head = `head -n 1 $tcov`;
 chomp($head);
 my @cols = split(/\t/,$head);
 my $nrcols = scalar(@cols);
 my $covcol = $nrcols - 3;
 # get min/max/median/average coverage => values
-my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`;
+my $covs = `cut -f $covcol $tcov`;
 my @coverages = split(/\n/,$covs);
 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages);
-my $spec = sprintf("%.1f",($ontarget / $totalmapped)*100);
+my $spec = '';
+if ($totalmapped != 0 && $totalmapped ne '') {
+	$spec = sprintf("%.1f",($ontarget / $totalmapped)*100);
+}
 # get min/max/median/average coverage => boxplot in R
 open OUT, ">$wd/Rout/boxplot.R";
-print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
+print OUT 'coverage <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
 print OUT 'coverage <- coverage[,'.$covcol.']'."\n";
 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n";
 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n";
 print OUT 'graphics.off()'."\n";
 close OUT;
+print "Running boxplot.R : \n";
 system("cd $wd/Rout && Rscript boxplot.R");
 
 ## global nt coverage plot
@@ -323,6 +413,7 @@
 print OUT 'graphics.off()'."\n";
 
 close OUT;
+print "Running ntplot.r\n";
 system("cd $wd/Rout && Rscript ntplot.R");
 ## PRINT TO .TEX FILE
 open OUT, ">>$wd/Report/Report.tex";
@@ -359,6 +450,8 @@
 	$two =~ s/>=/\$\\ge\$/g;
 	$two = ucfirst($two);
 	print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n";
+	
+
 }
 print OUT '\end{tabular}\end{minipage}'."\n";
 print OUT '\hspace{1.5cm}'."\n";
@@ -388,15 +481,18 @@
 @allexons;
 @allregions;
 @failedregions;
-if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) {
+%failednames;
+%allnames;
+
+if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})|| exists($opts{'A'})) {
 	# count columns
-	my $head = `head -n 1 $wd/Targets.Global.Coverage`;
+	my $head = `head -n 1 '$tcov'`;
 	chomp($head);
 	my @cols = split(/\t/,$head);
 	my $nrcols = scalar(@cols);
 	my $covcol = $nrcols - 3;
 	# Coverage Plots for each gene => barplots in R, table here.
-	open IN, "$wd/Targets.Global.Coverage";
+	open IN, "$tcov";
 	my $currgroup = '';
 	my $startline = 0;
 	my $stopline = 0;
@@ -405,22 +501,31 @@
 		$linecounter++;
 		chomp($_);
 		my @c = split(/\t/,$_);
-		push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]);
-		my $group = $c[3];
+		my $reg = $c[0].'-'.$c[1].'-'.$c[2];
+		push(@allregions,$reg);
+		my $group = $reg .": ".$c[3];
+		#my $gene = $c[3];
 		## coverage failure?
 		if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) {
 			push(@failedexons,$group);
 			push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]);
+			$failednames{$group} = $c[0].'-'.$c[1].'-'.$c[2];
 		}
 		## store exon
 		push(@allexons,$group);
+		$allnames{$group} = $c[0].'-'.$c[1].'-'.$c[2];
+		if (!exists($opts{'r'}) && !exists($opts{'s'}) && !exists($opts{'S'}) && exists($opts{'A'})) {
+			## no need for barplots
+			next;
+		}
 		## extract and check gene
-		$group =~ s/^(\S+)[\|\s](.+)/$1/;
-		if ($group ne $currgroup ) {
+		my $gene = $group;
+		$gene =~ s/^chr\S+: (\S+)[\|\s](.+)/$1/;
+		if ($gene ne $currgroup ) {
 		    if ($currgroup ne '') {
 			# new gene, make plot. 
 			open OUT, ">$wd/Rout/barplot.R";
-			print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
+			print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
 			print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
 			print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
 			print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
@@ -457,7 +562,7 @@
 			}
 
 		    }
-		    $currgroup = $group;
+		    $currgroup = $gene;
 		    $startline = $linecounter;
 		}
 		$stopline = $linecounter;
@@ -466,7 +571,7 @@
 	if ($currgroup ne '') {
 		# last gene, make plot. 
 		open OUT, ">$wd/Rout/barplot.R";
-		print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
+		print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
 		print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
 		print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
 		print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
@@ -502,8 +607,8 @@
 	open OUT, ">>$wd/Report/Report.tex";
 	print OUT '\subsection*{Gene Summaries}'."\n";
 	print OUT '\underline{Legend:} \\\\'."\n";
-	print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n";
-	print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n";
+	print OUT '{\color{red}\textbf{RED:} Average coverage did not reach set threshold of '.$thresh.'} \\\\'."\n";
+	print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon (section with zero coverage found). Overruled by red.} \\\\' ."\n";
 	$col = 1;
 	foreach (@small) {
 		if ($col > 2) {
@@ -540,16 +645,19 @@
 	print TEX '\subsection*{Failed Exon Plots}'."\n";
 	$col = 1;
 	print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n";
-	foreach(@failedregions) {
+	foreach(sort(keys(%failednames)) ) {
+	#foreach(@failedregions) {
 		if ($col > 2) {
 			$col = 1;
 			print TEX "\n";
 		}
 		# which exon
-		my $region = $_;
-		my $exon = $filehash{$region}{'exon'};
+		my $group = $_;
+		my ($region,$name) = split(/: /,$group);
+		#my $region = $failednames{$_};
+		my $exon = $filehash{$group}{'exon'};
 		# link exon to tmp file
-		my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
+		my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt";
 		## determine transcript orientation and location
 		my $firstline = `head -n 1 $exonfile`;
 		my @firstcols = split(/\t/,$firstline);
@@ -576,8 +684,10 @@
 		my $height = 240 ;
 		my $exonstr = $exon;
 		$exonstr =~ s/\s/_/g;
+		$exonstr =~ s/:/_/g;
 		$exon =~ s/_/ /g;
 		$exon =~ s/\|/ /g;
+		$exon =~ s/chr.*: (.*)$/$1/;
 		print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
 		print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
 		if ($orient eq '-') {
@@ -624,16 +734,21 @@
 		print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n";
 	}
 	$col = 1;
-	foreach(@allregions) {
+	foreach(sort(keys(%allnames))) {
+
+	#foreach(@allregions) {
 		if ($col > 2) {
 			$col = 1;
 			print TEX "\n";
 		}
+		my $group = $_;
+		my ($region,$name) = split(/: /,$group);
 		# which exon
-		my $region = $_;
-		my $exon = $filehash{$region}{'exon'};
+		#my $region = $_;
+		#my $region = $allnames{$_};
+		my $exon = $filehash{$group}{'exon'};
 		# grep exon to tmp file
-		my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
+		my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt";
 		## determine transcript orientation.
                 my $firstline = `head -n 1 $exonfile`;
                 my @firstcols = split(/\t/,$firstline);
@@ -673,8 +788,10 @@
 		my $height = 240 ;
 		my $exonstr = $exon;
 		$exonstr =~ s/\s/_/g;
+		$exonstr =~ s/:/_/g;
 		$exon =~ s/_/ /g;
 		$exon =~ s/\|/ /g;
+		$exon =~ s/^chr.*: (.*)$/$1/;
 		print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
 		print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
 		if ($orient eq '-') {
@@ -739,12 +856,12 @@
 		my $genomicstop = $firstcols[2];
 
 		if ($orient eq '+') {
-			$bps = $genomicstop - $genomicstart + 1;
+			#$bps = $genomicstop - $genomicstart + 1;
 			$subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
 
 		}
 		else {
-			$bps = $genomicstop - $genomicstart + 1;
+			#$bps = $genomicstop - $genomicstart + 1;
 			$subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
 		}
 
@@ -796,13 +913,16 @@
 
 system("cd $wd  && tar czf '$tarfile' Results/");
 ## clean up (galaxy stores outside wd)
-system("rm -Rf $wd");
+#system("rm -Rf $wd");
 ###############
 ## FUNCTIONS ##
 ###############
 sub arraystats{
 	my @array = @_;
 	my $count = scalar(@array);
+	if ($count == 0 ) {
+		return (0,0,0,0,0,0,0);
+	}
 	@array = sort { $a <=> $b } @array;
 	# median
 	my $median = 0;
@@ -824,26 +944,31 @@
 	return ($average,$median,$min,$max,$first,$third,$sum);
 }
 
-sub GlobalSummary {
-	my ($bam,$targets) = @_;
-
-	my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";
-	if (exists($commandsrun{$command})) {
-		return;
+sub GetStats {
+	my ($aref) = @_;
+	if (scalar(@$aref) == 0) {
+		return qw/0 0/;
 	}
-	system($command);
-	$commandsrun{$command} = 1;
+	# median
+	my @s = sort {$a <=> $b } @$aref;
+	my $nrzero = 0;
+	my $len = scalar(@s);
+	for (my $i = 0; $i< $len;$i++) {
+		if ($s[$i] == 0) {
+			$nrzero++;
+		}
+		else {
+			last;
+		}
+	}
+	my $nrcov = $len - $nrzero;
+	# avg
+	my $avg = 0;
+    	foreach (@s) { $avg += $_ };
+	$avg = sprintf("%.1f",($avg / scalar(@s)));
+	return($avg,$len,$nrcov);
 }
 
-sub CoveragePerRegion {
-	my ($bam,$targets) = @_;
-	my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";
-	if (exists($commandsrun{$command})) {
-		return;
-	}
-	system($command);
-	$commandsrun{$command} = 1;
-}
 
 sub SubRegionCoverage {
 	my ($bam,$targets) = @_;