coverage_report: CoverageReport.pl comparison

comparison CoverageReport.pl @ 26:859999cb135b draft

Revised routines. Better handling of collapsing, isoforms and multiple mappings. Barplots revised to visualise the average base coverage in an exon instead of the total number of reads in the exon.

author	geert-vandeweyer
date	Wed, 29 Nov 2017 08:12:27 -0500
parents	6cb012c8497a
children	576bfc1586f9

comparison

equal deleted inserted replaced

-:6cb012c8497a
+:859999cb135b
 # A : (A)ll exons will be plotted.
 # L : (L)ist failed exons instead of plotting
 # m : (m)inimal Coverage threshold
 # f : fraction of average as threshold
 # n : sample (n)ame.
+# T : collapse overlapping Target regions.
 getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ;
 # make output directory in (tmp) working dir
 our $wd = "/tmp/Coverage.".int(rand(1000));
 while (-d $wd) {
 	$wd = "/tmp/Coverage.".int(rand(1000));
 }
 system("mkdir $wd");
+$wd = "/tmp/Coverage.993";
+print "wd : $wd\n";
 ## variables
 our %commandsrun = ();
 if (!exists($opts{'b'}) || !-e $opts{'b'}) {
 	die('Bam File not found');
 	}
 	my $targets = $opts{'t'};
 	my $tmptargets = "$wd/collapsedtargets.bed";
 	system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed");
 	system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets");
-	$opts{'t'} = $tmptargets;
+	open IN, $tmptargets;
-}
+	open OUT, ">$wd/collapsed.targets.renamed.bed";
+	# we assume that overlapping fragments come from isoforms, not from different genes.
-# 1. Global Summary => default
+	my %counters = ();
-&GlobalSummary($opts{'b'}, $opts{'t'});
+	my @genes = ();
+	while (<IN>) {
+		chomp;
+		my @p = split(/\t/,$_);
+		my @g = split(/,/,$p[3]);
+		$g[0] =~ m/(\S+)\(.*/;
+		my $gene = $1;
+		if (!defined($counters{$gene})) {
+			push(@genes,$gene);
+			$counters{$gene}{'lines'} = ();
+			$counters{$gene}{'orient'} = $p[5];
+		}
+		$p[3] = $gene."(COLLAPSED)";
+		push(@{$counters{$gene}{'lines'}},\@p);
+	}
+	close IN;
+	foreach my $gene (@genes) {
+		if ($counters{$gene}{'orient'} eq '-') {
+			my $idx = scalar(@{$counters{$gene}{'lines'}}) + 1;
+			foreach my $line (@{$counters{$gene}{'lines'}}) {
+				$idx--;
+				$line->[3] .= "|Region_$idx";
+				print OUT join("\t",@$line)."\n";
+			}
+		}
+		else {
+			my $idx = 0;
+foreach my $line (@{$counters{$gene}{'lines'}}) {
+$idx++;
+$line->[3] .= "|Region_$idx";
+print OUT join("\t",@$line)."\n";
+}
+		}
+	}
+	close OUT;
+	$opts{'t'} = "$wd/collapsed.targets.renamed.bed";
+}
+# 1. Coverage per exon
+# included in 2.
 # 2. Coverage per position
 &SubRegionCoverage($opts{'b'}, $opts{'t'});
 our %filehash;
+our $tcov;
 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) {
-	system("mkdir $wd/SplitFiles");
+	system("mkdir -p $wd/SplitFiles");
+	system("rm $wd/SplitFiles/*");
 	## get position coverages
 	## split input files
 	open IN, "$wd/Targets.Position.Coverage";
+	open BCOVSUM, ">$wd/Results/".$opts{'n'}.".Average_Region_Coverage.txt";
 	my $fileidx = 0;
 	my $currreg = '';
+	my $elength = 0;
+	my $esum = 0;
+	my $eline = "";
+	my %out = ();
 	while (<IN>) {
 		my $line = $_;
 		chomp($line);
 		my @p = split(/\t/,$line);
-		my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3];
+		my $reg = $p[0].'-'.$p[1].'-'.$p[2]. ": $p[3]";
 		my $ex = $p[3];
+		my $epos = $p[1];
+		# average exon coverage calculation.
+		if (!defined($out{$ex})) {
+			$out{$ex} = ();
+		}
+		if (!defined($out{$ex}{$p[0]})) {
+			$out{$ex}{$p[0]} = ();
+		}
+		# needs to be transcript specific ($ex) and position specific ($epos) to handle both isoforms and PAR/multiple mapping situations.
+		if (!defined($out{$ex}{$p[0]}{$epos})) {
+			$out{$ex}{$p[0]}{$epos}{'r'} = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]";
+			$out{$ex}{$p[0]}{$epos}{'c'} = ();
+		}
+		push(@{$out{$ex}{$p[0]}{$epos}{'c'}},$p[-1]);
+		# splitting files
 		if ($reg ne $currreg) {
 			## new exon open new outfile
 			if ($currreg ne '') {
+				print BCOVSUM "$eline\t".($esum/$elength)."\n";
 				## filehandle is open. close it
 				close OUT;
 			}
+			$eline = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; #\t$p[6]\t$p[7]\t$p[8]";
+			$esum = 0;
+			$elength = 0;
 			if (!exists($filehash{$reg})) {
 				$fileidx++;
 				$filehash{$reg}{'idx'} = $fileidx;
-				$filehash{$reg}{'exon'} = $ex;
+				$filehash{$reg}{'exon'} = $reg;
 				open OUT, ">> $wd/SplitFiles/File_$fileidx.txt";
 				$currreg = $reg;
 			}
 			else {
 				open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt";
 				$currreg = $reg;
 			}
 		}
 		## print the line to the open filehandle.
 		print OUT "$line\n";
+		$esum += $p[-1];
+		$elength++;
 	}
 	close OUT;
 	close IN;
+	if ($esum > 0) {
-}
+		print BCOVSUM "$eline\t".($esum/$elength)."\n";
+	}
+	close BCOVSUM;
+	open OUT, ">$wd/avg.tcov.txt";
+	foreach my $tr_ex (sort {$a cmp $b} keys(%out)) {
+	   foreach my $chr (sort {$a cmp $b} keys(%{$out{$tr_ex}})) {
+		foreach(sort {$a <=> $b} keys(%{$out{$tr_ex}{$chr}})) {
+			my ($avg,$nr,$nrcov) = GetStats(\@{$out{$tr_ex}{$chr}{$_}{'c'}});
+			my $frac = 0;
+			if ($nr > 0) {
+				$frac = ($nrcov / $nr);
+			}
+			print OUT $out{$tr_ex}{$chr}{$_}{'r'}."\t".$avg."\t".$nrcov."\t".$nr."\t".$frac."\n";
+		}
+	   }
+	}
+	close OUT;
+	$tcov = "$wd/avg.tcov.txt";
+}
 ## sort output files according to targets file
-if (exists($opts{'r'}) ) {
+my %hash = ();
-	my %hash = ();
+open IN, $tcov;
-	open IN, "$wd/Targets.Global.Coverage";
+while (<IN>) {
-	while (<IN>) {
+	my @p = split(/\t/,$_) ;
-		my @p = split(/\t/,$_) ;
+	$hash{$p[3].':'.$p[1]} = $_;
-		$hash{$p[3]} = $_;
+}
-	}
+close IN;
-	close IN;
+open OUT, ">$tcov";
-	open OUT, ">$wd/Targets.Global.Coverage";
+open IN, $opts{'t'};
-	open IN, $opts{'t'};
+while (<IN>) {
-	while (<IN>) {
+	my @p = split(/\t/,$_) ;
-		my @p = split(/\t/,$_) ;
+	print OUT $hash{$p[3].':'.$p[1]};
-		print OUT $hash{$p[3]};
+}
-	}
+close IN;
-	close IN;
+close OUT;
-	close OUT;
-}
 ####################################
 ## PROCESS RESULTS & CREATE PLOTS ##
 ####################################
 # Get number of reads mapped in total
 ## updated on 2012-10-1 !!
 $totalmapped = $s[2];
 $totalmapped =~ s/^(\d+)(\s.+)/$1/;
 # count columns
-my $head = `head -n 1 $wd/Targets.Global.Coverage`;
+my $head = `head -n 1 $tcov`;
 chomp($head);
 my @cols = split(/\t/,$head);
 my $nrcols = scalar(@cols);
 my $covcol = $nrcols - 3;
 # get min/max/median/average coverage => values
-my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`;
+my $covs = `cut -f $covcol $tcov`;
 my @coverages = split(/\n/,$covs);
 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages);
-my $spec = sprintf("%.1f",($ontarget / $totalmapped)*100);
+my $spec = '';
+if ($totalmapped != 0 && $totalmapped ne '') {
+	$spec = sprintf("%.1f",($ontarget / $totalmapped)*100);
+}
 # get min/max/median/average coverage => boxplot in R
 open OUT, ">$wd/Rout/boxplot.R";
-print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
+print OUT 'coverage <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
 print OUT 'coverage <- coverage[,'.$covcol.']'."\n";
 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n";
 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n";
 print OUT 'graphics.off()'."\n";
 close OUT;
+print "Running boxplot.R : \n";
 system("cd $wd/Rout && Rscript boxplot.R");
 ## global nt coverage plot
 ## use perl to make histogram (lower memory)
 open IN, "$wd/Targets.Position.Coverage";
 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n";
 print OUT 'graphics.off()'."\n";
 close OUT;
+print "Running ntplot.r\n";
 system("cd $wd/Rout && Rscript ntplot.R");
 ## PRINT TO .TEX FILE
 open OUT, ">>$wd/Report/Report.tex";
 # average coverage overviews
 print OUT '\subsection*{Overall Summary}'."\n";
 	$two =~ s/(\s\+.*)|(:.*)/\)/;
 	$two =~ s/%/\\%/g;
 	$two =~ s/>=/\$\\ge\$/g;
 	$two = ucfirst($two);
 	print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n";
 }
 print OUT '\end{tabular}\end{minipage}'."\n";
 print OUT '\hspace{1.5cm}'."\n";
 # target coverage statistics
 print OUT '\begin{minipage}{0.4\linewidth}'."\n";
 # 2. GLOBAL COVERAGE OVERVIEW PER GENE
 @failedexons;
 @allexons;
 @allregions;
 @failedregions;
-if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) {
+%failednames;
+%allnames;
+if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})|| exists($opts{'A'})) {
 	# count columns
-	my $head = `head -n 1 $wd/Targets.Global.Coverage`;
+	my $head = `head -n 1 '$tcov'`;
 	chomp($head);
 	my @cols = split(/\t/,$head);
 	my $nrcols = scalar(@cols);
 	my $covcol = $nrcols - 3;
 	# Coverage Plots for each gene => barplots in R, table here.
-	open IN, "$wd/Targets.Global.Coverage";
+	open IN, "$tcov";
 	my $currgroup = '';
 	my $startline = 0;
 	my $stopline = 0;
 	$linecounter = 0;
 	while (<IN>) {
 		$linecounter++;
 		chomp($_);
 		my @c = split(/\t/,$_);
-		push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]);
+		my $reg = $c[0].'-'.$c[1].'-'.$c[2];
-		my $group = $c[3];
+		push(@allregions,$reg);
+		my $group = $reg .": ".$c[3];
+		#my $gene = $c[3];
 		## coverage failure?
 		if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) {
 			push(@failedexons,$group);
 			push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]);
+			$failednames{$group} = $c[0].'-'.$c[1].'-'.$c[2];
 		}
 		## store exon
 		push(@allexons,$group);
+		$allnames{$group} = $c[0].'-'.$c[1].'-'.$c[2];
+		if (!exists($opts{'r'}) && !exists($opts{'s'}) && !exists($opts{'S'}) && exists($opts{'A'})) {
+			## no need for barplots
+			next;
+		}
 		## extract and check gene
-		$group =~ s/^(\S+)[\|\s](.+)/$1/;
+		my $gene = $group;
-		if ($group ne $currgroup ) {
+		$gene =~ s/^chr\S+: (\S+)[\|\s](.+)/$1/;
+		if ($gene ne $currgroup ) {
 		    if ($currgroup ne '') {
 			# new gene, make plot.
 			open OUT, ">$wd/Rout/barplot.R";
-			print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
+			print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
 			print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
 			print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
 			print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
 			print OUT 'coverage[coverage < 1] <- 1'."\n";
 			print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
 			else {
 				push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
 			}
 		    }
-		    $currgroup = $group;
+		    $currgroup = $gene;
 		    $startline = $linecounter;
 		}
 		$stopline = $linecounter;
 	}
 	close IN;
 	if ($currgroup ne '') {
 		# last gene, make plot.
 		open OUT, ">$wd/Rout/barplot.R";
-		print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
+		print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
 		print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
 		print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
 		print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
 		print OUT 'coverage[coverage < 1] <- 1'."\n";
 		print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
 	}
 	## print to TEX
 	open OUT, ">>$wd/Report/Report.tex";
 	print OUT '\subsection*{Gene Summaries}'."\n";
 	print OUT '\underline{Legend:} \\\\'."\n";
-	print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n";
+	print OUT '{\color{red}\textbf{RED:} Average coverage did not reach set threshold of '.$thresh.'} \\\\'."\n";
-	print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n";
+	print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon (section with zero coverage found). Overruled by red.} \\\\' ."\n";
 	$col = 1;
 	foreach (@small) {
 		if ($col > 2) {
 			$col = 1;
 			print OUT "\n";
 	# tex section header
 	open TEX, ">>$wd/Report/Report.tex";
 	print TEX '\subsection*{Failed Exon Plots}'."\n";
 	$col = 1;
 	print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n";
-	foreach(@failedregions) {
+	foreach(sort(keys(%failednames)) ) {
+	#foreach(@failedregions) {
 		if ($col > 2) {
 			$col = 1;
 			print TEX "\n";
 		}
 		# which exon
-		my $region = $_;
+		my $group = $_;
-		my $exon = $filehash{$region}{'exon'};
+		my ($region,$name) = split(/: /,$group);
+		#my $region = $failednames{$_};
+		my $exon = $filehash{$group}{'exon'};
 		# link exon to tmp file
-		my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
+		my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt";
 		## determine transcript orientation and location
 		my $firstline = `head -n 1 $exonfile`;
 		my @firstcols = split(/\t/,$firstline);
 		my $orient = $firstcols[5];
 		my $genomicchr = $firstcols[0];
 		my $width = 480 ;
 		my $height = 240 ;
 		my $exonstr = $exon;
 		$exonstr =~ s/\s/_/g;
+		$exonstr =~ s/:/_/g;
 		$exon =~ s/_/ /g;
 		$exon =~ s/\|/ /g;
+		$exon =~ s/chr.*: (.*)$/$1/;
 		print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
 		print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
 		if ($orient eq '-') {
 			print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
 			print OUT 'mtext("'.$subtitle.'")'."\n";
 	}
 	elsif (exists($opts{'A'})) {
 		print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n";
 	}
 	$col = 1;
-	foreach(@allregions) {
+	foreach(sort(keys(%allnames))) {
+	#foreach(@allregions) {
 		if ($col > 2) {
 			$col = 1;
 			print TEX "\n";
 		}
+		my $group = $_;
+		my ($region,$name) = split(/: /,$group);
 		# which exon
-		my $region = $_;
+		#my $region = $_;
-		my $exon = $filehash{$region}{'exon'};
+		#my $region = $allnames{$_};
+		my $exon = $filehash{$group}{'exon'};
 		# grep exon to tmp file
-		my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
+		my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt";
 		## determine transcript orientation.
 my $firstline = `head -n 1 $exonfile`;
 my @firstcols = split(/\t/,$firstline);
 my $orient = $firstcols[5];
 		my $genomicchr = $firstcols[0];
 		print OUT 'positions <- coveragetable[,'.$poscol.']'."\n";
 		my $width = 480 ;
 		my $height = 240 ;
 		my $exonstr = $exon;
 		$exonstr =~ s/\s/_/g;
+		$exonstr =~ s/:/_/g;
 		$exon =~ s/_/ /g;
 		$exon =~ s/\|/ /g;
+		$exon =~ s/^chr.*: (.*)$/$1/;
 		print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
 		print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
 		if ($orient eq '-') {
 			print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
 			print OUT 'mtext("'.$subtitle.'")'."\n";
 		my $genomicchr = $firstcols[0];
 		my $genomicstart = $firstcols[1];
 		my $genomicstop = $firstcols[2];
 		if ($orient eq '+') {
-			$bps = $genomicstop - $genomicstart + 1;
+			#$bps = $genomicstop - $genomicstart + 1;
 			$subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
 		}
 		else {
-			$bps = $genomicstop - $genomicstart + 1;
+			#$bps = $genomicstop - $genomicstart + 1;
 			$subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
 		}
 		# check if failed
 		my $cs = `cut -f $covcol '$exonfile' `;
 	system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/");
 }
 system("cd $wd  && tar czf '$tarfile' Results/");
 ## clean up (galaxy stores outside wd)
-system("rm -Rf $wd");
+#system("rm -Rf $wd");
 ###############
 ## FUNCTIONS ##
 ###############
 sub arraystats{
 	my @array = @_;
 	my $count = scalar(@array);
+	if ($count == 0 ) {
+		return (0,0,0,0,0,0,0);
+	}
 	@array = sort { $a <=> $b } @array;
 	# median
 	my $median = 0;
 	if ($count % 2) {
 		$median = $array[int($count/2)];
 	my $min = $array[0];
 	my $max = $array[($count-1)];
 	return ($average,$median,$min,$max,$first,$third,$sum);
 }
-sub GlobalSummary {
+sub GetStats {
-	my ($bam,$targets) = @_;
+	my ($aref) = @_;
+	if (scalar(@$aref) == 0) {
-	my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";
+		return qw/0 0/;
-	if (exists($commandsrun{$command})) {
+	}
-		return;
+	# median
-	}
+	my @s = sort {$a <=> $b } @$aref;
-	system($command);
+	my $nrzero = 0;
-	$commandsrun{$command} = 1;
+	my $len = scalar(@s);
-}
+	for (my $i = 0; $i< $len;$i++) {
+		if ($s[$i] == 0) {
-sub CoveragePerRegion {
+			$nrzero++;
-	my ($bam,$targets) = @_;
+		}
-	my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";
+		else {
-	if (exists($commandsrun{$command})) {
+			last;
-		return;
+		}
 	}
-	system($command);
+	my $nrcov = $len - $nrzero;
-	$commandsrun{$command} = 1;
+	# avg
-}
+	my $avg = 0;
+	foreach (@s) { $avg += $_ };
+	$avg = sprintf("%.1f",($avg / scalar(@s)));
+	return($avg,$len,$nrcov);
+}
 sub SubRegionCoverage {
 	my ($bam,$targets) = @_;
 	my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage";
 	system($command);

Mercurial > repos > geert-vandeweyer > coverage_report

comparison CoverageReport.pl @ 26:859999cb135b draft