Mercurial > repos > geert-vandeweyer > coverage_report
comparison CoverageReport.pl @ 26:859999cb135b draft
revised routines. Better handling of collapsing, isoforms and mutiple mapppings. Barplots revised to visualise average base coverage in exon instead of total number of reads in exon
author | geert-vandeweyer |
---|---|
date | Wed, 29 Nov 2017 08:12:27 -0500 |
parents | 6cb012c8497a |
children | 576bfc1586f9 |
comparison
equal
deleted
inserted
replaced
25:6cb012c8497a | 26:859999cb135b |
---|---|
24 # A : (A)ll exons will be plotted. | 24 # A : (A)ll exons will be plotted. |
25 # L : (L)ist failed exons instead of plotting | 25 # L : (L)ist failed exons instead of plotting |
26 # m : (m)inimal Coverage threshold | 26 # m : (m)inimal Coverage threshold |
27 # f : fraction of average as threshold | 27 # f : fraction of average as threshold |
28 # n : sample (n)ame. | 28 # n : sample (n)ame. |
29 | 29 # T : collapse overlapping Target regions. |
30 | 30 |
31 getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ; | 31 getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ; |
32 | 32 |
33 # make output directory in (tmp) working dir | 33 # make output directory in (tmp) working dir |
34 our $wd = "/tmp/Coverage.".int(rand(1000)); | 34 our $wd = "/tmp/Coverage.".int(rand(1000)); |
35 | |
35 while (-d $wd) { | 36 while (-d $wd) { |
36 $wd = "/tmp/Coverage.".int(rand(1000)); | 37 $wd = "/tmp/Coverage.".int(rand(1000)); |
37 | 38 |
38 } | 39 } |
39 system("mkdir $wd"); | 40 system("mkdir $wd"); |
40 | 41 $wd = "/tmp/Coverage.993"; |
42 print "wd : $wd\n"; | |
41 ## variables | 43 ## variables |
42 our %commandsrun = (); | 44 our %commandsrun = (); |
43 | 45 |
44 if (!exists($opts{'b'}) || !-e $opts{'b'}) { | 46 if (!exists($opts{'b'}) || !-e $opts{'b'}) { |
45 die('Bam File not found'); | 47 die('Bam File not found'); |
87 } | 89 } |
88 my $targets = $opts{'t'}; | 90 my $targets = $opts{'t'}; |
89 my $tmptargets = "$wd/collapsedtargets.bed"; | 91 my $tmptargets = "$wd/collapsedtargets.bed"; |
90 system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed"); | 92 system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed"); |
91 system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets"); | 93 system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets"); |
92 $opts{'t'} = $tmptargets; | 94 open IN, $tmptargets; |
93 } | 95 open OUT, ">$wd/collapsed.targets.renamed.bed"; |
94 | 96 # we assume that overlapping fragments come from isoforms, not from different genes. |
95 # 1. Global Summary => default | 97 my %counters = (); |
96 &GlobalSummary($opts{'b'}, $opts{'t'}); | 98 my @genes = (); |
99 while (<IN>) { | |
100 chomp; | |
101 my @p = split(/\t/,$_); | |
102 my @g = split(/,/,$p[3]); | |
103 $g[0] =~ m/(\S+)\(.*/; | |
104 my $gene = $1; | |
105 if (!defined($counters{$gene})) { | |
106 push(@genes,$gene); | |
107 $counters{$gene}{'lines'} = (); | |
108 $counters{$gene}{'orient'} = $p[5]; | |
109 } | |
110 $p[3] = $gene."(COLLAPSED)"; | |
111 push(@{$counters{$gene}{'lines'}},\@p); | |
112 } | |
113 close IN; | |
114 foreach my $gene (@genes) { | |
115 if ($counters{$gene}{'orient'} eq '-') { | |
116 my $idx = scalar(@{$counters{$gene}{'lines'}}) + 1; | |
117 foreach my $line (@{$counters{$gene}{'lines'}}) { | |
118 $idx--; | |
119 $line->[3] .= "|Region_$idx"; | |
120 print OUT join("\t",@$line)."\n"; | |
121 } | |
122 } | |
123 else { | |
124 my $idx = 0; | |
125 foreach my $line (@{$counters{$gene}{'lines'}}) { | |
126 $idx++; | |
127 $line->[3] .= "|Region_$idx"; | |
128 print OUT join("\t",@$line)."\n"; | |
129 } | |
130 } | |
131 } | |
132 close OUT; | |
133 $opts{'t'} = "$wd/collapsed.targets.renamed.bed"; | |
134 } | |
135 # 1. Coverage per exon | |
136 # included in 2. | |
97 | 137 |
98 # 2. Coverage per position | 138 # 2. Coverage per position |
99 &SubRegionCoverage($opts{'b'}, $opts{'t'}); | 139 &SubRegionCoverage($opts{'b'}, $opts{'t'}); |
100 our %filehash; | 140 our %filehash; |
141 our $tcov; | |
101 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) { | 142 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) { |
102 system("mkdir $wd/SplitFiles"); | 143 system("mkdir -p $wd/SplitFiles"); |
144 system("rm $wd/SplitFiles/*"); | |
103 ## get position coverages | 145 ## get position coverages |
104 ## split input files | 146 ## split input files |
105 open IN, "$wd/Targets.Position.Coverage"; | 147 open IN, "$wd/Targets.Position.Coverage"; |
148 open BCOVSUM, ">$wd/Results/".$opts{'n'}.".Average_Region_Coverage.txt"; | |
106 my $fileidx = 0; | 149 my $fileidx = 0; |
107 my $currreg = ''; | 150 my $currreg = ''; |
151 my $elength = 0; | |
152 my $esum = 0; | |
153 my $eline = ""; | |
154 my %out = (); | |
108 while (<IN>) { | 155 while (<IN>) { |
109 my $line = $_; | 156 my $line = $_; |
110 chomp($line); | 157 chomp($line); |
111 my @p = split(/\t/,$line); | 158 my @p = split(/\t/,$line); |
112 my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3]; | 159 my $reg = $p[0].'-'.$p[1].'-'.$p[2]. ": $p[3]"; |
113 my $ex = $p[3]; | 160 my $ex = $p[3]; |
161 my $epos = $p[1]; | |
162 # average exon coverage calculation. | |
163 if (!defined($out{$ex})) { | |
164 $out{$ex} = (); | |
165 } | |
166 if (!defined($out{$ex}{$p[0]})) { | |
167 $out{$ex}{$p[0]} = (); | |
168 } | |
169 # needs to be transcript specific ($ex) and position specific ($epos) to handle both isoforms and PAR/multiple mapping situations. | |
170 if (!defined($out{$ex}{$p[0]}{$epos})) { | |
171 $out{$ex}{$p[0]}{$epos}{'r'} = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; | |
172 $out{$ex}{$p[0]}{$epos}{'c'} = (); | |
173 | |
174 } | |
175 push(@{$out{$ex}{$p[0]}{$epos}{'c'}},$p[-1]); | |
176 # splitting files | |
114 if ($reg ne $currreg) { | 177 if ($reg ne $currreg) { |
115 ## new exon open new outfile | 178 ## new exon open new outfile |
116 if ($currreg ne '') { | 179 if ($currreg ne '') { |
180 print BCOVSUM "$eline\t".($esum/$elength)."\n"; | |
117 ## filehandle is open. close it | 181 ## filehandle is open. close it |
118 close OUT; | 182 close OUT; |
119 } | 183 } |
184 $eline = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; #\t$p[6]\t$p[7]\t$p[8]"; | |
185 $esum = 0; | |
186 $elength = 0; | |
120 if (!exists($filehash{$reg})) { | 187 if (!exists($filehash{$reg})) { |
121 $fileidx++; | 188 $fileidx++; |
122 $filehash{$reg}{'idx'} = $fileidx; | 189 $filehash{$reg}{'idx'} = $fileidx; |
123 $filehash{$reg}{'exon'} = $ex; | 190 $filehash{$reg}{'exon'} = $reg; |
124 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt"; | 191 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt"; |
125 $currreg = $reg; | 192 $currreg = $reg; |
126 } | 193 } |
127 else { | 194 else { |
128 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt"; | 195 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt"; |
129 $currreg = $reg; | 196 $currreg = $reg; |
130 } | 197 } |
131 } | 198 } |
132 ## print the line to the open filehandle. | 199 ## print the line to the open filehandle. |
133 print OUT "$line\n"; | 200 print OUT "$line\n"; |
201 $esum += $p[-1]; | |
202 $elength++; | |
134 } | 203 } |
135 close OUT; | 204 close OUT; |
136 close IN; | 205 close IN; |
137 | 206 if ($esum > 0) { |
138 } | 207 print BCOVSUM "$eline\t".($esum/$elength)."\n"; |
139 | 208 } |
209 close BCOVSUM; | |
210 open OUT, ">$wd/avg.tcov.txt"; | |
211 | |
212 foreach my $tr_ex (sort {$a cmp $b} keys(%out)) { | |
213 foreach my $chr (sort {$a cmp $b} keys(%{$out{$tr_ex}})) { | |
214 foreach(sort {$a <=> $b} keys(%{$out{$tr_ex}{$chr}})) { | |
215 my ($avg,$nr,$nrcov) = GetStats(\@{$out{$tr_ex}{$chr}{$_}{'c'}}); | |
216 my $frac = 0; | |
217 if ($nr > 0) { | |
218 $frac = ($nrcov / $nr); | |
219 } | |
220 print OUT $out{$tr_ex}{$chr}{$_}{'r'}."\t".$avg."\t".$nrcov."\t".$nr."\t".$frac."\n"; | |
221 } | |
222 } | |
223 } | |
224 close OUT; | |
225 $tcov = "$wd/avg.tcov.txt"; | |
226 | |
227 } | |
140 ## sort output files according to targets file | 228 ## sort output files according to targets file |
141 if (exists($opts{'r'}) ) { | 229 my %hash = (); |
142 my %hash = (); | 230 open IN, $tcov; |
143 open IN, "$wd/Targets.Global.Coverage"; | 231 while (<IN>) { |
144 while (<IN>) { | 232 my @p = split(/\t/,$_) ; |
145 my @p = split(/\t/,$_) ; | 233 $hash{$p[3].':'.$p[1]} = $_; |
146 $hash{$p[3]} = $_; | 234 } |
147 } | 235 close IN; |
148 close IN; | 236 open OUT, ">$tcov"; |
149 open OUT, ">$wd/Targets.Global.Coverage"; | 237 open IN, $opts{'t'}; |
150 open IN, $opts{'t'}; | 238 while (<IN>) { |
151 while (<IN>) { | 239 my @p = split(/\t/,$_) ; |
152 my @p = split(/\t/,$_) ; | 240 print OUT $hash{$p[3].':'.$p[1]}; |
153 print OUT $hash{$p[3]}; | 241 } |
154 } | 242 close IN; |
155 close IN; | 243 close OUT; |
156 close OUT; | |
157 } | |
158 | 244 |
159 | 245 |
160 #################################### | 246 #################################### |
161 ## PROCESS RESULTS & CREATE PLOTS ## | 247 ## PROCESS RESULTS & CREATE PLOTS ## |
162 #################################### | 248 #################################### |
209 # Get number of reads mapped in total | 295 # Get number of reads mapped in total |
210 ## updated on 2012-10-1 !! | 296 ## updated on 2012-10-1 !! |
211 $totalmapped = $s[2]; | 297 $totalmapped = $s[2]; |
212 $totalmapped =~ s/^(\d+)(\s.+)/$1/; | 298 $totalmapped =~ s/^(\d+)(\s.+)/$1/; |
213 # count columns | 299 # count columns |
214 my $head = `head -n 1 $wd/Targets.Global.Coverage`; | 300 my $head = `head -n 1 $tcov`; |
215 chomp($head); | 301 chomp($head); |
216 my @cols = split(/\t/,$head); | 302 my @cols = split(/\t/,$head); |
217 my $nrcols = scalar(@cols); | 303 my $nrcols = scalar(@cols); |
218 my $covcol = $nrcols - 3; | 304 my $covcol = $nrcols - 3; |
219 # get min/max/median/average coverage => values | 305 # get min/max/median/average coverage => values |
220 my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`; | 306 my $covs = `cut -f $covcol $tcov`; |
221 my @coverages = split(/\n/,$covs); | 307 my @coverages = split(/\n/,$covs); |
222 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages); | 308 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages); |
223 my $spec = sprintf("%.1f",($ontarget / $totalmapped)*100); | 309 my $spec = ''; |
310 if ($totalmapped != 0 && $totalmapped ne '') { | |
311 $spec = sprintf("%.1f",($ontarget / $totalmapped)*100); | |
312 } | |
224 # get min/max/median/average coverage => boxplot in R | 313 # get min/max/median/average coverage => boxplot in R |
225 open OUT, ">$wd/Rout/boxplot.R"; | 314 open OUT, ">$wd/Rout/boxplot.R"; |
226 print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | 315 print OUT 'coverage <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; |
227 print OUT 'coverage <- coverage[,'.$covcol.']'."\n"; | 316 print OUT 'coverage <- coverage[,'.$covcol.']'."\n"; |
228 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n"; | 317 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n"; |
229 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n"; | 318 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n"; |
230 print OUT 'graphics.off()'."\n"; | 319 print OUT 'graphics.off()'."\n"; |
231 close OUT; | 320 close OUT; |
321 print "Running boxplot.R : \n"; | |
232 system("cd $wd/Rout && Rscript boxplot.R"); | 322 system("cd $wd/Rout && Rscript boxplot.R"); |
233 | 323 |
234 ## global nt coverage plot | 324 ## global nt coverage plot |
235 ## use perl to make histogram (lower memory) | 325 ## use perl to make histogram (lower memory) |
236 open IN, "$wd/Targets.Position.Coverage"; | 326 open IN, "$wd/Targets.Position.Coverage"; |
321 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n"; | 411 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n"; |
322 | 412 |
323 print OUT 'graphics.off()'."\n"; | 413 print OUT 'graphics.off()'."\n"; |
324 | 414 |
325 close OUT; | 415 close OUT; |
416 print "Running ntplot.r\n"; | |
326 system("cd $wd/Rout && Rscript ntplot.R"); | 417 system("cd $wd/Rout && Rscript ntplot.R"); |
327 ## PRINT TO .TEX FILE | 418 ## PRINT TO .TEX FILE |
328 open OUT, ">>$wd/Report/Report.tex"; | 419 open OUT, ">>$wd/Report/Report.tex"; |
329 # average coverage overviews | 420 # average coverage overviews |
330 print OUT '\subsection*{Overall Summary}'."\n"; | 421 print OUT '\subsection*{Overall Summary}'."\n"; |
357 $two =~ s/(\s\+.*)|(:.*)/\)/; | 448 $two =~ s/(\s\+.*)|(:.*)/\)/; |
358 $two =~ s/%/\\%/g; | 449 $two =~ s/%/\\%/g; |
359 $two =~ s/>=/\$\\ge\$/g; | 450 $two =~ s/>=/\$\\ge\$/g; |
360 $two = ucfirst($two); | 451 $two = ucfirst($two); |
361 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n"; | 452 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n"; |
453 | |
454 | |
362 } | 455 } |
363 print OUT '\end{tabular}\end{minipage}'."\n"; | 456 print OUT '\end{tabular}\end{minipage}'."\n"; |
364 print OUT '\hspace{1.5cm}'."\n"; | 457 print OUT '\hspace{1.5cm}'."\n"; |
365 # target coverage statistics | 458 # target coverage statistics |
366 print OUT '\begin{minipage}{0.4\linewidth}'."\n"; | 459 print OUT '\begin{minipage}{0.4\linewidth}'."\n"; |
386 # 2. GLOBAL COVERAGE OVERVIEW PER GENE | 479 # 2. GLOBAL COVERAGE OVERVIEW PER GENE |
387 @failedexons; | 480 @failedexons; |
388 @allexons; | 481 @allexons; |
389 @allregions; | 482 @allregions; |
390 @failedregions; | 483 @failedregions; |
391 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) { | 484 %failednames; |
485 %allnames; | |
486 | |
487 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})|| exists($opts{'A'})) { | |
392 # count columns | 488 # count columns |
393 my $head = `head -n 1 $wd/Targets.Global.Coverage`; | 489 my $head = `head -n 1 '$tcov'`; |
394 chomp($head); | 490 chomp($head); |
395 my @cols = split(/\t/,$head); | 491 my @cols = split(/\t/,$head); |
396 my $nrcols = scalar(@cols); | 492 my $nrcols = scalar(@cols); |
397 my $covcol = $nrcols - 3; | 493 my $covcol = $nrcols - 3; |
398 # Coverage Plots for each gene => barplots in R, table here. | 494 # Coverage Plots for each gene => barplots in R, table here. |
399 open IN, "$wd/Targets.Global.Coverage"; | 495 open IN, "$tcov"; |
400 my $currgroup = ''; | 496 my $currgroup = ''; |
401 my $startline = 0; | 497 my $startline = 0; |
402 my $stopline = 0; | 498 my $stopline = 0; |
403 $linecounter = 0; | 499 $linecounter = 0; |
404 while (<IN>) { | 500 while (<IN>) { |
405 $linecounter++; | 501 $linecounter++; |
406 chomp($_); | 502 chomp($_); |
407 my @c = split(/\t/,$_); | 503 my @c = split(/\t/,$_); |
408 push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]); | 504 my $reg = $c[0].'-'.$c[1].'-'.$c[2]; |
409 my $group = $c[3]; | 505 push(@allregions,$reg); |
506 my $group = $reg .": ".$c[3]; | |
507 #my $gene = $c[3]; | |
410 ## coverage failure? | 508 ## coverage failure? |
411 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) { | 509 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) { |
412 push(@failedexons,$group); | 510 push(@failedexons,$group); |
413 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]); | 511 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]); |
512 $failednames{$group} = $c[0].'-'.$c[1].'-'.$c[2]; | |
414 } | 513 } |
415 ## store exon | 514 ## store exon |
416 push(@allexons,$group); | 515 push(@allexons,$group); |
516 $allnames{$group} = $c[0].'-'.$c[1].'-'.$c[2]; | |
517 if (!exists($opts{'r'}) && !exists($opts{'s'}) && !exists($opts{'S'}) && exists($opts{'A'})) { | |
518 ## no need for barplots | |
519 next; | |
520 } | |
417 ## extract and check gene | 521 ## extract and check gene |
418 $group =~ s/^(\S+)[\|\s](.+)/$1/; | 522 my $gene = $group; |
419 if ($group ne $currgroup ) { | 523 $gene =~ s/^chr\S+: (\S+)[\|\s](.+)/$1/; |
524 if ($gene ne $currgroup ) { | |
420 if ($currgroup ne '') { | 525 if ($currgroup ne '') { |
421 # new gene, make plot. | 526 # new gene, make plot. |
422 open OUT, ">$wd/Rout/barplot.R"; | 527 open OUT, ">$wd/Rout/barplot.R"; |
423 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | 528 print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; |
424 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; | 529 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; |
425 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; | 530 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; |
426 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; | 531 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; |
427 print OUT 'coverage[coverage < 1] <- 1'."\n"; | 532 print OUT 'coverage[coverage < 1] <- 1'."\n"; |
428 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; | 533 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; |
455 else { | 560 else { |
456 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); | 561 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); |
457 } | 562 } |
458 | 563 |
459 } | 564 } |
460 $currgroup = $group; | 565 $currgroup = $gene; |
461 $startline = $linecounter; | 566 $startline = $linecounter; |
462 } | 567 } |
463 $stopline = $linecounter; | 568 $stopline = $linecounter; |
464 } | 569 } |
465 close IN; | 570 close IN; |
466 if ($currgroup ne '') { | 571 if ($currgroup ne '') { |
467 # last gene, make plot. | 572 # last gene, make plot. |
468 open OUT, ">$wd/Rout/barplot.R"; | 573 open OUT, ">$wd/Rout/barplot.R"; |
469 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | 574 print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; |
470 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; | 575 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; |
471 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; | 576 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; |
472 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; | 577 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; |
473 print OUT 'coverage[coverage < 1] <- 1'."\n"; | 578 print OUT 'coverage[coverage < 1] <- 1'."\n"; |
474 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; | 579 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; |
500 } | 605 } |
501 ## print to TEX | 606 ## print to TEX |
502 open OUT, ">>$wd/Report/Report.tex"; | 607 open OUT, ">>$wd/Report/Report.tex"; |
503 print OUT '\subsection*{Gene Summaries}'."\n"; | 608 print OUT '\subsection*{Gene Summaries}'."\n"; |
504 print OUT '\underline{Legend:} \\\\'."\n"; | 609 print OUT '\underline{Legend:} \\\\'."\n"; |
505 print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n"; | 610 print OUT '{\color{red}\textbf{RED:} Average coverage did not reach set threshold of '.$thresh.'} \\\\'."\n"; |
506 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n"; | 611 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon (section with zero coverage found). Overruled by red.} \\\\' ."\n"; |
507 $col = 1; | 612 $col = 1; |
508 foreach (@small) { | 613 foreach (@small) { |
509 if ($col > 2) { | 614 if ($col > 2) { |
510 $col = 1; | 615 $col = 1; |
511 print OUT "\n"; | 616 print OUT "\n"; |
538 # tex section header | 643 # tex section header |
539 open TEX, ">>$wd/Report/Report.tex"; | 644 open TEX, ">>$wd/Report/Report.tex"; |
540 print TEX '\subsection*{Failed Exon Plots}'."\n"; | 645 print TEX '\subsection*{Failed Exon Plots}'."\n"; |
541 $col = 1; | 646 $col = 1; |
542 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n"; | 647 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n"; |
543 foreach(@failedregions) { | 648 foreach(sort(keys(%failednames)) ) { |
649 #foreach(@failedregions) { | |
544 if ($col > 2) { | 650 if ($col > 2) { |
545 $col = 1; | 651 $col = 1; |
546 print TEX "\n"; | 652 print TEX "\n"; |
547 } | 653 } |
548 # which exon | 654 # which exon |
549 my $region = $_; | 655 my $group = $_; |
550 my $exon = $filehash{$region}{'exon'}; | 656 my ($region,$name) = split(/: /,$group); |
657 #my $region = $failednames{$_}; | |
658 my $exon = $filehash{$group}{'exon'}; | |
551 # link exon to tmp file | 659 # link exon to tmp file |
552 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; | 660 my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt"; |
553 ## determine transcript orientation and location | 661 ## determine transcript orientation and location |
554 my $firstline = `head -n 1 $exonfile`; | 662 my $firstline = `head -n 1 $exonfile`; |
555 my @firstcols = split(/\t/,$firstline); | 663 my @firstcols = split(/\t/,$firstline); |
556 my $orient = $firstcols[5]; | 664 my $orient = $firstcols[5]; |
557 my $genomicchr = $firstcols[0]; | 665 my $genomicchr = $firstcols[0]; |
574 | 682 |
575 my $width = 480 ; | 683 my $width = 480 ; |
576 my $height = 240 ; | 684 my $height = 240 ; |
577 my $exonstr = $exon; | 685 my $exonstr = $exon; |
578 $exonstr =~ s/\s/_/g; | 686 $exonstr =~ s/\s/_/g; |
687 $exonstr =~ s/:/_/g; | |
579 $exon =~ s/_/ /g; | 688 $exon =~ s/_/ /g; |
580 $exon =~ s/\|/ /g; | 689 $exon =~ s/\|/ /g; |
690 $exon =~ s/chr.*: (.*)$/$1/; | |
581 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; | 691 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; |
582 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; | 692 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; |
583 if ($orient eq '-') { | 693 if ($orient eq '-') { |
584 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; | 694 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; |
585 print OUT 'mtext("'.$subtitle.'")'."\n"; | 695 print OUT 'mtext("'.$subtitle.'")'."\n"; |
622 } | 732 } |
623 elsif (exists($opts{'A'})) { | 733 elsif (exists($opts{'A'})) { |
624 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n"; | 734 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n"; |
625 } | 735 } |
626 $col = 1; | 736 $col = 1; |
627 foreach(@allregions) { | 737 foreach(sort(keys(%allnames))) { |
738 | |
739 #foreach(@allregions) { | |
628 if ($col > 2) { | 740 if ($col > 2) { |
629 $col = 1; | 741 $col = 1; |
630 print TEX "\n"; | 742 print TEX "\n"; |
631 } | 743 } |
744 my $group = $_; | |
745 my ($region,$name) = split(/: /,$group); | |
632 # which exon | 746 # which exon |
633 my $region = $_; | 747 #my $region = $_; |
634 my $exon = $filehash{$region}{'exon'}; | 748 #my $region = $allnames{$_}; |
749 my $exon = $filehash{$group}{'exon'}; | |
635 # grep exon to tmp file | 750 # grep exon to tmp file |
636 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; | 751 my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt"; |
637 ## determine transcript orientation. | 752 ## determine transcript orientation. |
638 my $firstline = `head -n 1 $exonfile`; | 753 my $firstline = `head -n 1 $exonfile`; |
639 my @firstcols = split(/\t/,$firstline); | 754 my @firstcols = split(/\t/,$firstline); |
640 my $orient = $firstcols[5]; | 755 my $orient = $firstcols[5]; |
641 my $genomicchr = $firstcols[0]; | 756 my $genomicchr = $firstcols[0]; |
671 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n"; | 786 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n"; |
672 my $width = 480 ; | 787 my $width = 480 ; |
673 my $height = 240 ; | 788 my $height = 240 ; |
674 my $exonstr = $exon; | 789 my $exonstr = $exon; |
675 $exonstr =~ s/\s/_/g; | 790 $exonstr =~ s/\s/_/g; |
791 $exonstr =~ s/:/_/g; | |
676 $exon =~ s/_/ /g; | 792 $exon =~ s/_/ /g; |
677 $exon =~ s/\|/ /g; | 793 $exon =~ s/\|/ /g; |
794 $exon =~ s/^chr.*: (.*)$/$1/; | |
678 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; | 795 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; |
679 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; | 796 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; |
680 if ($orient eq '-') { | 797 if ($orient eq '-') { |
681 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; | 798 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; |
682 print OUT 'mtext("'.$subtitle.'")'."\n"; | 799 print OUT 'mtext("'.$subtitle.'")'."\n"; |
737 my $genomicchr = $firstcols[0]; | 854 my $genomicchr = $firstcols[0]; |
738 my $genomicstart = $firstcols[1]; | 855 my $genomicstart = $firstcols[1]; |
739 my $genomicstop = $firstcols[2]; | 856 my $genomicstop = $firstcols[2]; |
740 | 857 |
741 if ($orient eq '+') { | 858 if ($orient eq '+') { |
742 $bps = $genomicstop - $genomicstart + 1; | 859 #$bps = $genomicstop - $genomicstart + 1; |
743 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); | 860 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); |
744 | 861 |
745 } | 862 } |
746 else { | 863 else { |
747 $bps = $genomicstop - $genomicstart + 1; | 864 #$bps = $genomicstop - $genomicstart + 1; |
748 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); | 865 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); |
749 } | 866 } |
750 | 867 |
751 # check if failed | 868 # check if failed |
752 my $cs = `cut -f $covcol '$exonfile' `; | 869 my $cs = `cut -f $covcol '$exonfile' `; |
794 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/"); | 911 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/"); |
795 } | 912 } |
796 | 913 |
797 system("cd $wd && tar czf '$tarfile' Results/"); | 914 system("cd $wd && tar czf '$tarfile' Results/"); |
798 ## clean up (galaxy stores outside wd) | 915 ## clean up (galaxy stores outside wd) |
799 system("rm -Rf $wd"); | 916 #system("rm -Rf $wd"); |
800 ############### | 917 ############### |
801 ## FUNCTIONS ## | 918 ## FUNCTIONS ## |
802 ############### | 919 ############### |
803 sub arraystats{ | 920 sub arraystats{ |
804 my @array = @_; | 921 my @array = @_; |
805 my $count = scalar(@array); | 922 my $count = scalar(@array); |
923 if ($count == 0 ) { | |
924 return (0,0,0,0,0,0,0); | |
925 } | |
806 @array = sort { $a <=> $b } @array; | 926 @array = sort { $a <=> $b } @array; |
807 # median | 927 # median |
808 my $median = 0; | 928 my $median = 0; |
809 if ($count % 2) { | 929 if ($count % 2) { |
810 $median = $array[int($count/2)]; | 930 $median = $array[int($count/2)]; |
822 my $min = $array[0]; | 942 my $min = $array[0]; |
823 my $max = $array[($count-1)]; | 943 my $max = $array[($count-1)]; |
824 return ($average,$median,$min,$max,$first,$third,$sum); | 944 return ($average,$median,$min,$max,$first,$third,$sum); |
825 } | 945 } |
826 | 946 |
827 sub GlobalSummary { | 947 sub GetStats { |
828 my ($bam,$targets) = @_; | 948 my ($aref) = @_; |
829 | 949 if (scalar(@$aref) == 0) { |
830 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; | 950 return qw/0 0/; |
831 if (exists($commandsrun{$command})) { | 951 } |
832 return; | 952 # median |
833 } | 953 my @s = sort {$a <=> $b } @$aref; |
834 system($command); | 954 my $nrzero = 0; |
835 $commandsrun{$command} = 1; | 955 my $len = scalar(@s); |
836 } | 956 for (my $i = 0; $i< $len;$i++) { |
837 | 957 if ($s[$i] == 0) { |
838 sub CoveragePerRegion { | 958 $nrzero++; |
839 my ($bam,$targets) = @_; | 959 } |
840 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; | 960 else { |
841 if (exists($commandsrun{$command})) { | 961 last; |
842 return; | 962 } |
843 } | 963 } |
844 system($command); | 964 my $nrcov = $len - $nrzero; |
845 $commandsrun{$command} = 1; | 965 # avg |
846 } | 966 my $avg = 0; |
967 foreach (@s) { $avg += $_ }; | |
968 $avg = sprintf("%.1f",($avg / scalar(@s))); | |
969 return($avg,$len,$nrcov); | |
970 } | |
971 | |
847 | 972 |
848 sub SubRegionCoverage { | 973 sub SubRegionCoverage { |
849 my ($bam,$targets) = @_; | 974 my ($bam,$targets) = @_; |
850 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage"; | 975 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage"; |
851 system($command); | 976 system($command); |