| Previous changeset 0:ea32a329aced (2013-09-05) Next changeset 2:a9555d172896 (2014-02-13) |
|
Commit message:
Initial Uploaded |
|
added:
CoverageReport.pl |
| b |
| diff -r ea32a329aced -r 864d0ccfbe6f CoverageReport.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CoverageReport.pl Thu Sep 05 02:37:03 2013 -0400 |
| [ |
| b'@@ -0,0 +1,837 @@\n+#!/usr/bin/perl\n+\n+# load modules\n+use Getopt::Std;\n+use File::Basename;\n+use Number::Format;\n+\n+# number format\n+my $de = new Number::Format(-thousands_sep =>\',\',-decimal_point => \'.\');\n+\n+##########\n+## opts ##\n+##########\n+## input files\n+# b : path to input (b)am file\n+# t : path to input (t)arget regions in BED format\n+## output files\n+# o : report pdf (o)utput file\n+# z : all plots and tables in tar.g(z) format\n+## entries in the report\n+# r : Coverage per (r)egion (boolean)\n+# s : (s)ubregion coverage if average < specified (plots for positions along target region) (boolean)\n+# S : (S)ubregion coverage for ALL failed exons => use either s OR S or you will have double plots. \n+# A : (A)ll exons will be plotted. \n+# L : (L)ist failed exons instead of plotting\n+# m : (m)inimal Coverage threshold\n+# f : fraction of average as threshold\n+# n : sample (n)ame.\n+ \n+\n+getopts(\'b:t:o:z:rsSALm:n:f:\', \\%opts) ;\n+\n+# make output directory in (tmp) working dir\n+our $wd = "/tmp/Coverage.".int(rand(1000));\n+while (-d $wd) {\n+\t$wd = "/tmp/Coverage.".int(rand(1000));\n+}\n+system("mkdir $wd");\n+\n+## variables\n+our %commandsrun = ();\n+\n+if (!exists($opts{\'b\'}) || !-e $opts{\'b\'}) {\n+\tdie(\'Bam File not found\');\n+}\n+if (!exists($opts{\'t\'}) || !-e $opts{\'t\'}) {\n+\tdie(\'Target File (BED) not found\');\n+}\n+\n+if (exists($opts{\'m\'})) {\n+\t$thresh = $opts{\'m\'};\n+}\n+else {\n+\t$thresh = 40;\n+}\n+\n+if (exists($opts{\'f\'})) {\n+\t$frac = $opts{\'f\'};\n+}\n+else {\n+\t$frac = 0.2;\n+}\n+\n+if (exists($opts{\'o\'})) {\n+\t$pdffile = $opts{\'o\'};\n+}\n+else {\n+\t$pdffile = "$wd/CoverageReport.pdf";\n+}\n+\n+if (exists($opts{\'z\'})) {\n+\t$tarfile = $opts{\'z\'};\n+}\n+else {\n+\t$tarfile = "$wd/Results.tar.gz";\n+}\n+\n+# 1. Global Summary => default\n+&GlobalSummary($opts{\'b\'}, $opts{\'t\'});\n+\n+# 2. Coverage per position\n+&SubRegionCoverage($opts{\'b\'}, $opts{\'t\'});\n+our %filehash;\n+if (exists($opts{\'s\'}) || exists($opts{\'S\'}) || exists($opts{\'A\'}) || exists($opts{\'L\'})) {\n+\tsystem("mkdir $wd/SplitFiles");\n+\t## get position coverages\n+\t## split input files\n+\topen IN, "$wd/Targets.Position.Coverage";\n+\tmy $fileidx = 0;\n+\tmy $currreg = \'\';\n+\twhile (<IN>) {\n+\t\tmy $line = $_;\n+\t\tchomp($line);\n+\t\tmy @p = split(/\\t/,$line);\n+\t\tmy $reg = $p[0].\'-\'.$p[1].\'-\'.$p[2]; #.$p[3];\n+\t\tmy $ex = $p[3];\n+\t\tif ($reg ne $currreg) {\n+\t\t\t## new exon open new outfile\n+\t\t\tif ($currreg ne \'\') {\n+\t\t\t\t## filehandle is open. close it\n+\t\t\t\tclose OUT; \n+\t\t\t}\n+\t\t\tif (!exists($filehash{$reg})) {\n+\t\t\t\t$fileidx++;\n+\t\t\t\t$filehash{$reg}{\'idx\'} = $fileidx;\n+\t\t\t\t$filehash{$reg}{\'exon\'} = $ex;\n+\t\t\t\topen OUT, ">> $wd/SplitFiles/File_$fileidx.txt";\n+\t\t\t\t$currreg = $reg;\n+\t\t\t}\n+\t\t\telse {\n+\t\t\t\topen OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{\'idx\'}.".txt";\n+\t\t\t\t$currreg = $reg;\n+\t\t\t}\n+\t\t}\n+\t\t## print the line to the open filehandle. \t\n+\t\tprint OUT "$line\\n";\n+\t}\n+\tclose OUT;\n+\tclose IN;\n+\n+}\n+\n+## sort output files according to targets file\n+if (exists($opts{\'r\'}) ) {\n+\tmy %hash = ();\n+\topen IN, "$wd/Targets.Global.Coverage";\n+\twhile (<IN>) {\n+\t\tmy @p = split(/\\t/,$_) ;\n+\t\t$hash{$p[3]} = $_;\n+\t}\n+\tclose IN;\n+\topen OUT, ">$wd/Targets.Global.Coverage";\n+\topen IN, $opts{\'t\'};\n+\twhile (<IN>) {\n+\t\tmy @p = split(/\\t/,$_) ;\n+\t\tprint OUT $hash{$p[3]};\n+\t}\n+\tclose IN;\n+\tclose OUT;\n+}\n+\n+\n+####################################\n+## PROCESS RESULTS & CREATE PLOTS ##\n+####################################\n+system("mkdir $wd/Report");\n+\n+system("mkdir $wd/Rout");\n+system("mkdir $wd/Plots");\n+\n+$samplename = $opts{\'n\'};\n+$samplename =~ s/_/\\\\_/g;\n+\n+# 0. Preamble\n+## compose preamble\n+open OUT, ">$wd/Report/Report.tex";\n+print OUT \'\\documentclass[a4paper,10pt]{article}\'."\\n";\n+print OUT \'\\usepackage[left=2cm,top=1.5cm,right=1.5cm,bottom=2.5cm,nohead]{geometry}\'."\\n";\n+print OUT \'\\usepackage{longtable}\'."\\n";\n+print OUT \'\\usepackage[T1]{fontenc}\'."\\n";\n+print OUT \'\\usepackage{fancyhdr}\'."\\n";\n+print OUT \'\\usepackage[latin9]{inputenc}\'."\\n";\n+print OUT \'\\usepackage{color}\'."\\n";\n+print OUT \'\\usepackage[pdftex]{graphicx}\'."\\n";\n+print '..b'f{Target Name} & \\textbf{Genomic Position} & \\textbf{Avg.Coverage} & \\textbf{Min.Coverage} \\\\\\\\\'."\\n".\'\\hline\'."\\n";\n+\tprint TEX \'\\endhead\'."\\n";\n+\tprint TEX \'\\hline \'."\\n".\'\\multicolumn{4}{r}{{\\textsl{\\footnotesize Continued on next page}}} \\\\\\\\ \'."\\n".\'\\hline\' ."\\n". \'\\endfoot\' . "\\n". \'\\endlastfoot\' . "\\n";\n+ \n+\t$col = 1;\n+\topen IN, "$wd/Targets.Global.Coverage";\n+\twhile (<IN>) {\n+\t\tchomp($_);\n+\t\tmy @p = split(/\\t/,$_);\n+\t\tmy $region = $p[0].\'-\'.$p[1].\'-\'.$p[2];\n+\t\tmy $exon = $filehash{$region}{\'exon\'};\n+\t\t# grep exon to tmp file\n+\t\tmy $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{\'idx\'}.".txt";\n+\t\t## determine transcript orientation.\n+ my $firstline = `head -n 1 $exonfile`;\n+ my @firstcols = split(/\\t/,$firstline);\n+ my $orient = $firstcols[5];\n+\t\tmy $genomicchr = $firstcols[0];\n+\t\tmy $genomicstart = $firstcols[1];\n+\t\tmy $genomicstop = $firstcols[2];\n+\n+\t\tif ($orient eq \'+\') {\n+\t\t\t$bps = $genomicstop - $genomicstart + 1;\n+\t\t\t$subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);\n+\n+\t\t}\n+\t\telse {\n+\t\t\t$bps = $genomicstop - $genomicstart + 1;\n+\t\t\t$subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);\n+\t\t}\n+\n+\t\t# check if failed\n+\t\tmy $cs = `cut -f $covcol \'$exonfile\' `;\n+\t\tmy @c = split(/\\n/,$cs);\n+\t\tmy ($avg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@c);\n+\t\t\n+\t\tif ($min >= $thresh) {\n+\t\t\t# lowest coverage > threshold => skip\n+\t\t\tnext;\n+\t\t}\n+\t\t\n+\t\t# print to .tex table \n+\t\tif (length($exon) > 30) {\n+\t\t\t$exon = substr($exon,0,27) . \'...\';\n+\t\t}\n+\t\t$exon =~ s/_/ /g;\n+\t\t$exon =~ s/\\|/ /g;\n+\n+\t\tprint TEX "$exon & $subtitle & ".int($avg)." & $min ".\'\\\\\\\\\'."\\n"; \n+\t}\n+\tclose IN;\n+\tprint TEX \'\\hline\'."\\n";\n+\tprint TEX \'\\end{longtable}}\'."\\n";\n+\tclose TEX;\n+}\n+\n+\n+## Close document\n+open OUT, ">>$wd/Report/Report.tex";\n+print OUT \'\\label{endofdoc}\'."\\n";\n+print OUT \'\\end{document}\'."\\n";\n+close OUT;\n+system("cd $wd/Report && pdflatex Report.tex > /dev/null 2>&1 && pdflatex Report.tex > /dev/null 2>&1 ");\n+\n+## mv report to output file\n+system("cp -f $wd/Report/Report.pdf \'$pdffile\'");\n+##create tar.gz file\n+system("mkdir $wd/Results");\n+system("cp -Rf $wd/Plots $wd/Results/");\n+system("cp -Rf $wd/Report/ $wd/Results/");\n+if (-e "$wd/Targets.Global.Coverage") {\n+\tsystem("cp -Rf $wd/Targets.Global.Coverage $wd/Results/");\n+}\n+if (-e "$wd/Targets.Position.Coverage") {\n+\tsystem("cp -Rf $wd/Targets.Position.Coverage $wd/Results/");\n+}\n+\n+system("cd $wd && tar czf \'$tarfile\' Results/");\n+## clean up (galaxy stores outside wd)\n+system("rm -Rf $wd");\n+###############\n+## FUNCTIONS ##\n+###############\n+sub arraystats{\n+\tmy @array = @_;\n+\tmy $count = scalar(@array);\n+\t@array = sort { $a <=> $b } @array;\n+\t# median\n+\tmy $median = 0;\n+\tif ($count % 2) { \n+\t\t$median = $array[int($count/2)]; \n+\t} else { \n+\t\t$median = ($array[$count/2] + $array[$count/2 - 1]) / 2; \n+\t} \n+\t# average \n+\tmy $sum = 0;\n+\tforeach (@array) { $sum += $_; } \n+\tmy $average = $sum / $count;\n+\t# quantiles (rounded)\n+\tmy $quart = int($count/4) ;\t\n+\tmy $first = $array[$quart];\n+\tmy $third = $array[($quart*3)];\n+\tmy $min = $array[0];\n+\tmy $max = $array[($count-1)];\n+\treturn ($average,$median,$min,$max,$first,$third,$sum);\n+}\n+\n+sub GlobalSummary {\n+\tmy ($bam,$targets) = @_;\n+\n+\tmy $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";\n+\tif (exists($commandsrun{$command})) {\n+\t\treturn;\n+\t}\n+\tsystem($command);\n+\t$commandsrun{$command} = 1;\n+}\n+\n+sub CoveragePerRegion {\n+\tmy ($bam,$targets) = @_;\n+\tmy $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";\n+\tif (exists($commandsrun{$command})) {\n+\t\treturn;\n+\t}\n+\tsystem($command);\n+\t$commandsrun{$command} = 1;\n+}\n+\n+sub SubRegionCoverage {\n+\tmy ($bam,$targets) = @_;\n+\tmy $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage";\n+\tsystem($command);\n+\t$commandsrun{$command} = 1;\n+}\n+\n' |