Mercurial > repos > yusuf > depth_report
comparison depth_reports @ 0:6b2e640c8c6d
initial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 13:31:40 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6b2e640c8c6d |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use Bio::DB::Sam; | |
| 4 use GD::Graph::bars; | |
| 5 use File::Basename; | |
| 6 use strict; | |
| 7 use warnings; | |
| 8 use vars qw($good_homo_coverage); | |
| 9 | |
| 10 $good_homo_coverage = 3; | |
| 11 | |
| 12 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | |
| 13 print "Version 1.0\n"; | |
| 14 exit; | |
| 15 } | |
| 16 | |
| 17 # configuration file parsing/loading | |
| 18 my %config; | |
| 19 my $dirname = dirname(__FILE__); | |
| 20 my $tool_data = shift @ARGV; | |
| 21 if(not -e "$tool_data/depth_report.loc"){ | |
| 22 system("cp $dirname/tool-data/depth_report.loc $tool_data/depth_report.loc"); | |
| 23 } | |
| 24 | |
| 25 open CONFIG, '<', "$tool_data/depth_report.loc"; | |
| 26 while(<CONFIG>){ | |
| 27 (my $key, my $value) = split(/\s+/, $_ ); | |
| 28 $config{$key} = $value; | |
| 29 } | |
| 30 close CONFIG; | |
| 31 | |
| 32 my $quiet = 0; | |
| 33 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ | |
| 34 $quiet = 1; | |
| 35 shift @ARGV; | |
| 36 } | |
| 37 | |
| 38 my $good_coverage_basic = 20; | |
| 39 @ARGV == 7 or @ARGV == 8 or @ARGV == 9 | |
| 40 or die "Usage: $0 [-q(uiet)] <mapped reads.bam> <targeted regions.bed> <out_summary_table> <out_poor_coverage_bed> <out_depth_graph> <out_depth_table> <out_mapped_percentile_table> [input_dud_regions.bed] [chr#]\n"; | |
| 41 | |
| 42 if(not -e $ARGV[0]){ | |
| 43 die "The specified BAM read alignment file ($ARGV[0]) does not exist\n"; | |
| 44 } | |
| 45 if(not -r $ARGV[0]){ | |
| 46 die "The specified BAM read alignment file ($ARGV[0]) is not readable\n"; | |
| 47 } | |
| 48 | |
| 49 if(@ARGV == 7 or $ARGV[7] eq "None"){ | |
| 50 $ARGV[7] = "/dev/null"; | |
| 51 } | |
| 52 if(not -r $ARGV[7]){ | |
| 53 die "The specified dud regions BED file ($ARGV[7]) is not readable\n"; | |
| 54 } | |
| 55 | |
| 56 my $target_contig; | |
| 57 if(@ARGV == 9){ | |
| 58 $target_contig = $ARGV[8]; # for debugging or whole genomes | |
| 59 } | |
| 60 | |
| 61 my $bed_file = $config{"capture_kits_dir"} . $ARGV[1]; | |
| 62 my $targeted_total = 0; | |
| 63 my $targeted_regions = 0; | |
| 64 my $targeted_coverage = 0; | |
| 65 my %regions; # contigname => %region{start+-end} | |
| 66 open(BED, $bed_file ) | |
| 67 or die "Cannot open target regions BED file $bed_file for reading: $!\n"; | |
| 68 | |
| 69 open(STATS, ">$ARGV[2]") | |
| 70 or die "Cannot open summary table $ARGV[2] for writing: $!\n"; | |
| 71 open(POOR, ">$ARGV[3]") | |
| 72 or die "Cannot open poor coverage BED file $ARGV[3] for writing: $!\n"; | |
| 73 open(HISTOGRAM, ">$ARGV[4]") | |
| 74 or die "Cannot open cumulative depth graph $ARGV[4] for writing: $!\n"; | |
| 75 open(COVER, ">$ARGV[5]") | |
| 76 or die "Cannot open read depth table $ARGV[5] for writing: $!\n"; | |
| 77 open(PERCENTILE, ">$ARGV[6]") | |
| 78 or die "Cannot open percentile table $ARGV[6] for writing: $!\n"; | |
| 79 print PERCENTILE "# Percentile of mapped bases\tNum reference targeted positions covered\n"; | |
| 80 | |
| 81 print STATS "# Summary statistics for BAM file $ARGV[0] using targeted regions from $bed_file\n" unless $quiet; | |
| 82 | |
| 83 print STDERR "Reading in target regions from BED file\n" unless $quiet; | |
| 84 while(<BED>){ | |
| 85 next if /^\s*(?:track\s|#)/; | |
| 86 tr/\r//d; | |
| 87 chomp; | |
| 88 my @fields = split /\t/, $_; | |
| 89 next if defined $target_contig and $fields[0] ne $target_contig; | |
| 90 if(not exists $regions{$fields[0]}){ | |
| 91 $regions{$fields[0]} = {}; | |
| 92 } | |
| 93 $regions{$fields[0]}->{$fields[1].":".$fields[2]} = 0; | |
| 94 $targeted_regions++; | |
| 95 $targeted_total += $fields[2]-$fields[1]+1; | |
| 96 } | |
| 97 close(BED); | |
| 98 | |
| 99 | |
| 100 print STDERR "Reading in dud regions from BED file\n" unless $quiet; | |
| 101 my %duds; # contigname => %region{start+-end} | |
| 102 my $duds_total = 0; | |
| 103 open(DUDS, $ARGV[7]) | |
| 104 or die "Cannot open dud regions BED file $ARGV[7] for reading: $!\n"; | |
| 105 while(<DUDS>){ | |
| 106 next if /^\s*#/; | |
| 107 chomp; | |
| 108 my @fields = split /\t/, $_; | |
| 109 next if defined $target_contig and $fields[0] ne $target_contig; | |
| 110 if(not exists $regions{$fields[0]}->{$fields[1].$fields[5].$fields[2]}){ | |
| 111 die "The duds file includes a region ($fields[0]:$fields[1]$fields[5]$fields[2]) ", | |
| 112 "not listed in the targeted regions BED file, ", | |
| 113 "please make sure the two are synched.\n"; | |
| 114 } | |
| 115 if(not exists $duds{$fields[0]}){ | |
| 116 $duds{$fields[0]} = {}; | |
| 117 } | |
| 118 $duds{$fields[0]}->{$fields[1].$fields[5].$fields[2]} = 1; | |
| 119 $duds_total += $fields[2]-$fields[1]+1; | |
| 120 } | |
| 121 close(DUDS); | |
| 122 | |
| 123 | |
| 124 my $tick_count = int($targeted_total/100); # for progress bar | |
| 125 | |
| 126 # Read the alignment info | |
| 127 # Load the BAM file | |
| 128 my $sam = Bio::DB::Sam->new(-bam => $ARGV[0], -autoindex => 1); | |
| 129 my $bam_header = $sam->bam->header; | |
| 130 my $contig_names = $bam_header->target_name; | |
| 131 | |
| 132 # Make sure the BED file and reference sequence refer to the same contigs | |
| 133 for my $contig_name (@$contig_names){ | |
| 134 next if defined $target_contig and $contig_name ne $target_contig; | |
| 135 if(not exists $regions{$contig_name}){ | |
| 136 print STATS "#Warning: The BED file makes no reference to the BAM's $contig_name reference sequence, ", | |
| 137 "no reporting will be done for this contig's mappings\n" unless $quiet; | |
| 138 } | |
| 139 } | |
| 140 | |
| 141 print STATS "Total number of targeted reference contigs\t", scalar(keys %regions), "\n"; | |
| 142 print STATS "Total number of targeted regions\t$targeted_regions\n"; | |
| 143 print STATS "Total number of targeted nucleotide bases\t$targeted_total\n"; | |
| 144 print STATS "Total number of targeted bases considered pre-emptively as duds\t$duds_total\n"; | |
| 145 | |
| 146 # Heuristic | |
| 147 my $isMale = &isMale($sam); | |
| 148 #print STDERR "Male: $isMale\n" unless $quiet; | |
| 149 | |
| 150 # Check each BED region for coverage stats | |
| 151 print STDERR "Reading coverage data from BAM file (each dot represents $tick_count bases)\n" unless $quiet; | |
| 152 print STDERR "0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet; | |
| 153 my $base_count = 0; | |
| 154 my @coverages; | |
| 155 for my $contig_name (sort keys %regions){ | |
| 156 next if defined $target_contig and $contig_name ne $target_contig; | |
| 157 my $c_name = $contig_name; | |
| 158 if(not grep {$_ eq $c_name} @$contig_names){ | |
| 159 $c_name =~ s/^chr//; | |
| 160 if(not grep {$_ eq $c_name} @$contig_names){ | |
| 161 print STATS "#Warning: The BAM file makes no reference to the BED's $contig_name reference sequence, ", | |
| 162 "no reporting will be done for this contig's mappings\n" unless $quiet; | |
| 163 next; | |
| 164 } | |
| 165 } | |
| 166 my $good_coverage = $good_coverage_basic; | |
| 167 $good_coverage = $good_homo_coverage if $contig_name =~ /X$/ and $isMale; # males are hemizygous for X, so adjust the poor coverage threshold accordingly | |
| 168 for my $range_key (keys %{$regions{$contig_name}}){ | |
| 169 #print STDERR "Processing range $range_key\n"; | |
| 170 my ($range_min, $range_max) = $range_key =~ /^(\d+):(\d+)$/; | |
| 171 my @cov = $sam->get_features_by_location(-seq_id => $c_name, | |
| 172 -type => "coverage", | |
| 173 -start => $range_min, | |
| 174 -end => $range_max); | |
| 175 if(not @cov){ | |
| 176 if(not $quiet){ | |
| 177 die "No BAM data for $c_name (", $range_min,",", | |
| 178 $range_max, "). No coverage data returned\n"; | |
| 179 } | |
| 180 next; | |
| 181 } | |
| 182 @cov = $cov[0]->coverage; | |
| 183 | |
| 184 if(@cov == 0){ | |
| 185 @cov = (0) x ($range_max-$range_min+1); | |
| 186 } | |
| 187 | |
| 188 # Gather min, Q1, median, Q3, max | |
| 189 $regions{$contig_name}->{$range_key} = []; | |
| 190 my $ignore = exists $duds{$contig_name}->{$range_key}; | |
| 191 my $low_coverage_start = -1; | |
| 192 my @low_covs; | |
| 193 for(my $i = 0; $i <= $#cov; $i++){ | |
| 194 $targeted_coverage += $cov[$i]; | |
| 195 if(not $ignore){ | |
| 196 $coverages[$base_count++] = $cov[$i]; | |
| 197 if($cov[$i] >= $good_coverage){ | |
| 198 if(@low_covs){ | |
| 199 &print_low($contig_name, \@low_covs, $range_min, $low_coverage_start); | |
| 200 @low_covs = (); # clear! | |
| 201 } | |
| 202 } | |
| 203 else{ | |
| 204 if(not @low_covs){ | |
| 205 # starting a low coverage section | |
| 206 $low_coverage_start = $i; | |
| 207 } | |
| 208 # else continuing a low coverage section | |
| 209 push @low_covs, $cov[$i]; | |
| 210 } | |
| 211 } | |
| 212 print STDERR "." if not $quiet and $base_count%$tick_count == 0; | |
| 213 } | |
| 214 if(@low_covs){ | |
| 215 &print_low($contig_name, \@low_covs, $range_min, $low_coverage_start); | |
| 216 } | |
| 217 } | |
| 218 } | |
| 219 print STDERR "$base_count/",($targeted_total-$duds_total),"\n" unless $quiet; | |
| 220 | |
| 221 print STATS "Total number of bases mapped to targeted regions\t$targeted_coverage\n"; | |
| 222 | |
| 223 my $percentile_size = int($targeted_coverage/100); | |
| 224 | |
| 225 # Generate a cumulative distribution of coverage | |
| 226 print STDERR "Sorting mapped read depth statistics\n" unless $quiet; | |
| 227 @coverages = sort {$a <=> $b} @coverages; | |
| 228 | |
| 229 print STATS "Minimum coverage in targeted regions\t$coverages[0]\n"; | |
| 230 print STATS "Median coverage in targeted regions\t", $coverages[int($#coverages/2)], "\n"; | |
| 231 print STATS "Mean coverage in targeted regions\t", ($targeted_coverage/$targeted_total), "\n"; | |
| 232 print STATS "Maximum coverage in targeted regions\t", $coverages[$#coverages], "\n"; | |
| 233 | |
| 234 # Estimate the false negative rates | |
| 235 print STDERR "Estimating SNP discovery sensitivity...\n" unless $quiet; | |
| 236 print COVER "# Mapped read depth\tNumber of bases\n"; | |
| 237 my $last_coverage = 0; | |
| 238 my $coverage_count = 0; | |
| 239 my $homo_misses = 0; | |
| 240 my $het_misses = 0; | |
| 241 my @depth_labels; | |
| 242 my @cumu_pct; | |
| 243 $#coverages = int($#coverages*0.98); # Look at percentiles 0-98, otherwise chart is too big due to long tail | |
| 244 for my $c (@coverages){ | |
| 245 if($c != $last_coverage){ | |
| 246 if($last_coverage != 0){ | |
| 247 # fill in any missing values for the x axis of the cumulative distribution graph | |
| 248 for(my $i = $#depth_labels+1; $i < $last_coverage; $i++){ | |
| 249 push @depth_labels, ($i%10?"":$i); | |
| 250 push @cumu_pct, $cumu_pct[$i-1]; | |
| 251 } | |
| 252 } | |
| 253 push @depth_labels, ($last_coverage%10?"":$last_coverage); # label every 10 bars | |
| 254 if($last_coverage == 0){ | |
| 255 push @cumu_pct, $coverage_count/($targeted_total-$duds_total)*100; # cumulative percentage | |
| 256 } | |
| 257 else{ | |
| 258 push @cumu_pct, $cumu_pct[$#cumu_pct]+$coverage_count/($targeted_total-$duds_total)*100; # cumulative percentage | |
| 259 } | |
| 260 print COVER "$last_coverage\t$coverage_count\n"; | |
| 261 if($c <= $good_coverage_basic){ | |
| 262 $homo_misses += false_neg_homo_count($last_coverage, $coverage_count); | |
| 263 $het_misses += false_neg_het_count($last_coverage, $coverage_count); | |
| 264 } | |
| 265 $coverage_count = 0; | |
| 266 $last_coverage = $c; | |
| 267 } | |
| 268 $coverage_count++; | |
| 269 } | |
| 270 $homo_misses = int($homo_misses+0.5); | |
| 271 $het_misses = int($het_misses+0.5); | |
| 272 print STATS "Estimated percentage of homozygous SNPs missed\t", $homo_misses/($targeted_total*0.000358*0.35)*100, "\n"; | |
| 273 print STATS "Estimated number of homozygous SNPs missed\t$homo_misses\n"; | |
| 274 print STATS "Estimated percentage of heterozygous SNPs missed\t", $het_misses/($targeted_total*0.000358*0.65)*100, "\n"; | |
| 275 print STATS "Estimated number of heterozygous SNPs missed\t$het_misses\n"; | |
| 276 | |
| 277 print STDERR "Generating coverage percentiles and false negative/false positive estimates\n" unless $quiet; | |
| 278 | |
| 279 my $percentile_reporting = 1; | |
| 280 my $lt_fold_10 = 0; | |
| 281 my $lt_fold_20 = 0; | |
| 282 my $poor_cutoff_percentile = 5; | |
| 283 my $poor_cutoff_coverage = 0; | |
| 284 my $running_cov_tot; | |
| 285 for (my $i = 0; $i <= $#coverages; $i++){ | |
| 286 $running_cov_tot += $coverages[$i]; | |
| 287 if($coverages[$i] < $good_coverage_basic){ | |
| 288 $lt_fold_20++; | |
| 289 if($coverages[$i] < $good_homo_coverage){ | |
| 290 $lt_fold_10++; | |
| 291 } | |
| 292 } | |
| 293 if($running_cov_tot >= $percentile_reporting*$percentile_size){ | |
| 294 print PERCENTILE "$percentile_reporting\t$i\n"; | |
| 295 $percentile_reporting++; | |
| 296 } | |
| 297 } | |
| 298 close(PERCENTILE); | |
| 299 | |
| 300 print STATS "Total bases with less than $good_homo_coverage-fold coverage\t$lt_fold_10\n"; | |
| 301 print STATS "Total bases with less than $good_coverage_basic-fold coverage\t$lt_fold_20\n"; | |
| 302 close(STATS); | |
| 303 | |
| 304 print STDERR "Generating read depth cumulative distribution graph\n" unless $quiet; | |
| 305 # Using three colors to show trouble coverage levels | |
| 306 my @low_coverage = @cumu_pct[0..$good_homo_coverage]; | |
| 307 my @medium_coverage = (("0") x ($good_homo_coverage-1), @cumu_pct[$good_homo_coverage..($good_coverage_basic-1)]); | |
| 308 for(my $i = 0; $i < $good_coverage_basic; $i++){ | |
| 309 $cumu_pct[$i] = 0; | |
| 310 } | |
| 311 my $graph = new GD::Graph::bars(($#cumu_pct*2)+100, 600); | |
| 312 $graph->set(x_label => "Mapped read depth", | |
| 313 y_label => "Percentage of target reference", | |
| 314 title => "Cumulative distribution of mapped read depth (0 to 98th percentiles)", | |
| 315 cumulate => 1, | |
| 316 transparent => 0, | |
| 317 bar_spacing => 0, | |
| 318 bar_width => 2, | |
| 319 fgclr => "black", | |
| 320 dclrs => ["red", "yellow", "green"], | |
| 321 x_ticks => 0, | |
| 322 long_ticks => 1, | |
| 323 tick_length => 0, | |
| 324 y_tick_number => 10, | |
| 325 y_number_format =>'%d%%', | |
| 326 box_axis => 0) | |
| 327 or die "While setting up chart", $graph->error; | |
| 328 my $gd = $graph->plot([\@depth_labels, \@low_coverage, \@medium_coverage, \@cumu_pct]) or die $graph->error; | |
| 329 binmode HISTOGRAM; | |
| 330 print HISTOGRAM $gd->png; | |
| 331 close(HISTOGRAM); | |
| 332 | |
| 333 # args: read depth, count of bases with this read depth | |
| 334 sub false_neg_homo_count{ | |
| 335 if($_[0] < $good_homo_coverage){ # 0.000358 based on doi:10.1371/journal.pgen.1000160 | |
| 336 return 0.000358*0.35*$_[1]; # 35% of SNPs are homo based on own observations | |
| 337 } | |
| 338 elsif($_[0] > $good_coverage_basic){ | |
| 339 return 0; | |
| 340 } | |
| 341 return (-0.118*log($_[0])+0.27)*0.000358*$_[1]; | |
| 342 } | |
| 343 | |
| 344 sub false_neg_het_count{ | |
| 345 if($_[0] < $good_homo_coverage){ | |
| 346 return 0.000358*0.65*$_[1]; # 65% of SNPs are het based on own observations | |
| 347 } | |
| 348 elsif($_[0] >= $good_coverage_basic){ | |
| 349 return 0; | |
| 350 } | |
| 351 return (-0.0004*($_[0]**2)-0.00048*$_[0]+0.2483)*0.000358*$_[1]; | |
| 352 } | |
| 353 | |
| 354 sub print_low{ | |
| 355 my ($contig_name, $low_covs_ref, $base_pos, $low_offset) = @_; | |
| 356 # exiting a low coverage section, print it, split into poor het call regions, and poor homo call regions | |
| 357 my $start = $base_pos+$low_offset; | |
| 358 my $last_i = 0; | |
| 359 for(my $i = 1; $i <= $#{$low_covs_ref}; $i++){ | |
| 360 if($low_covs_ref->[$i] >= $good_homo_coverage){ | |
| 361 if($low_covs_ref->[$last_i] < $good_homo_coverage){ | |
| 362 my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..($i-1)]; | |
| 363 print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t0\n"; | |
| 364 $last_i = $i; | |
| 365 } | |
| 366 if($i == $#{$low_covs_ref}){ | |
| 367 my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..$i]; | |
| 368 print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t1\n"; | |
| 369 } | |
| 370 # else continuation of a good homo coverage region | |
| 371 } | |
| 372 else{ | |
| 373 if($low_covs_ref->[$last_i] >= $good_homo_coverage or $i == $#{$low_covs_ref}){ | |
| 374 my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..($i-1)]; | |
| 375 print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t1\n"; | |
| 376 $last_i = $i; | |
| 377 } | |
| 378 if($i == $#{$low_covs_ref}){ | |
| 379 my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..$i]; | |
| 380 print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t0\n"; | |
| 381 } | |
| 382 # else continuation of a poor homo region | |
| 383 } | |
| 384 } | |
| 385 } | |
| 386 | |
| 387 sub isMale{ | |
| 388 my ($sam) = @_; | |
| 389 # Local observation: | |
| 390 # A robust measure across whole genome and exome kits (generally not capturing any Y genes) | |
| 391 # to detect female is an exceptionally low ratio for the number of reads mapped to chrY:9200000-9300000 | |
| 392 # (which contains several testes-specific genes), and chrY:13800000-13900000 (which is highly repetitive) | |
| 393 # in hg19 (UCSC). | |
| 394 # This holds regardless of the read depth for the experiment, so should be robust. Females have | |
| 395 # an average ratio of 0.000583843, with a std dev of 0.00069961. We'll set the threshold to 0.004334299 (mu + 3*sigma) | |
| 396 # to be 99% sure it's not a female sample (I know normal dist isn't the correct model here, but it's close enough). -Paul G. 2013-11-01 | |
| 397 my $chrYName = "chrY"; | |
| 398 if(not grep {$_ eq "chrY"} $sam->seq_ids){ | |
| 399 if(grep {$_ eq "Y"} $sam->seq_ids){ | |
| 400 $chrYName = "Y"; | |
| 401 } | |
| 402 else{ # not human-ish? | |
| 403 return 0; | |
| 404 } | |
| 405 } | |
| 406 my $low_count_region = 0; | |
| 407 #print STDERR "Checking sex by chrY data...\n" unless $quiet; | |
| 408 $sam->fetch("$chrYName:9200000-9300000", sub {$low_count_region++}); | |
| 409 my $high_count_region = 0; | |
| 410 $sam->fetch("$chrYName:13800000-13900000", sub {$high_count_region++}); | |
| 411 if($high_count_region){ | |
| 412 #print STDERR "$low_count_region/$high_count_region = ", $low_count_region/$high_count_region, "\n" unless $quiet; | |
| 413 } | |
| 414 else{ | |
| 415 #print STDERR "No relevant chrY data\n" unless $quiet; | |
| 416 } | |
| 417 return $high_count_region ? ($low_count_region/$high_count_region > 0.004334299 ? 1 : 0) : 0; | |
| 418 } |
