comparison bam2html.pl @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc8d8bfeb9a
1 #!/usr/bin/perl -w
2 use strict;
3 use Carp;
4 use Getopt::Long;
5 use English;
6 use Pod::Usage;
7 use Data::Dumper;
8 use Bio::Assembly::IO;
9 use Bio::DB::Sam;
10 use Bio::DB::Sam::Constants;
11 use File::Spec;
12 use File::Path;
13 use File::Copy;
14 use File::Basename;
15
16 use constant HTML_COLOR => {
17 'MATCH' => '#000000', #black
18 'MISMATCH' => '#800000', #brown
19 'LOWQUAL' => '#C0C0C0', #grey
20 'INDEL' => '#FF8040', #Orange
21 'SCLIP' => '#6960EC', #Slate Blue2
22 };
23 my $lowqual_cutoff = 20;
24
25 my ($bam_d, $bam_g, $file);
26 my $ref_genome;
27 my $range;
28
29 my ( $help, $man, $version, $usage );
30 my $output;
31 my $RNASeq;
32 my $optionOK = GetOptions(
33 'd=s' => \$bam_d,
34 'g=s' => \$bam_g,
35 'f|i|file|input=s' => \$file,
36 'r|ref_genome=s' => \$ref_genome,
37 'range=s' => \$range,
38 'RNASeq' => \$RNASeq,
39 'o|output=s' => \$output,
40 'h|help|?' => \$help,
41 'man' => \$man,
42 'usage' => \$usage,
43 'v|version' => \$version,
44 );
45
46 pod2usage(-verbose=>2) if($man or $usage);
47 pod2usage(1) if($help or $version );
48 croak "you must specify at least one bam file" unless ($bam_d || $bam_g);
49 warn "Missing reference genome, the view is not correct" unless($ref_genome);
50 croak "You need either provide an input file or a range" unless ($file || $range);
51
52 my @bams;
53 push @bams, Bio::DB::Sam->new( -bam => $bam_d, -fasta => $ref_genome) if($bam_d);
54 push @bams, Bio::DB::Sam->new( -bam => $bam_g, -fasta => $ref_genome) if($bam_g);
55 open STDOUT, ">$output" if $output;
56
57 if($range) {
58 print STDOUT bam2html($range, @bams);
59 close STDOUT if $output;
60 exit(0);
61 }
62
63 open my $FILE, "<$file" or croak "can't open $file:$OS_ERROR";
64 while( my $line = <$FILE>) {
65 chomp $line;
66 my @fields = split /\t/, $line;
67 print "<h3 style=\"color:blue;text-align:center\">$line</h3>\n";
68 if(scalar(@fields) == 3) { # deal with file: chr start end format
69 my ($chr, $s, $e) = @fields;
70 print STDOUT bam2html("$chr:$s-$e", @bams);
71 next;
72 }
73 # deal with CREST output file
74 # if($fields[0] =~ /^chr/) {
75 # $fields[0] = substr($fields[0], 3);
76 # $fields[2] = substr($fields[2], 3);
77 # }
78 my ($chr, $s, $e) = ($fields[0], $fields[1], $fields[1]);
79 print STDOUT bam2html("$chr:$s-$e", @bams);
80 ($chr, $s, $e) = ($fields[4], $fields[5], $fields[5]);
81 print STDOUT bam2html("$chr:$s-$e", @bams);
82 }
83 close STDOUT if($output);
84 exit(0);
85 #print bam2html("12:11676858-11676858", $bam1, $bam2);
86
87
88 sub get_input_bam {
89 my ($raw_bam_dir, $sample, $group) = @_;
90
91 $raw_bam_dir = File::Spec->catdir($raw_bam_dir, $group);
92 opendir(my $dh, $raw_bam_dir) or croak "can't open directory: $raw_bam_dir: $OS_ERROR";
93 my @files = grep { /^$sample-.*bam$/ } readdir($dh);
94 close $dh;
95 return $files[0];
96 }
97
98
99 # this function returns a <pre> </pre> block for the specific contig in the ace file
100 sub bam2html {
101 my ($range, @bams) = @_;
102 my ($chr, $start, $end) = $range =~ m/^(.*?):(\d+)-(\d+)/;
103 if(!$chr or !$start or !$end) {
104 croak "The range format is not correct, use chr:start-end";
105 }
106 my ($r_start, $r_end) = ($start, $end);
107 my $name_len = length($range);
108 for my $bam (@bams) {
109 my $segment = $bam->segment(-seq_id => $chr, -start => $start, -end => $end);
110 my @alignments = $segment->features;
111 # return "There are too many reads in this region, this is must be a highly repetitive region, abort!"
112 # if(scalar(@alignments) > 500);
113 for my $a(@alignments) {
114 my $l = length($a->query->name);
115 $name_len = $l if($name_len < $l);
116 }
117 }
118 if($start == $end) { # we want to check a specific point
119 if($RNASeq) { # for RNASeq we just extend a little bit instead of dynamic do it
120 $r_start = $start - 100;
121 $r_end = $end + 100;
122 }
123 else {
124 for my $bam (@bams) {
125 my $segment = $bam->segment(-seq_id => $chr, -start => $start - 1, -end => $end + 1);
126 my @alignments = $segment->features;
127 for my $a (@alignments) {
128 next if($a->start > $end || $a->end < $start);
129 $r_start = $a->start if $a->start < $r_start;
130 $r_end = $a->end if $a->end > $r_end;
131 }
132 }
133 }
134 $range = "$chr:$r_start-$r_end";
135 }
136 my $rtn_str = "<pre>\n";
137 my($ref, $pos2padded) = get_padded_ref($range, @bams);
138 $rtn_str .= print_bam_ruler($r_start, $r_end, $pos2padded, $name_len) . "\n";
139 my $line = print_ref($ref, $range, $name_len);
140 my $line_len = length($line);
141 $rtn_str .= $line . "\n" . ' ' x $line_len . "\n";
142 $ref =~ s/\*//g; #remove * from ref sequence
143 for my $bam (@bams) {
144 my $segment = $bam->segment(-seq_id => $chr, -start => $start, -end => $end);
145 my @alignments = $segment->features;
146 for my $a (@alignments) {
147 #next if($a->cigar_str !~ m/S/);
148 if($a->strand == 1) {
149 my $tmp_str = print_bam_seq($a, $r_start, $r_end, $ref, $pos2padded, $name_len);
150 $rtn_str .= $tmp_str . "\n" if($tmp_str);
151 }
152 }
153 for my $a (@alignments) {
154 #next if($a->cigar_str !~ m/S/);
155 if($a->strand == -1) {
156 my $tmp_str = print_bam_seq($a, $r_start, $r_end, $ref, $pos2padded, $name_len);
157 $rtn_str .= $tmp_str . "\n" if($tmp_str);
158 }
159 }
160 $rtn_str .= '_' x $line_len . "\n";
161 }
162 $rtn_str .= "\n</pre>";
163 return $rtn_str;
164 }
165
166 # only print the unpadded position, which is meaningful
167 sub print_bam_ruler {
168 my($start, $end, $pos2padded, $name_len) = @_;
169 my ($string, $mark);
170 $string = $mark = " "x($name_len + 2);
171
172 #print | at 20 and . at 10
173 for( my $i = $start; $i <= $end; $i++) {
174 $mark .= $i % 10 == 0 ? ($i % 20 == 0 ? "|" : ".") : " ";
175 last if($i == $end);
176 $mark .= ' ' x ($pos2padded->[$i - $start + 2] - $pos2padded->[$i - $start + 1] - 1);
177 }
178 # print numbers above | for padded consensus
179 my $padded_i = 0;
180 for( my $i = 1; $i <= $end - $start + 1; $i++ ) {
181 my $j = $i + $start - 1;
182 my $overhead = $pos2padded->[$i] - $padded_i;
183 if($overhead > 0 ) {
184 $string .= ' ' x $overhead;
185 $padded_i += $overhead;
186 }
187 if( $j % 20 == 0) {
188 my $l = length($j);
189 my $half_l = int(($l+1)/2);
190 $string = substr($string, 0, length($string) - $half_l );
191 $string .= $j;
192 $padded_i += ($l - $half_l);
193 }
194 }
195 return join("\n", ($string, $mark));
196 }
197
198 sub print_ref {
199 my( $ref, $chr, $name_len) = @_;
200 return $chr . " " x ($name_len + 2 - length($chr)) . $ref;
201 }
202
203 sub get_padded_ref {
204 my ($range, @bams) = @_;
205 my $ref_str; #padded ref genome
206 my @pos2padded;
207 my($chr, $start, $end) = $range =~ m/(.*?):(\d+)-(\d+)/;
208
209 @pos2padded = 0 .. ($end - $start + 1);
210 push @pos2padded, 100 + $end - $start;
211
212 foreach my $bam (@bams) {
213 my ($d, $cum_pad) = (0, 0);
214 my $padded_fun = sub {
215 my ($seqid, $pos, $p) = @_;
216 return if($pos < $start || $pos > $end);
217 $d++;
218 $pos2padded[$d] += $cum_pad;
219 my $max_ins = 0;
220 for my $pileup (@$p) {
221 $max_ins = $pileup->indel if($max_ins < $pileup->indel);
222 }
223 if($max_ins > $pos2padded[$d+1] + $cum_pad - $pos2padded[$d] - 1) {
224 $cum_pad += $max_ins - ($pos2padded[$d+1] + $cum_pad- $pos2padded[$d] - 1);
225 }
226 };
227 $bam->pileup($range, $padded_fun);
228 if($d <= $end - $start + 1) {
229 for(my $i = $d + 1; $i <= $end-$start+1; $i++) {
230 $pos2padded[$i] += $cum_pad;
231 }
232 }
233 }
234 my $refseq = $bams[0]->segment($chr, $start, $end)->dna;
235 my @str = split //, "*" x $pos2padded[$end - $start + 1];
236 for my $i ( 1 .. ($end - $start + 1)) {
237 $str[$pos2padded[$i]] = substr($refseq, $i-1, 1);
238 }
239 shift @str; #remove the 0th base
240 return (join( '', @str), \@pos2padded );
241 }
242
243 sub print_bam_seq { # it's complicated
244 my ($align, $start, $end, $ref, $pos2padded, $name_len) = @_;
245 my $rtn = $align->query->name;
246 $rtn .= ' ' x ($name_len - length($rtn));
247 $rtn .= $align->strand == -1 ? '- ' : '+ ';
248 my $p_s = $align->start;
249 my $p_e = $align->end;
250 my $seq = $align->query->dna;
251 my @qual = $align->qscore;
252 my $repeat = 0;
253 if($align->has_tag("XT")) {
254 $repeat = 1 if($align->aux_get("XT") ne "U");
255 }
256
257 my $r_cigar = $align->cigar_array;
258 my ($s, $e) = ($align->query->start, $align->query->end);
259
260 # reads partial in the region
261 my $leading = "";
262 if($p_s < $start) {
263 ($r_cigar, $p_s, $s) = find_new_start($r_cigar, $p_s, $s, $start);
264 }
265
266 my $tailing = "";
267 if($p_e > $end) {
268 ($r_cigar, $p_e, $e) = find_new_end($r_cigar, $p_e, $e, $end);
269 }
270 return if($s >= $e);
271 my @cigar = @{$r_cigar};
272
273 # deal with leading softclip
274 my $op = shift @cigar;
275 my $l = $pos2padded->[$p_s - $start + 1] - $pos2padded->[1];
276 if($op->[0] eq 'S') {
277 my $ss = $op->[1] - $l;
278 if($ss < 0) {
279 $ss = -$ss;
280 $leading .= ' ' x $ss;
281 $ss = 0;
282 }
283 $leading .= '<font color="' . HTML_COLOR->{'SCLIP'} . '">';
284 for( my $i = $ss; $i < $op->[1]; $i++) {
285 my $c = substr($seq, $i, 1);
286 $c = lc $c if($qual[$i] < $lowqual_cutoff);
287 $leading .= $c;
288 }
289 $leading .= '</font>' ;
290 }
291 else {
292 $leading .= ' ' x $l;
293 unshift @cigar, $op;
294 }
295
296 #dealing with tailing softclip
297 $op = pop @cigar;
298 $l = $pos2padded->[$end - $start + 1] - $pos2padded->[$p_e - $start + 1];
299 if($op->[0] eq 'S') {
300 my $ee = $op->[1] - $l;
301 $ee = 0 if($ee < 0 );
302 $tailing = '<font color="' . HTML_COLOR->{'SCLIP'} . '">';
303 for( my $i = 0; $i < $op->[1] - $ee; $i++){
304 my $c = substr($seq, $i + $e, 1);
305 $c = lc $c if($qual[$i + $e] < $lowqual_cutoff);
306 $tailing .= $c;
307 }
308 $tailing .= '</font>';
309 }
310 else{
311 push @cigar, $op;
312 }
313
314 # generate the alignment part, only M, I, D and N
315 my $mid = '';
316 my $mode;
317 foreach $op (@cigar) {
318 my $l = $op->[1];
319 if($op->[0] eq 'M') {
320 my $line = '';
321 while($l > 0 ){
322 $l--;
323 my $c = substr($seq, $s - 1, 1); #seq is 0 based
324 my $newmode = "MISMATCH";
325 my $cc = substr($ref, $p_s - $start, 1);
326 #last unless ($c && $cc);
327 $newmode = "MATCH" if($cc eq $c);
328 if($qual[$s-1] < $lowqual_cutoff) {
329 $c = lc $c;
330 $newmode = 'LOWQUAL' if($newmode eq 'MATCH');
331 }
332 if(!$mode) {
333 $line .= '<font color="' . HTML_COLOR->{$newmode} . '">';
334 }
335 elsif($mode ne $newmode) {
336 $line .= '</font>' . '<font color="' . HTML_COLOR->{$newmode} . '">';
337 }
338 $mode = $newmode;
339 $line .= $c;
340 $s++; $p_s++;
341 # dealing with padded * in reference genome
342 if($p_s < $p_e && $s < $e && $l > 0) {
343 my $tmp = $pos2padded->[$p_s - $start + 1]-$pos2padded->[$p_s-1 - $start + 1];
344 if( $tmp > 1) {
345 $line .= '</font>';
346 $line .= '<font color="' . HTML_COLOR->{'INDEL'} . '">';
347 $line .= '*' x ($tmp - 1);
348 $mode = 'INDEL';
349 }
350 }
351 }
352 $mid .= $line;
353 }
354 if($op->[0] eq 'D' || $op->[0] eq 'I' || $op->[0] eq 'N') {
355 my $newmode = 'INDEL';
356 my $tmp; #extra padded * after indel
357 $mid .= '</font>' if($mode && $mode ne $newmode);
358 $mid .= '<font color="' . HTML_COLOR->{'INDEL'} . '">';
359
360 if($op->[0] eq 'D' || $op->[0] eq 'N') {
361 $mid .= ($op->[0] eq 'D' ? '*' : '=') x $l;
362 $p_s += $l;
363 $tmp = $pos2padded->[$p_s - $start + 1]-$pos2padded->[$p_s - $l - $start ] - $l if($p_s < $p_e);
364 }
365 else{
366 $tmp = $pos2padded->[$p_s - $start + 1] - $pos2padded->[$p_s - 1 - $start + 1] - $l if($p_s < $p_e);
367 while($l > 0 ) {
368 my $c = substr($seq, $s - 1, 1);
369 $c = lc $c if($qual[$s - 1] < $lowqual_cutoff);
370 $mid .= $c;
371 $l--; $s++;
372 }
373 }
374 $mode = $newmode;
375 if($p_s < $p_e && $tmp > 1) {
376 $mid .= '*' x ($tmp - 1);
377 }
378 }
379 }
380 $mid .= '</font>';
381 $rtn .= $leading . $mid . $tailing;
382 $rtn = '<b><i>' . $rtn . '</i></b>' if($repeat);
383 return $rtn;
384 }
385
386 sub find_new_start {
387 my ($r_cigar, $p_s, $s, $start) = @_;
388 my @cigar = @{$r_cigar};
389
390 while(1) {
391 my $op = shift @cigar;
392 next if( $op->[0] eq 'S' || $op->[0] eq 'H');
393 if( $op->[0] eq 'I') {
394 $s += $op->[1]; next;
395 }
396 if( $p_s + $op->[1] < $start ) {
397 $p_s += $op->[1];
398 $s += $op->[1] if $op->[0] eq 'M';
399 }
400 else {
401 $s += ($start - $p_s) if $op->[0] eq 'M';
402 unshift @cigar, [$op->[0], $op->[1] - ($start - $p_s)];
403 $p_s = $start;
404 return (\@cigar, $p_s, $s);
405 }
406 }
407 }
408
409 sub find_new_end {
410 my ($r_cigar, $p_e, $e, $end) = @_;
411 my @cigar = @{$r_cigar};
412
413 while(1) {
414 my $op = pop @cigar;
415 next if( $op->[0] eq 'S' || $op->[0] eq 'H');
416 if( $op->[0] eq 'I') {
417 $e -= $op->[1]; next;
418 }
419 if( $p_e - $op->[1] > $end ) {
420 $p_e -= $op->[1];
421 $e -= $op->[1] if $op->[0] eq 'M';
422 }
423 else {
424 $e -= ($p_e - $end) if $op->[0] eq 'M';
425 push @cigar, [$op->[0], $op->[1] - ($p_e - $end)];
426 $p_e = $end;
427 return (\@cigar, $p_e, $e);
428 }
429 }
430 }
431
432 =head1 NAME
433
434 bam2html.pl - a bam file viewer that just simple display part of the alignment as HTML file.
435
436
437 =head1 VERSION
438
439 This documentation refers to bam2html.pl version 0.0.1.
440
441
442 =head1 USAGE
443
444 Display part of a bam file:
445 bam2html.pl -r hg18.fa -d diag.bam --range 1:123566-123766 -o diag.html
446 Display part of two bam files, one diagnositc, one germlie for comparison.
447 bam2html.pl -r hg18.fa -d diag.bam -g germline.bam --range 1:123566-123766 -o diag.html
448 Display part of two bam files, one diagnositc, one germlie for comparison from
449 a list of positions in a file, each line should be tab sepearted as: chr, start, and end.
450 bam2html.pl -r hg18.fa -d diag.bam -g germline.bam -o diag.html -f position.txt
451 Display part of two bam files, one diagnositc, one germlie for comparison from a file
452 generated by CREST.pl.
453 bam2html.pl -r hg18.fa -d diag.bam -g germline.bam -o predSV.html -f predSV.txt
454
455 =head1 REQUIRED ARGUMENTS
456
457 To run the program, several parameter must specified.
458 -d: The input (diagnositic) bam file
459 -g: The input (germ line) bam file
460 -r, --ref_genome: The reference genome file in fa format
461
462 =head1 OPTIONS
463
464 The options that can be used for the program.
465 -o: The output file, default to STDOUT if missing.
466 --range: The range where SV will be detected, using chr1:100-200 format.
467 -f, -i: The input file from either CREST.pl or tab seperated chr, start, end.
468
469 -h, --help Help information
470 --man Man page.
471 --usage Usage information.
472 --version Software version.
473
474
475 =head1 DESCRIPTION
476
477 This is a bam file viewer that just simple display part of the alignment as
478 HTML file. The program is developed to view Structure Varitions around break
479 point. So manual review will a breeze. Any way this program can be used
480 otherway, but don't put a too big range to display as the view will not
481 be pretty as each read will occupy a line.
482
483
484 =head1 DIAGNOSTICS
485
486 If the program does not respond for minutes, please be a little bit patient as
487 sometimes generating the html file could take longer time.
488 If you provide a range as "chr1:50-100" and nothing was output, please make sure
489 the reference genomes for mapping and display are exact the same and the chrom
490 name is chr1 not 1.
491
492
493 =head1 DEPENDENCIES
494
495 The program depend on several packages:
496 1. Bioperl perl module.
497 2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed.
498
499
500 =head1 BUGS AND LIMITATIONS
501
502 There are no known bugs in this module, but the method is limitted to bam file
503 that has soft-clipping cigar string generated.Please report problems to
504 Jianmin Wang (Jianmin.Wang@stjude.org)
505 Patches are welcome.
506
507 =head1 AUTHOR
508
509 Jianmin Wang (Jianmin.Wang@stjude.org)
510
511
512
513 =head1 LICENCE AND COPYRIGHT
514
515 Copyright (c) 2010 by St. Jude Children's Research Hospital.
516
517 This program is free software: you can redistribute it and/or modify
518 it under the terms of the GNU General Public License as published by
519 the Free Software Foundation, either version 2 of the License, or
520 (at your option) any later version.
521
522 This program is distributed in the hope that it will be useful,
523 but WITHOUT ANY WARRANTY; without even the implied warranty of
524 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
525 GNU General Public License for more details.
526
527 You should have received a copy of the GNU General Public License
528 along with this program. If not, see <http://www.gnu.org/licenses/>.