Mercurial > repos > jjohnson > crest

diff bam2html.pl @ 0:acc8d8bfeb9a
Uploaded
author: jjohnson
date: Wed, 08 Feb 2012 16:59:24 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bam2html.pl	Wed Feb 08 16:59:24 2012 -0500
@@ -0,0 +1,528 @@
+#!/usr/bin/perl -w
+use strict;
+use Carp;
+use Getopt::Long;
+use English;
+use Pod::Usage;
+use Data::Dumper;
+use Bio::Assembly::IO;
+use Bio::DB::Sam;
+use Bio::DB::Sam::Constants;
+use File::Spec;
+use File::Path;
+use File::Copy;
+use File::Basename;
+
+use constant HTML_COLOR => {
+	'MATCH'		=> '#000000', #black
+	'MISMATCH'	=> '#800000', #brown
+	'LOWQUAL'	=> '#C0C0C0', #grey
+	'INDEL'		=> '#FF8040', #Orange
+	'SCLIP'		=> '#6960EC', #Slate Blue2
+};
+my $lowqual_cutoff = 20;
+
+my ($bam_d, $bam_g,  $file);
+my $ref_genome;
+my $range;
+
+my ( $help, $man, $version, $usage );
+my $output;
+my $RNASeq;
+my $optionOK = GetOptions(
+	'd=s'			=> \$bam_d,
+	'g=s'			=> \$bam_g,
+	'f|i|file|input=s'	=> \$file,
+	'r|ref_genome=s'	=> \$ref_genome,
+	'range=s'		=> \$range,
+	'RNASeq'		=> \$RNASeq,
+	'o|output=s'	=> \$output,
+	'h|help|?'      => \$help,
+	'man'           => \$man,
+	'usage'         => \$usage,
+	'v|version'     => \$version,
+);
+
+pod2usage(-verbose=>2) if($man or $usage);
+pod2usage(1) if($help or $version );
+croak "you must specify at least one bam file" unless ($bam_d || $bam_g);
+warn "Missing reference genome, the view is not correct" unless($ref_genome);
+croak "You need either provide an input file or a range" unless ($file || $range);
+
+my @bams;
+push @bams, Bio::DB::Sam->new( -bam => $bam_d, -fasta => $ref_genome) if($bam_d);
+push @bams, Bio::DB::Sam->new( -bam => $bam_g, -fasta => $ref_genome) if($bam_g);
+open STDOUT, ">$output" if $output;
+
+if($range) {
+	print STDOUT bam2html($range, @bams);
+	close STDOUT if $output;
+	exit(0);
+}
+
+open my $FILE, "<$file" or croak "can't open $file:$OS_ERROR";
+while( my $line = <$FILE>) {
+	chomp $line;
+	my @fields = split /\t/, $line;
+	print "<h3 style=\"color:blue;text-align:center\">$line</h3>\n";
+	if(scalar(@fields) == 3) { # deal with file: chr	start	end format
+		my ($chr, $s, $e) = @fields;
+		print STDOUT bam2html("$chr:$s-$e", @bams);
+		next;
+	}
+	# deal with CREST output file
+#	if($fields[0] =~ /^chr/) {
+#		$fields[0] = substr($fields[0], 3);
+#		$fields[2] = substr($fields[2], 3);
+#	}
+	my ($chr, $s, $e) = ($fields[0], $fields[1], $fields[1]);
+	print STDOUT bam2html("$chr:$s-$e", @bams);
+	($chr, $s, $e) = ($fields[4], $fields[5], $fields[5]);
+	print STDOUT bam2html("$chr:$s-$e", @bams);
+}
+close STDOUT if($output);
+exit(0);
+#print bam2html("12:11676858-11676858", $bam1, $bam2);
+
+
+sub get_input_bam {
+    my ($raw_bam_dir, $sample, $group) = @_;
+
+    $raw_bam_dir = File::Spec->catdir($raw_bam_dir, $group);
+    opendir(my $dh, $raw_bam_dir) or croak "can't open directory: $raw_bam_dir: $OS_ERROR";
+    my @files = grep { /^$sample-.*bam$/ } readdir($dh);
+	close $dh;
+	return $files[0];
+}
+
+
+# this function returns a <pre> </pre> block for the specific contig in the ace file
+sub bam2html {
+	my ($range, @bams) = @_;
+	my ($chr, $start, $end) = $range =~ m/^(.*?):(\d+)-(\d+)/;
+	if(!$chr or !$start or !$end) {
+		croak "The range format is not correct, use chr:start-end";
+	}
+	my ($r_start, $r_end) = ($start, $end);
+	my $name_len = length($range);
+	for my $bam (@bams) {
+		my $segment = $bam->segment(-seq_id => $chr, -start => $start, -end => $end);
+		my @alignments = $segment->features;
+	#	return "There are too many reads in this region, this is must be a highly repetitive region, abort!"
+	#		if(scalar(@alignments) > 500);
+		for my $a(@alignments) {
+			my $l = length($a->query->name);
+			$name_len = $l if($name_len < $l);
+		}
+	}
+	if($start == $end) { # we want to check a specific point
+		if($RNASeq) { # for RNASeq we just extend a little bit instead of dynamic do it
+			$r_start = $start - 100;
+			$r_end = $end + 100;
+		}
+		else {	
+			for my $bam (@bams) {
+				my $segment = $bam->segment(-seq_id => $chr, -start => $start - 1, -end => $end + 1);
+				my @alignments = $segment->features;
+				for my $a (@alignments) {
+					next if($a->start > $end || $a->end < $start);
+					$r_start = $a->start if $a->start < $r_start;
+					$r_end = $a->end if $a->end > $r_end;
+				}
+			}
+		}
+		$range = "$chr:$r_start-$r_end";
+	}
+	my $rtn_str = "<pre>\n";
+	my($ref, $pos2padded) = get_padded_ref($range, @bams);
+	$rtn_str .= print_bam_ruler($r_start, $r_end, $pos2padded, $name_len) . "\n";
+	my $line = print_ref($ref, $range, $name_len);
+	my $line_len = length($line);
+	$rtn_str .= $line . "\n" . ' ' x $line_len . "\n";
+	$ref =~ s/\*//g; #remove * from ref sequence
+	for my $bam (@bams) {
+		my $segment = $bam->segment(-seq_id => $chr, -start => $start, -end => $end);
+		my @alignments = $segment->features;
+		for my $a (@alignments) {
+			#next if($a->cigar_str !~ m/S/);
+			if($a->strand == 1) {
+				my $tmp_str = print_bam_seq($a, $r_start, $r_end, $ref, $pos2padded, $name_len);
+				$rtn_str .= $tmp_str . "\n" if($tmp_str); 
+			}
+		}
+		for my $a (@alignments) {
+			#next if($a->cigar_str !~ m/S/);
+			if($a->strand == -1) {
+				my $tmp_str = print_bam_seq($a, $r_start, $r_end, $ref, $pos2padded, $name_len);
+				$rtn_str .= $tmp_str . "\n" if($tmp_str);
+			}
+		}
+		$rtn_str .= '_' x $line_len . "\n";
+	}
+	$rtn_str .= "\n</pre>";
+	return $rtn_str;
+}
+
+# only print the unpadded position, which is meaningful
+sub print_bam_ruler {
+	my($start, $end, $pos2padded, $name_len) = @_;
+	my ($string, $mark);
+	$string = $mark = " "x($name_len + 2);
+	
+	#print | at 20 and . at 10 
+	for( my $i = $start; $i <= $end; $i++) {
+		$mark .= $i % 10 == 0 ? ($i % 20 == 0 ? "|" : ".") : " ";
+		last if($i == $end);
+		$mark .= ' ' x ($pos2padded->[$i - $start + 2] - $pos2padded->[$i - $start + 1] - 1);
+	}
+	# print numbers above | for padded consensus
+	my $padded_i = 0;
+	for( my $i = 1; $i <= $end - $start + 1; $i++ ) {
+		my $j = $i + $start - 1;
+		my $overhead = $pos2padded->[$i] - $padded_i;
+		if($overhead > 0 ) {
+			$string .= ' ' x $overhead;
+			$padded_i += $overhead;
+		}
+		if( $j % 20 == 0) {
+			my $l = length($j);
+			my $half_l = int(($l+1)/2);
+			$string = substr($string, 0, length($string) - $half_l );
+			$string .= $j;
+			$padded_i += ($l - $half_l);
+		}
+	}
+	return join("\n", ($string, $mark));
+}
+
+sub print_ref {
+	my( $ref, $chr, $name_len) = @_;
+	return $chr . " " x ($name_len + 2 - length($chr)) . $ref;
+}
+
+sub get_padded_ref {
+	my ($range, @bams) = @_;
+	my $ref_str; #padded ref genome
+	my @pos2padded;
+	my($chr, $start, $end) = $range =~ m/(.*?):(\d+)-(\d+)/;
+	
+	@pos2padded = 0 .. ($end - $start + 1);
+	push @pos2padded, 100 + $end - $start;
+	
+	foreach my $bam (@bams) {
+		my ($d, $cum_pad) = (0, 0);
+		my $padded_fun = sub {
+			my ($seqid, $pos, $p) = @_;
+			return if($pos < $start || $pos > $end);
+			$d++; 
+			$pos2padded[$d] += $cum_pad;
+			my $max_ins = 0;
+			for my $pileup (@$p) {
+				$max_ins = $pileup->indel if($max_ins < $pileup->indel);
+			}
+			if($max_ins > $pos2padded[$d+1] + $cum_pad - $pos2padded[$d] - 1) {
+				$cum_pad += $max_ins - ($pos2padded[$d+1] + $cum_pad-  $pos2padded[$d] - 1);
+			}
+		};
+		$bam->pileup($range, $padded_fun);
+		if($d <= $end - $start + 1) {
+			for(my $i = $d + 1; $i <= $end-$start+1; $i++) {
+				$pos2padded[$i] += $cum_pad;
+			}
+		}
+	}
+	my $refseq = $bams[0]->segment($chr, $start, $end)->dna;
+	my @str = split //, "*" x $pos2padded[$end - $start + 1];
+	for my $i ( 1 .. ($end - $start + 1)) {
+		$str[$pos2padded[$i]] = substr($refseq, $i-1, 1);
+	}
+	shift @str; #remove the 0th base
+	return (join( '', @str), \@pos2padded );
+}
+
+sub print_bam_seq { # it's complicated
+	my ($align, $start, $end, $ref, $pos2padded, $name_len) = @_;
+	my $rtn = $align->query->name;
+	$rtn .= ' ' x ($name_len - length($rtn));
+	$rtn .= $align->strand == -1 ? '- ' : '+ ';
+	my $p_s = $align->start;	
+	my $p_e = $align->end;
+	my $seq = $align->query->dna;
+	my @qual = $align->qscore;
+	my $repeat = 0;
+	if($align->has_tag("XT")) {
+		$repeat = 1 if($align->aux_get("XT") ne "U");
+	}
+
+	my $r_cigar = $align->cigar_array;
+	my ($s, $e) = ($align->query->start, $align->query->end);
+	
+	# reads partial in the region
+	my $leading = "";
+	if($p_s < $start) {
+		($r_cigar, $p_s, $s) = find_new_start($r_cigar, $p_s, $s, $start);
+	}
+
+	my $tailing = "";
+	if($p_e > $end) {
+		($r_cigar, $p_e, $e) = find_new_end($r_cigar, $p_e, $e, $end);
+	}
+	return if($s >= $e);	
+	my @cigar = @{$r_cigar};
+
+	# deal with leading softclip
+    my	$op = shift @cigar;
+	my $l =  $pos2padded->[$p_s - $start + 1] - $pos2padded->[1];
+	if($op->[0] eq 'S') {
+		my $ss = $op->[1] - $l;
+		if($ss < 0) {
+			$ss = -$ss;
+			$leading .= ' ' x $ss;
+			$ss = 0;
+		}
+		$leading .= '<font color="' . HTML_COLOR->{'SCLIP'} . '">';
+		for( my $i = $ss; $i < $op->[1]; $i++) {
+			my $c = substr($seq, $i, 1);
+			$c = lc $c if($qual[$i] < $lowqual_cutoff);
+			$leading .= $c;
+		}
+		$leading .= '</font>' ;
+	}
+	else {
+		$leading .= ' ' x $l;	
+		unshift @cigar, $op;
+	}
+
+	#dealing with tailing softclip
+	$op = pop @cigar;
+	$l = $pos2padded->[$end - $start + 1] - $pos2padded->[$p_e - $start + 1];
+	if($op->[0] eq 'S') {
+		my $ee = $op->[1] - $l;
+		$ee = 0 if($ee < 0 );
+		$tailing = '<font color="' . HTML_COLOR->{'SCLIP'} . '">';
+		for( my $i = 0; $i < $op->[1] - $ee; $i++){
+			my $c = substr($seq, $i + $e, 1);
+			$c = lc $c if($qual[$i + $e] < $lowqual_cutoff);
+			$tailing .= $c;
+		}
+		$tailing .= '</font>';
+	}
+	else{
+		push @cigar, $op;
+	}
+
+	# generate the alignment part, only M, I, D and N
+	my $mid = '';
+	my $mode;
+	foreach $op (@cigar) {
+		my $l = $op->[1];
+		if($op->[0] eq 'M') {
+			my $line = '';
+			while($l > 0 ){
+				$l--;
+				my $c = substr($seq, $s - 1, 1); #seq is 0 based
+				my $newmode = "MISMATCH";
+				my $cc = substr($ref, $p_s - $start, 1); 
+				#last unless ($c && $cc);
+				$newmode = "MATCH" if($cc eq $c);
+				if($qual[$s-1] < $lowqual_cutoff) {
+					$c = lc $c;
+					$newmode = 'LOWQUAL' if($newmode eq 'MATCH');
+				}
+				if(!$mode) {
+					$line .= '<font color="' . HTML_COLOR->{$newmode} . '">';
+				}
+				elsif($mode ne $newmode) {
+					$line .= '</font>' . '<font color="' . HTML_COLOR->{$newmode} . '">';
+				}
+				$mode = $newmode;
+				$line .= $c;
+				$s++; $p_s++;
+				# dealing with padded * in reference genome
+				if($p_s < $p_e && $s < $e && $l > 0) { 
+					my $tmp = $pos2padded->[$p_s - $start + 1]-$pos2padded->[$p_s-1 - $start + 1];
+					if( $tmp > 1) {	
+						$line .= '</font>';
+						$line .= '<font color="' . HTML_COLOR->{'INDEL'} . '">';
+						$line .= '*' x ($tmp - 1);
+						$mode = 'INDEL';
+					}
+				}
+			}
+			$mid .= $line;
+		}
+		if($op->[0] eq 'D' || $op->[0] eq 'I' || $op->[0] eq 'N') {
+			my $newmode = 'INDEL';
+			my $tmp; #extra padded * after indel
+			$mid .= '</font>' if($mode && $mode ne $newmode);
+			$mid .= '<font color="' . HTML_COLOR->{'INDEL'} . '">';
+
+			if($op->[0] eq 'D' || $op->[0] eq 'N') {
+				$mid .= ($op->[0] eq 'D' ? '*' : '=') x $l;
+				$p_s += $l;
+				$tmp = $pos2padded->[$p_s - $start + 1]-$pos2padded->[$p_s - $l - $start ] - $l if($p_s < $p_e);
+			}
+			else{
+				$tmp = $pos2padded->[$p_s - $start + 1] - $pos2padded->[$p_s - 1 - $start + 1] - $l if($p_s < $p_e);
+				while($l > 0 ) {
+					my $c = substr($seq, $s - 1, 1);
+					$c = lc $c if($qual[$s - 1] < $lowqual_cutoff);
+					$mid .= $c;
+					$l--; $s++;
+				}
+			}
+			$mode = $newmode;
+			if($p_s < $p_e && $tmp > 1) {	
+				$mid .= '*' x ($tmp - 1);
+			}
+		}
+	}
+	$mid .= '</font>';
+	$rtn .= $leading . $mid . $tailing;		
+	$rtn = '<b><i>' . $rtn . '</i></b>' if($repeat);
+	return $rtn;
+}
+
+sub find_new_start {
+	my ($r_cigar, $p_s, $s, $start) = @_;
+	my @cigar = @{$r_cigar};
+ 
+	while(1) {
+		my $op = shift @cigar;
+		next if( $op->[0] eq 'S' || $op->[0] eq 'H');
+		if( $op->[0] eq 'I') { 
+			$s += $op->[1];	next;
+		}
+		if( $p_s + $op->[1] < $start ) { 
+			$p_s += $op->[1];	
+			$s += $op->[1] if $op->[0] eq 'M'; 
+		}
+		else {
+			$s += ($start - $p_s) if $op->[0] eq 'M'; 
+			unshift @cigar, [$op->[0], $op->[1] - ($start - $p_s)];
+			$p_s = $start;
+			return (\@cigar, $p_s, $s);
+		}
+	}
+}
+
+sub find_new_end {
+	my ($r_cigar, $p_e, $e, $end) = @_;
+	my @cigar = @{$r_cigar};
+
+	while(1) {
+		my $op = pop @cigar;
+		next if( $op->[0] eq 'S' || $op->[0] eq 'H');
+		if( $op->[0] eq 'I') { 
+			$e -= $op->[1];	next;
+		}
+		if( $p_e - $op->[1] > $end ) { 
+			$p_e -= $op->[1];	
+			$e -= $op->[1] if $op->[0] eq 'M'; 
+		}
+		else {
+			$e -= ($p_e - $end) if $op->[0] eq 'M';
+			push @cigar, [$op->[0], $op->[1] - ($p_e - $end)];
+			$p_e = $end;
+			return (\@cigar, $p_e, $e);
+		}
+	}
+}
+
+=head1 NAME
+
+bam2html.pl - a bam file viewer that just simple display part of the alignment as HTML file.
+
+
+=head1 VERSION
+
+This documentation refers to bam2html.pl version 0.0.1.
+
+
+=head1 USAGE
+	
+	Display part of a bam file:
+	    bam2html.pl -r hg18.fa -d diag.bam --range 1:123566-123766 -o diag.html
+	Display part of two bam files, one diagnositc, one germlie for comparison.
+	    bam2html.pl -r hg18.fa -d diag.bam -g germline.bam --range 1:123566-123766 -o diag.html
+	Display part of two bam files, one diagnositc, one germlie for comparison from
+	a list of positions in a file, each line should be tab sepearted as: chr, start, and end.
+	    bam2html.pl -r hg18.fa -d diag.bam -g germline.bam -o diag.html -f position.txt
+	Display part of two bam files, one diagnositc, one germlie for comparison from a file
+	generated by CREST.pl.
+	    bam2html.pl -r hg18.fa -d diag.bam -g germline.bam -o predSV.html -f predSV.txt
+
+=head1 REQUIRED ARGUMENTS
+
+	To run the program, several parameter must specified.
+	-d:					The input (diagnositic) bam file
+	-g:					The input (germ line) bam file
+	-r, --ref_genome: 	The reference genome file in fa format
+
+=head1 OPTIONS
+
+	The	options that can be used for the program.
+	-o: 		The output file, default to STDOUT if missing.
+	--range:	The range where SV will be detected, using chr1:100-200 format.
+	-f, -i:		The input file from either CREST.pl or tab seperated chr, start, end.
+
+	-h, --help	 Help information
+	--man		 Man page.
+	--usage		 Usage information.
+	--version	 Software version.
+
+
+=head1 DESCRIPTION
+
+ This is a bam file viewer that just simple display part of the alignment as 
+ HTML file. The program is developed to view Structure Varitions around break 
+ point. So manual review will a breeze.  Any way this program can be used
+ otherway, but don't put a too big range to display as the view will not
+ be pretty as each read will occupy a line.
+
+
+=head1 DIAGNOSTICS
+
+If the program does not respond for minutes, please be a little bit patient as
+sometimes generating the html file could take longer time.
+If you provide a range as "chr1:50-100" and nothing was output, please make sure
+the reference genomes for mapping and display are exact the same and the chrom
+name is chr1 not 1.
+
+
+=head1 DEPENDENCIES
+
+The program depend on several packages:
+1. Bioperl perl module.
+2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed.
+
+
+=head1 BUGS AND LIMITATIONS
+
+There are no known bugs in this module, but the method is limitted to bam file 
+that has soft-clipping cigar string generated.Please report problems to 
+Jianmin Wang  (Jianmin.Wang@stjude.org)
+Patches are welcome.
+
+=head1 AUTHOR
+
+Jianmin Wang (Jianmin.Wang@stjude.org)
+
+
+
+=head1 LICENCE AND COPYRIGHT
+
+Copyright (c) 2010 by St. Jude Children's Research Hospital.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 2 of the License, or
+(at your option) any later version. 
+
+This program is distributed in the hope that it will be useful, 
+but WITHOUT ANY WARRANTY; without even the implied warranty of 
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
author	jjohnson
date	Wed, 08 Feb 2012 16:59:24 -0500
parents
children