Previous changeset 2:89b207100214 (2012-06-11) Next changeset 4:f7a84d31bd83 (2012-06-11) |
Commit message:
Uploaded |
added:
BAM_preprocessingPairs.pl |
b |
diff -r 89b207100214 -r 861783bb65d2 BAM_preprocessingPairs.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BAM_preprocessingPairs.pl Mon Jun 11 12:30:18 2012 -0400 |
[ |
b'@@ -0,0 +1,340 @@\n+#!/usr/bin/perl -w\n+\n+use strict;\n+use warnings;\n+use Getopt::Std;\n+my $version = \'0.4b_galaxy\';\n+\n+my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools";\n+\n+my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." );\n+\n+getopts(\'dt:p:n:f:s:S:o:b:l:x:N:\', \\%opts); #GALAXY \n+\n+my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory";\n+\n+my $pt_bad_mates_file=$opts{b}; #GALAXY \n+my $pt_log_file=$opts{l}; #GALAXY \n+my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY \n+\n+\n+die(qq/\n+ \n+Description:\n+ \n+ Preprocessing of mates to get anomalously mapped mate-pair\\/paired-end reads as input\n+ for SVDetect.\n+\n+ From all pairs mapped onto the reference genome, this script outputs abnormal pairs:\n+ - mapped on two different chromosomes\n+ - with an incorrect strand orientation and\\/or pair order\n+ - with an insert size distance +- sigma threshold\n+ into a file <prefix.ab.bam\\/sam> sorted by read names\n+ \n+ -BAM\\/SAM File input format only.\n+\n+ Version : $version\n+ SAMtools required for BAM files\n+ \n+ \n+Usage: BAM_preprocessingPairs.pl [options] <all_mate_file.sorted.bam\\/sam>\n+\n+Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}]\n+ -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}]\n+ -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}]\n+\t -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}]\n+\t -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}]\n+ -f REAL minimal number of sigma fold for filtering pairs [$opts{f}]\n+ -d dump normal pairs into a file [<prefix.norm.bam\\/sam>] (optional)\n+\t -o STRING output directory [$working_dir]\n+\n+\\n/) if (@ARGV == 0 && -t STDIN);\n+\n+unless (-d $opts{o}){\n+\tmkdir $opts{o} or die;\n+}\n+$opts{o}.="/" if($opts{o}!~/\\/$/);\n+\n+my $mates_file=shift(@ARGV);\n+\n+$mates_file=readlink($mates_file);\n+\n+my $bad_mates_file=(split(/\\//,$mates_file))[$#_];\n+\n+if($bad_mates_file=~/.(s|b)am$/){\n+ $bad_mates_file=~s/.(b|s)am$/.ab.sam/;\n+ $bad_mates_file=$opts{o}.$bad_mates_file;\n+}\n+\n+else{\n+ die "Error: mate_file with the extension <.bam> or <.sam> needed !\\n";\n+}\n+\n+my $good_mates_file;\n+if($opts{d}){\n+ $good_mates_file=(split(/\\//,$mates_file))[$#_];\n+ $good_mates_file=~s/.(b|s)am$/.norm.sam/;\n+ $good_mates_file=$opts{o}.$good_mates_file;\n+}\n+\n+my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY \n+\n+#------------------------------------------------------------------------------#\n+#Calculate mu and sigma\n+\n+open LOG,">$log_file" or die "$0: can\'t open ".$log_file.":$!\\n";\n+\n+print LOG "\\# Calculating mu and sigma lengths...\\n";\n+print LOG "-- file=$mates_file\\n";\n+print LOG "-- n=$opts{n}\\n";\n+print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\\n";\n+\n+my ($record, $sumX,$sumX2) = (0,0,0);\n+my $warn=$opts{n}/10;\n+my $prev_pair="FIRST";\n+\n+my $bam=($mates_file =~ /.bam$/)? 1:0;\n+\n+if($bam){\n+ open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can\'t open ".$mates_file.":$!\\n";\n+}else{\n+ open MATES, "<".$mates_file or die "$0: can\'t open ".$mates_file.":$!\\n";\n+}\n+\n+while(<MATES>){\n+ \n+ my @t=split;\n+ \n+ next if ($t[0]=~/^@/);\n+ \n+ my $current_pair=$t[0];\n+ next if($current_pair eq $prev_pair);\n+ $prev_pair=$current_pair; \n+ \n+ my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8]));\n+ \n+ next if ($chr1 eq "*" || $chr2 eq "*");\n+ next if ($length<$opts{s} || $length>$opts{S}) ;\n+ \n+ if($chr2 eq "="){\n+\n+ $sumX += $length;\t\t\t\t\t\t\t#add to sum and sum^2 for mean and variance calculation\n+\t$sumX2 += $length*$length;\n+ $record++;\n+ }\n+\n+ if($record>$warn){\n+\tprint LOG "-- $warn pairs analysed\\n";\n+ $warn+=$warn;\n+ }\n+ \n+ last if ($record>$opts{n});\n+ \n+}\n+close (MATES);\n+\n+$record--;\n+my $m'..b'ad=-1;\n+ $count{unmap}++;\n+ $record++;\n+ next;\n+ \n+ }\n+ \n+ my $strand1 = (($t[1]&0x0010))? \'R\':\'F\';\n+ my $strand2 = (($t[1]&0x0020))? \'R\':\'F\';\n+ my $order1 = (($t[1]&0x0040))? \'1\':\'2\';\n+ my $order2 = (($t[1]&0x0080))? \'1\':\'2\';\n+ \n+ if($order1 == 2){\n+ ($strand1,$strand2)=($strand2,$strand1);\n+ ($chr1,$chr2)=($chr2,$chr1);\n+ ($pos1,$pos2)=($pos2,$pos1);\n+ ($order1,$order2)=($order2,$order1);\n+ }\n+ \n+ my $sense=$strand1.$strand2;\n+ \n+ if($chr1 ne "=" && $chr2 ne "="){\n+ $bad=1;\n+ $count{chr}++;\n+ }\n+ \n+ if($opts{p}){ #paired-end\n+ if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){\n+ $bad=1;\n+ $count{sense}++;\n+ }\n+ }else{ #mate-pair\n+ if($opts{t}){ #Illumina\n+ if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){\n+ $bad=1;\n+ $count{sense}++;\n+ }\n+ }else{ #SOLiD\n+ if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){\n+ $bad=1;\n+ $count{sense}++;\n+ }\n+ }\n+ }\n+ \n+ if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){\n+ $bad=1;\n+ $count{dist}++;\n+ }\n+ \n+ if($bad){\n+ print AB;\n+ $count{ab}++;\n+ $prev_bad=$bad;\n+ }else{\n+ print NORM if ($opts{d});\n+ $count{norm}++;\n+ $prev_bad=$bad;\n+ }\n+ \n+ $record++;\n+ \n+ if($record>$warn){\n+ print LOG "-- $warn pairs analysed\\n";\n+ $warn+=100000;\n+ }\n+}\n+\n+close AB;\n+close NORM if($opts{d});\n+\n+print LOG "-- Total : $record pairs analysed\\n";\n+print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\\n";\n+print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\\n";\n+print LOG "---- $count{ab} abnormal mapped pairs\\n";\n+print LOG "------ $count{chr} pairs mapped on two different chromosomes\\n";\n+print LOG "------ $count{sense} pairs with incorrect strand orientation and\\/or pair order\\n";\n+print LOG "------ $count{dist} pairs with incorrect insert size distance\\n";\n+print LOG "--- $count{norm} correct mapped pairs\\n";\n+\n+#------------------------------------------------------------------------------#\n+#------------------------------------------------------------------------------#\n+#OUTPUT\n+\n+if($bam){\n+ \n+ my $bam_file=$bad_mates_file;\n+ $bam_file=~s/.sam$/.bam/;\n+ print LOG "\\# Converting sam to bam for abnormal mapped pairs\\n";\n+ system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log");\n+ unlink($bad_mates_file);\n+ print LOG "-- output created: $bam_file\\n";\n+\n+ system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY\n+ \n+ if($opts{d}){\n+ $bam_file=$good_mates_file;\n+ $bam_file=~s/.sam$/.bam/;\n+ print LOG "\\# Converting sam to bam for correct mapped pairs\\n";\n+ system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log");\n+ unlink($good_mates_file);\n+ print LOG "-- output created: $bam_file\\n";\n+\n+\tsystem "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY\n+\n+ }\n+\n+}\n+\n+else{\n+ print LOG "-- output created: $bad_mates_file\\n";\n+ print LOG "-- output created: $good_mates_file\\n" if($opts{d});\n+}\n+\n+close LOG;\n+\n+system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY\n+\n+\n+#------------------------------------------------------------------------------#\n+#------------------------------------------------------------------------------#\n+sub decimal{\n+ \n+ my $num=shift;\n+ my $digs_to_cut=shift;\n+\n+ $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\\d+\\.(\\d){$digs_to_cut,}/);\n+\n+ return $num;\n+}\n+#------------------------------------------------------------------------------#\n' |