| 13 | 1 #!/usr/bin/perl -w | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use warnings; | 
|  | 5 use Getopt::Std; | 
|  | 6 my $version = '0.5_galaxy'; | 
|  | 7 | 
|  | 8 my $SAMTOOLS_BIN_DIR="/home/cbl/Downloads/samtools-1.9/"; | 
|  | 9 | 
|  | 10 my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); | 
|  | 11 | 
|  | 12 getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY | 
|  | 13 | 
|  | 14 my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; | 
|  | 15 | 
|  | 16 my $pt_bad_mates_file=$opts{b};  #GALAXY | 
|  | 17 my $pt_log_file=$opts{l}; #GALAXY | 
|  | 18 my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY | 
|  | 19 | 
|  | 20 | 
|  | 21 die(qq/ | 
|  | 22 | 
|  | 23 Description: | 
|  | 24 | 
|  | 25     Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input | 
|  | 26     for SVDetect. | 
|  | 27 | 
|  | 28     From all pairs mapped onto the reference genome, this script outputs abnormal pairs: | 
|  | 29         - mapped on two different chromosomes | 
|  | 30         - with an incorrect strand orientation and\/or pair order | 
|  | 31         - with an insert size distance +- sigma threshold | 
|  | 32     into a file <prefix.ab.bam\/sam> | 
|  | 33 | 
|  | 34     -BAM\/SAM File input format only, sorted by read names | 
|  | 35 | 
|  | 36     Version : $version | 
|  | 37     SAMtools required for BAM files | 
|  | 38 | 
|  | 39 | 
|  | 40 Usage:   BAM_preprocessingPairs.pl [options] <all_mate_file.sorted.bam\/sam> | 
|  | 41 | 
|  | 42 Options: -t BOOLEAN   read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] | 
|  | 43          -p BOOLEAN   pair type: =1 (paired-end), =0 (mate-pair)  [$opts{p}] | 
|  | 44          -n INTEGER   number of pairs for calculating mu and sigma lengths [$opts{n}] | 
|  | 45 	 -s INTEGER   minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] | 
|  | 46 	 -S INTEGER   maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] | 
|  | 47          -f REAL      minimal number of sigma fold for filtering pairs [$opts{f}] | 
|  | 48          -d           dump normal pairs into a file [<prefix.norm.bam\/sam>] (optional) | 
|  | 49 	 -o STRING    output directory [$working_dir] | 
|  | 50 | 
|  | 51 \n/) if (@ARGV == 0 && -t STDIN); | 
|  | 52 | 
|  | 53 unless (-d $opts{o}){ | 
|  | 54 	mkdir $opts{o} or die; | 
|  | 55 } | 
|  | 56 $opts{o}.="/" if($opts{o}!~/\/$/); | 
|  | 57 | 
|  | 58 my $mates_file=shift(@ARGV); | 
|  | 59 | 
|  | 60 $mates_file=readlink($mates_file); | 
|  | 61 | 
|  | 62 my $bad_mates_file=(split(/\//,$mates_file))[$#_]; | 
|  | 63 | 
|  | 64 if($bad_mates_file=~/.(s|b)am$/){ | 
|  | 65     $bad_mates_file=~s/.(b|s)am$/.ab.sam/; | 
|  | 66     $bad_mates_file=$opts{o}.$bad_mates_file; | 
|  | 67 } | 
|  | 68 | 
|  | 69 else{ | 
|  | 70     die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; | 
|  | 71 } | 
|  | 72 | 
|  | 73 my $good_mates_file; | 
|  | 74 if($opts{d}){ | 
|  | 75     $good_mates_file=(split(/\//,$mates_file))[$#_]; | 
|  | 76     $good_mates_file=~s/.(b|s)am$/.norm.sam/; | 
|  | 77     $good_mates_file=$opts{o}.$good_mates_file; | 
|  | 78 } | 
|  | 79 | 
|  | 80 my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY | 
|  | 81 | 
|  | 82 #------------------------------------------------------------------------------# | 
|  | 83 #Calculate mu and sigma | 
|  | 84 | 
|  | 85 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; | 
|  | 86 | 
|  | 87 print LOG "\# Calculating mu and sigma lengths...\n"; | 
|  | 88 print LOG "-- file=$mates_file\n"; | 
|  | 89 print LOG "-- n=$opts{n}\n"; | 
|  | 90 print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; | 
|  | 91 | 
|  | 92 my ($record, $sumX,$sumX2) = (0,0,0); | 
|  | 93 my $warn=$opts{n}/10; | 
|  | 94 my $prev_pair="FIRST"; | 
|  | 95 | 
|  | 96 my $bam=($mates_file =~ /.bam$/)? 1:0; | 
|  | 97 | 
|  | 98 if($bam){ | 
|  | 99     open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | 
|  | 100 }else{ | 
|  | 101     open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; | 
|  | 102 } | 
|  | 103 | 
|  | 104 while(<MATES>){ | 
|  | 105 | 
|  | 106     my @t=split; | 
|  | 107 | 
|  | 108     next if ($t[0]=~/^@/); | 
|  | 109 | 
|  | 110     my $current_pair=$t[0]; | 
|  | 111     next if($current_pair eq $prev_pair); | 
|  | 112     $prev_pair=$current_pair; | 
|  | 113 | 
|  | 114     my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); | 
|  | 115 | 
|  | 116     next if (($t[1]&0x0004) || ($t[1]&0x0008)); | 
|  | 117     next if ($length<$opts{s} || $length>$opts{S}) ; | 
|  | 118 | 
|  | 119     if($chr2 eq "="){ | 
|  | 120 | 
|  | 121         $sumX += $length;							#add to sum and sum^2 for mean and variance calculation | 
|  | 122 	$sumX2 += $length*$length; | 
|  | 123         $record++; | 
|  | 124     } | 
|  | 125 | 
|  | 126     if($record>$warn){ | 
|  | 127 	print LOG "-- $warn pairs analysed\n"; | 
|  | 128         $warn+=$warn; | 
|  | 129     } | 
|  | 130 | 
|  | 131     last if ($record>$opts{n}); | 
|  | 132 | 
|  | 133 } | 
|  | 134 close (MATES); | 
|  | 135 | 
|  | 136 $record--; | 
|  | 137 my $mu = $sumX/$record; | 
|  | 138 my $sigma = sqrt($sumX2/$record - $mu*$mu); | 
|  | 139 | 
|  | 140 print LOG "-- Total : $record pairs analysed\n"; | 
|  | 141 print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; | 
|  | 142 | 
|  | 143 #------------------------------------------------------------------------------# | 
|  | 144 #------------------------------------------------------------------------------# | 
|  | 145 #Preprocessing pairs | 
|  | 146 | 
|  | 147 $warn=100000; | 
|  | 148 | 
|  | 149 $record=0; | 
|  | 150 my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); | 
|  | 151 | 
|  | 152 my $read_type=($opts{t})? "Illumina":"SOLiD"; | 
|  | 153 my $pair_type=($opts{p})? "paired-end":"mate-paired"; | 
|  | 154 | 
|  | 155 print LOG "\# Preprocessing pairs...\n"; | 
|  | 156 print LOG "-- file= $mates_file\n"; | 
|  | 157 print LOG "-- type= $read_type $pair_type reads\n"; | 
|  | 158 print LOG "-- sigma threshold= $opts{f}\n"; | 
|  | 159 print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; | 
|  | 160 | 
|  | 161 my @header; | 
|  | 162 | 
|  | 163 if($bam){ | 
|  | 164     open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | 
|  | 165     @header=<HEADER>; | 
|  | 166     close HEADER; | 
|  | 167     open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | 
|  | 168 }else{ | 
|  | 169     open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; | 
|  | 170 } | 
|  | 171 | 
|  | 172 open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; | 
|  | 173 print AB @header if($bam); | 
|  | 174 | 
|  | 175 if($opts{d}){ | 
|  | 176     open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; | 
|  | 177     print NORM @header if($bam); | 
|  | 178 } | 
|  | 179 | 
|  | 180 $prev_pair="FIRST"; | 
|  | 181 my $prev_bad; | 
|  | 182 | 
|  | 183 while(<MATES>){ | 
|  | 184 | 
|  | 185     my @t=split; | 
|  | 186     my $bad=0; | 
|  | 187 | 
|  | 188     if ($t[0]=~/^@/){ | 
|  | 189         print AB; | 
|  | 190         print NORM if ($opts{d}); | 
|  | 191         next; | 
|  | 192     } | 
|  | 193 | 
|  | 194     my $current_pair=$t[0]; | 
|  | 195     if($current_pair eq $prev_pair){ | 
|  | 196         next if($prev_bad==-1); | 
|  | 197         if($prev_bad){ | 
|  | 198             print AB; | 
|  | 199         }elsif(!$prev_bad){ | 
|  | 200             print NORM if($opts{d}); | 
|  | 201         } | 
|  | 202         next; | 
|  | 203     } | 
|  | 204 | 
|  | 205     $prev_pair=$current_pair; | 
|  | 206 | 
|  | 207     my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); | 
|  | 208 | 
|  | 209     if (($t[1]&0x0004) || ($t[1]&0x0008)){ | 
|  | 210         $prev_bad=-1; | 
|  | 211         $count{unmap}++; | 
|  | 212         $record++; | 
|  | 213         next; | 
|  | 214 | 
|  | 215     } | 
|  | 216 | 
|  | 217     my $strand1 = (($t[1]&0x0010))? 'R':'F'; | 
|  | 218     my $strand2 = (($t[1]&0x0020))? 'R':'F'; | 
|  | 219     my $order1  = (($t[1]&0x0040))? '1':'2'; | 
|  | 220     my $order2  = (($t[1]&0x0080))? '1':'2'; | 
|  | 221 | 
|  | 222     if($order1 == 2){ | 
|  | 223         ($strand1,$strand2)=($strand2,$strand1); | 
|  | 224         ($chr1,$chr2)=($chr2,$chr1); | 
|  | 225         ($pos1,$pos2)=($pos2,$pos1); | 
|  | 226         ($order1,$order2)=($order2,$order1); | 
|  | 227     } | 
|  | 228 | 
|  | 229     my $sense=$strand1.$strand2; | 
|  | 230 | 
|  | 231     if($chr1 ne "=" && $chr2 ne "="){ | 
|  | 232         $bad=1; | 
|  | 233         $count{chr}++; | 
|  | 234     } | 
|  | 235 | 
|  | 236     if($opts{p}){ #paired-end | 
|  | 237         if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ | 
|  | 238             $bad=1; | 
|  | 239             $count{sense}++; | 
|  | 240         } | 
|  | 241     }else{ #mate-pair | 
|  | 242         if($opts{t}){ #Illumina | 
|  | 243             if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ | 
|  | 244             $bad=1; | 
|  | 245             $count{sense}++; | 
|  | 246             } | 
|  | 247         }else{ #SOLiD | 
|  | 248             if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ | 
|  | 249             $bad=1; | 
|  | 250             $count{sense}++; | 
|  | 251             } | 
|  | 252         } | 
|  | 253     } | 
|  | 254 | 
|  | 255     if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ | 
|  | 256         $bad=1; | 
|  | 257         $count{dist}++; | 
|  | 258     } | 
|  | 259 | 
|  | 260     if($bad){ | 
|  | 261         print AB; | 
|  | 262         $count{ab}++; | 
|  | 263         $prev_bad=$bad; | 
|  | 264     }else{ | 
|  | 265         print NORM if ($opts{d}); | 
|  | 266         $count{norm}++; | 
|  | 267         $prev_bad=$bad; | 
|  | 268     } | 
|  | 269 | 
|  | 270     $record++; | 
|  | 271 | 
|  | 272     if($record>$warn){ | 
|  | 273         print LOG "-- $warn pairs analysed\n"; | 
|  | 274         $warn+=100000; | 
|  | 275     } | 
|  | 276 } | 
|  | 277 | 
|  | 278 close AB; | 
|  | 279 close NORM if($opts{d}); | 
|  | 280 | 
|  | 281 print LOG "-- Total : $record pairs analysed\n"; | 
|  | 282 print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; | 
|  | 283 print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; | 
|  | 284 print LOG "---- $count{ab} abnormal mapped pairs\n"; | 
|  | 285 print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; | 
|  | 286 print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; | 
|  | 287 print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; | 
|  | 288 print LOG "--- $count{norm} correct mapped pairs\n"; | 
|  | 289 | 
|  | 290 #------------------------------------------------------------------------------# | 
|  | 291 #------------------------------------------------------------------------------# | 
|  | 292 #OUTPUT | 
|  | 293 | 
|  | 294 if($bam){ | 
|  | 295 | 
|  | 296     my $bam_file=$bad_mates_file; | 
|  | 297     $bam_file=~s/.sam$/.bam/; | 
|  | 298     print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; | 
|  | 299     system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); | 
|  | 300     unlink($bad_mates_file); | 
|  | 301     print LOG "-- output created: $bam_file\n"; | 
|  | 302 | 
|  | 303     system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY | 
|  | 304 | 
|  | 305     if($opts{d}){ | 
|  | 306         $bam_file=$good_mates_file; | 
|  | 307         $bam_file=~s/.sam$/.bam/; | 
|  | 308         print LOG "\# Converting sam to bam for correct mapped pairs\n"; | 
|  | 309         system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); | 
|  | 310         unlink($good_mates_file); | 
|  | 311         print LOG "-- output created: $bam_file\n"; | 
|  | 312 | 
|  | 313 	system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY | 
|  | 314 | 
|  | 315     } | 
|  | 316 | 
|  | 317 } | 
|  | 318 | 
|  | 319 else{ | 
|  | 320     print LOG "-- output created: $bad_mates_file\n"; | 
|  | 321     print LOG "-- output created: $good_mates_file\n" if($opts{d}); | 
|  | 322 } | 
|  | 323 | 
|  | 324 close LOG; | 
|  | 325 | 
|  | 326 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY | 
|  | 327 | 
|  | 328 | 
|  | 329 #------------------------------------------------------------------------------# | 
|  | 330 #------------------------------------------------------------------------------# | 
|  | 331 sub decimal{ | 
|  | 332 | 
|  | 333   my $num=shift; | 
|  | 334   my $digs_to_cut=shift; | 
|  | 335 | 
|  | 336   $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); | 
|  | 337 | 
|  | 338   return $num; | 
|  | 339 } | 
|  | 340 #------------------------------------------------------------------------------# |