Mercurial > repos > scisjnu123 > test
comparison svdetect/BAM_preprocessingPairs.pl @ 4:abdb3ed0c740 draft
Uploaded
| author | scisjnu123 |
|---|---|
| date | Thu, 12 Sep 2019 07:41:16 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:b27d4a0b7673 | 4:abdb3ed0c740 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use Getopt::Std; | |
| 6 my $version = '0.5_galaxy'; | |
| 7 | |
| 8 my $SAMTOOLS_BIN_DIR="/home/cbl/Downloads/samtools-1.9/"; | |
| 9 | |
| 10 my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); | |
| 11 | |
| 12 getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY | |
| 13 | |
| 14 my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; | |
| 15 | |
| 16 my $pt_bad_mates_file=$opts{b}; #GALAXY | |
| 17 my $pt_log_file=$opts{l}; #GALAXY | |
| 18 my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY | |
| 19 | |
| 20 | |
| 21 die(qq/ | |
| 22 | |
| 23 Description: | |
| 24 | |
| 25 Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input | |
| 26 for SVDetect. | |
| 27 | |
| 28 From all pairs mapped onto the reference genome, this script outputs abnormal pairs: | |
| 29 - mapped on two different chromosomes | |
| 30 - with an incorrect strand orientation and\/or pair order | |
| 31 - with an insert size distance +- sigma threshold | |
| 32 into a file <prefix.ab.bam\/sam> | |
| 33 | |
| 34 -BAM\/SAM File input format only, sorted by read names | |
| 35 | |
| 36 Version : $version | |
| 37 SAMtools required for BAM files | |
| 38 | |
| 39 | |
| 40 Usage: BAM_preprocessingPairs.pl [options] <all_mate_file.sorted.bam\/sam> | |
| 41 | |
| 42 Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] | |
| 43 -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}] | |
| 44 -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}] | |
| 45 -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] | |
| 46 -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] | |
| 47 -f REAL minimal number of sigma fold for filtering pairs [$opts{f}] | |
| 48 -d dump normal pairs into a file [<prefix.norm.bam\/sam>] (optional) | |
| 49 -o STRING output directory [$working_dir] | |
| 50 | |
| 51 \n/) if (@ARGV == 0 && -t STDIN); | |
| 52 | |
| 53 unless (-d $opts{o}){ | |
| 54 mkdir $opts{o} or die; | |
| 55 } | |
| 56 $opts{o}.="/" if($opts{o}!~/\/$/); | |
| 57 | |
| 58 my $mates_file=shift(@ARGV); | |
| 59 | |
| 60 $mates_file=readlink($mates_file); | |
| 61 | |
| 62 my $bad_mates_file=(split(/\//,$mates_file))[$#_]; | |
| 63 | |
| 64 if($bad_mates_file=~/.(s|b)am$/){ | |
| 65 $bad_mates_file=~s/.(b|s)am$/.ab.sam/; | |
| 66 $bad_mates_file=$opts{o}.$bad_mates_file; | |
| 67 } | |
| 68 | |
| 69 else{ | |
| 70 die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; | |
| 71 } | |
| 72 | |
| 73 my $good_mates_file; | |
| 74 if($opts{d}){ | |
| 75 $good_mates_file=(split(/\//,$mates_file))[$#_]; | |
| 76 $good_mates_file=~s/.(b|s)am$/.norm.sam/; | |
| 77 $good_mates_file=$opts{o}.$good_mates_file; | |
| 78 } | |
| 79 | |
| 80 my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY | |
| 81 | |
| 82 #------------------------------------------------------------------------------# | |
| 83 #Calculate mu and sigma | |
| 84 | |
| 85 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; | |
| 86 | |
| 87 print LOG "\# Calculating mu and sigma lengths...\n"; | |
| 88 print LOG "-- file=$mates_file\n"; | |
| 89 print LOG "-- n=$opts{n}\n"; | |
| 90 print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; | |
| 91 | |
| 92 my ($record, $sumX,$sumX2) = (0,0,0); | |
| 93 my $warn=$opts{n}/10; | |
| 94 my $prev_pair="FIRST"; | |
| 95 | |
| 96 my $bam=($mates_file =~ /.bam$/)? 1:0; | |
| 97 | |
| 98 if($bam){ | |
| 99 open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | |
| 100 }else{ | |
| 101 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; | |
| 102 } | |
| 103 | |
| 104 while(<MATES>){ | |
| 105 | |
| 106 my @t=split; | |
| 107 | |
| 108 next if ($t[0]=~/^@/); | |
| 109 | |
| 110 my $current_pair=$t[0]; | |
| 111 next if($current_pair eq $prev_pair); | |
| 112 $prev_pair=$current_pair; | |
| 113 | |
| 114 my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); | |
| 115 | |
| 116 next if (($t[1]&0x0004) || ($t[1]&0x0008)); | |
| 117 next if ($length<$opts{s} || $length>$opts{S}) ; | |
| 118 | |
| 119 if($chr2 eq "="){ | |
| 120 | |
| 121 $sumX += $length; #add to sum and sum^2 for mean and variance calculation | |
| 122 $sumX2 += $length*$length; | |
| 123 $record++; | |
| 124 } | |
| 125 | |
| 126 if($record>$warn){ | |
| 127 print LOG "-- $warn pairs analysed\n"; | |
| 128 $warn+=$warn; | |
| 129 } | |
| 130 | |
| 131 last if ($record>$opts{n}); | |
| 132 | |
| 133 } | |
| 134 close (MATES); | |
| 135 | |
| 136 $record--; | |
| 137 my $mu = $sumX/$record; | |
| 138 my $sigma = sqrt($sumX2/$record - $mu*$mu); | |
| 139 | |
| 140 print LOG "-- Total : $record pairs analysed\n"; | |
| 141 print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; | |
| 142 | |
| 143 #------------------------------------------------------------------------------# | |
| 144 #------------------------------------------------------------------------------# | |
| 145 #Preprocessing pairs | |
| 146 | |
| 147 $warn=100000; | |
| 148 | |
| 149 $record=0; | |
| 150 my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); | |
| 151 | |
| 152 my $read_type=($opts{t})? "Illumina":"SOLiD"; | |
| 153 my $pair_type=($opts{p})? "paired-end":"mate-paired"; | |
| 154 | |
| 155 print LOG "\# Preprocessing pairs...\n"; | |
| 156 print LOG "-- file= $mates_file\n"; | |
| 157 print LOG "-- type= $read_type $pair_type reads\n"; | |
| 158 print LOG "-- sigma threshold= $opts{f}\n"; | |
| 159 print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; | |
| 160 | |
| 161 my @header; | |
| 162 | |
| 163 if($bam){ | |
| 164 open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | |
| 165 @header=<HEADER>; | |
| 166 close HEADER; | |
| 167 open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | |
| 168 }else{ | |
| 169 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; | |
| 170 } | |
| 171 | |
| 172 open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; | |
| 173 print AB @header if($bam); | |
| 174 | |
| 175 if($opts{d}){ | |
| 176 open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; | |
| 177 print NORM @header if($bam); | |
| 178 } | |
| 179 | |
| 180 $prev_pair="FIRST"; | |
| 181 my $prev_bad; | |
| 182 | |
| 183 while(<MATES>){ | |
| 184 | |
| 185 my @t=split; | |
| 186 my $bad=0; | |
| 187 | |
| 188 if ($t[0]=~/^@/){ | |
| 189 print AB; | |
| 190 print NORM if ($opts{d}); | |
| 191 next; | |
| 192 } | |
| 193 | |
| 194 my $current_pair=$t[0]; | |
| 195 if($current_pair eq $prev_pair){ | |
| 196 next if($prev_bad==-1); | |
| 197 if($prev_bad){ | |
| 198 print AB; | |
| 199 }elsif(!$prev_bad){ | |
| 200 print NORM if($opts{d}); | |
| 201 } | |
| 202 next; | |
| 203 } | |
| 204 | |
| 205 $prev_pair=$current_pair; | |
| 206 | |
| 207 my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); | |
| 208 | |
| 209 if (($t[1]&0x0004) || ($t[1]&0x0008)){ | |
| 210 $prev_bad=-1; | |
| 211 $count{unmap}++; | |
| 212 $record++; | |
| 213 next; | |
| 214 | |
| 215 } | |
| 216 | |
| 217 my $strand1 = (($t[1]&0x0010))? 'R':'F'; | |
| 218 my $strand2 = (($t[1]&0x0020))? 'R':'F'; | |
| 219 my $order1 = (($t[1]&0x0040))? '1':'2'; | |
| 220 my $order2 = (($t[1]&0x0080))? '1':'2'; | |
| 221 | |
| 222 if($order1 == 2){ | |
| 223 ($strand1,$strand2)=($strand2,$strand1); | |
| 224 ($chr1,$chr2)=($chr2,$chr1); | |
| 225 ($pos1,$pos2)=($pos2,$pos1); | |
| 226 ($order1,$order2)=($order2,$order1); | |
| 227 } | |
| 228 | |
| 229 my $sense=$strand1.$strand2; | |
| 230 | |
| 231 if($chr1 ne "=" && $chr2 ne "="){ | |
| 232 $bad=1; | |
| 233 $count{chr}++; | |
| 234 } | |
| 235 | |
| 236 if($opts{p}){ #paired-end | |
| 237 if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ | |
| 238 $bad=1; | |
| 239 $count{sense}++; | |
| 240 } | |
| 241 }else{ #mate-pair | |
| 242 if($opts{t}){ #Illumina | |
| 243 if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ | |
| 244 $bad=1; | |
| 245 $count{sense}++; | |
| 246 } | |
| 247 }else{ #SOLiD | |
| 248 if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ | |
| 249 $bad=1; | |
| 250 $count{sense}++; | |
| 251 } | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ | |
| 256 $bad=1; | |
| 257 $count{dist}++; | |
| 258 } | |
| 259 | |
| 260 if($bad){ | |
| 261 print AB; | |
| 262 $count{ab}++; | |
| 263 $prev_bad=$bad; | |
| 264 }else{ | |
| 265 print NORM if ($opts{d}); | |
| 266 $count{norm}++; | |
| 267 $prev_bad=$bad; | |
| 268 } | |
| 269 | |
| 270 $record++; | |
| 271 | |
| 272 if($record>$warn){ | |
| 273 print LOG "-- $warn pairs analysed\n"; | |
| 274 $warn+=100000; | |
| 275 } | |
| 276 } | |
| 277 | |
| 278 close AB; | |
| 279 close NORM if($opts{d}); | |
| 280 | |
| 281 print LOG "-- Total : $record pairs analysed\n"; | |
| 282 print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; | |
| 283 print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; | |
| 284 print LOG "---- $count{ab} abnormal mapped pairs\n"; | |
| 285 print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; | |
| 286 print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; | |
| 287 print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; | |
| 288 print LOG "--- $count{norm} correct mapped pairs\n"; | |
| 289 | |
| 290 #------------------------------------------------------------------------------# | |
| 291 #------------------------------------------------------------------------------# | |
| 292 #OUTPUT | |
| 293 | |
| 294 if($bam){ | |
| 295 | |
| 296 my $bam_file=$bad_mates_file; | |
| 297 $bam_file=~s/.sam$/.bam/; | |
| 298 print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; | |
| 299 system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); | |
| 300 unlink($bad_mates_file); | |
| 301 print LOG "-- output created: $bam_file\n"; | |
| 302 | |
| 303 system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY | |
| 304 | |
| 305 if($opts{d}){ | |
| 306 $bam_file=$good_mates_file; | |
| 307 $bam_file=~s/.sam$/.bam/; | |
| 308 print LOG "\# Converting sam to bam for correct mapped pairs\n"; | |
| 309 system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); | |
| 310 unlink($good_mates_file); | |
| 311 print LOG "-- output created: $bam_file\n"; | |
| 312 | |
| 313 system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY | |
| 314 | |
| 315 } | |
| 316 | |
| 317 } | |
| 318 | |
| 319 else{ | |
| 320 print LOG "-- output created: $bad_mates_file\n"; | |
| 321 print LOG "-- output created: $good_mates_file\n" if($opts{d}); | |
| 322 } | |
| 323 | |
| 324 close LOG; | |
| 325 | |
| 326 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY | |
| 327 | |
| 328 | |
| 329 #------------------------------------------------------------------------------# | |
| 330 #------------------------------------------------------------------------------# | |
| 331 sub decimal{ | |
| 332 | |
| 333 my $num=shift; | |
| 334 my $digs_to_cut=shift; | |
| 335 | |
| 336 $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); | |
| 337 | |
| 338 return $num; | |
| 339 } | |
| 340 #------------------------------------------------------------------------------# |
