3
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use Getopt::Std;
|
|
6 my $version = '0.4b_galaxy';
|
|
7
|
|
8 my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools";
|
|
9
|
|
10 my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." );
|
|
11
|
|
12 getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY
|
|
13
|
|
14 my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory";
|
|
15
|
|
16 my $pt_bad_mates_file=$opts{b}; #GALAXY
|
|
17 my $pt_log_file=$opts{l}; #GALAXY
|
|
18 my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY
|
|
19
|
|
20
|
|
21 die(qq/
|
|
22
|
|
23 Description:
|
|
24
|
|
25 Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input
|
|
26 for SVDetect.
|
|
27
|
|
28 From all pairs mapped onto the reference genome, this script outputs abnormal pairs:
|
|
29 - mapped on two different chromosomes
|
|
30 - with an incorrect strand orientation and\/or pair order
|
|
31 - with an insert size distance +- sigma threshold
|
|
32 into a file <prefix.ab.bam\/sam> sorted by read names
|
|
33
|
|
34 -BAM\/SAM File input format only.
|
|
35
|
|
36 Version : $version
|
|
37 SAMtools required for BAM files
|
|
38
|
|
39
|
|
40 Usage: BAM_preprocessingPairs.pl [options] <all_mate_file.sorted.bam\/sam>
|
|
41
|
|
42 Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}]
|
|
43 -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}]
|
|
44 -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}]
|
|
45 -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}]
|
|
46 -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}]
|
|
47 -f REAL minimal number of sigma fold for filtering pairs [$opts{f}]
|
|
48 -d dump normal pairs into a file [<prefix.norm.bam\/sam>] (optional)
|
|
49 -o STRING output directory [$working_dir]
|
|
50
|
|
51 \n/) if (@ARGV == 0 && -t STDIN);
|
|
52
|
|
53 unless (-d $opts{o}){
|
|
54 mkdir $opts{o} or die;
|
|
55 }
|
|
56 $opts{o}.="/" if($opts{o}!~/\/$/);
|
|
57
|
|
58 my $mates_file=shift(@ARGV);
|
|
59
|
|
60 $mates_file=readlink($mates_file);
|
|
61
|
|
62 my $bad_mates_file=(split(/\//,$mates_file))[$#_];
|
|
63
|
|
64 if($bad_mates_file=~/.(s|b)am$/){
|
|
65 $bad_mates_file=~s/.(b|s)am$/.ab.sam/;
|
|
66 $bad_mates_file=$opts{o}.$bad_mates_file;
|
|
67 }
|
|
68
|
|
69 else{
|
|
70 die "Error: mate_file with the extension <.bam> or <.sam> needed !\n";
|
|
71 }
|
|
72
|
|
73 my $good_mates_file;
|
|
74 if($opts{d}){
|
|
75 $good_mates_file=(split(/\//,$mates_file))[$#_];
|
|
76 $good_mates_file=~s/.(b|s)am$/.norm.sam/;
|
|
77 $good_mates_file=$opts{o}.$good_mates_file;
|
|
78 }
|
|
79
|
|
80 my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY
|
|
81
|
|
82 #------------------------------------------------------------------------------#
|
|
83 #Calculate mu and sigma
|
|
84
|
|
85 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";
|
|
86
|
|
87 print LOG "\# Calculating mu and sigma lengths...\n";
|
|
88 print LOG "-- file=$mates_file\n";
|
|
89 print LOG "-- n=$opts{n}\n";
|
|
90 print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n";
|
|
91
|
|
92 my ($record, $sumX,$sumX2) = (0,0,0);
|
|
93 my $warn=$opts{n}/10;
|
|
94 my $prev_pair="FIRST";
|
|
95
|
|
96 my $bam=($mates_file =~ /.bam$/)? 1:0;
|
|
97
|
|
98 if($bam){
|
|
99 open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
|
|
100 }else{
|
|
101 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
|
|
102 }
|
|
103
|
|
104 while(<MATES>){
|
|
105
|
|
106 my @t=split;
|
|
107
|
|
108 next if ($t[0]=~/^@/);
|
|
109
|
|
110 my $current_pair=$t[0];
|
|
111 next if($current_pair eq $prev_pair);
|
|
112 $prev_pair=$current_pair;
|
|
113
|
|
114 my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8]));
|
|
115
|
|
116 next if ($chr1 eq "*" || $chr2 eq "*");
|
|
117 next if ($length<$opts{s} || $length>$opts{S}) ;
|
|
118
|
|
119 if($chr2 eq "="){
|
|
120
|
|
121 $sumX += $length; #add to sum and sum^2 for mean and variance calculation
|
|
122 $sumX2 += $length*$length;
|
|
123 $record++;
|
|
124 }
|
|
125
|
|
126 if($record>$warn){
|
|
127 print LOG "-- $warn pairs analysed\n";
|
|
128 $warn+=$warn;
|
|
129 }
|
|
130
|
|
131 last if ($record>$opts{n});
|
|
132
|
|
133 }
|
|
134 close (MATES);
|
|
135
|
|
136 $record--;
|
|
137 my $mu = $sumX/$record;
|
|
138 my $sigma = sqrt($sumX2/$record - $mu*$mu);
|
|
139
|
|
140 print LOG "-- Total : $record pairs analysed\n";
|
|
141 print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n";
|
|
142
|
|
143 #------------------------------------------------------------------------------#
|
|
144 #------------------------------------------------------------------------------#
|
|
145 #Preprocessing pairs
|
|
146
|
|
147 $warn=100000;
|
|
148
|
|
149 $record=0;
|
|
150 my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0);
|
|
151
|
|
152 my $read_type=($opts{t})? "Illumina":"SOLiD";
|
|
153 my $pair_type=($opts{p})? "paired-end":"mate-paired";
|
|
154
|
|
155 print LOG "\# Preprocessing pairs...\n";
|
|
156 print LOG "-- file= $mates_file\n";
|
|
157 print LOG "-- type= $read_type $pair_type reads\n";
|
|
158 print LOG "-- sigma threshold= $opts{f}\n";
|
|
159 print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n";
|
|
160
|
|
161 my @header;
|
|
162
|
|
163 if($bam){
|
|
164 open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
|
|
165 @header=<HEADER>;
|
|
166 close HEADER;
|
|
167 open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
|
|
168 }else{
|
|
169 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
|
|
170 }
|
|
171
|
|
172 open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n";
|
|
173 print AB @header if($bam);
|
|
174
|
|
175 if($opts{d}){
|
|
176 open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n";
|
|
177 print NORM @header if($bam);
|
|
178 }
|
|
179
|
|
180 $prev_pair="FIRST";
|
|
181 my $prev_bad;
|
|
182
|
|
183 while(<MATES>){
|
|
184
|
|
185 my @t=split;
|
|
186 my $bad=0;
|
|
187
|
|
188 if ($t[0]=~/^@/){
|
|
189 print AB;
|
|
190 print NORM if ($opts{d});
|
|
191 next;
|
|
192 }
|
|
193
|
|
194 my $current_pair=$t[0];
|
|
195 if($current_pair eq $prev_pair){
|
|
196 next if($prev_bad==-1);
|
|
197 if($prev_bad){
|
|
198 print AB;
|
|
199 }elsif(!$prev_bad){
|
|
200 print NORM if($opts{d});
|
|
201 }
|
|
202 next;
|
|
203 }
|
|
204
|
|
205 $prev_pair=$current_pair;
|
|
206
|
|
207 my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8]));
|
|
208
|
|
209 if ($chr1 eq "*" || $chr2 eq "*"){
|
|
210 $prev_bad=-1;
|
|
211 $count{unmap}++;
|
|
212 $record++;
|
|
213 next;
|
|
214
|
|
215 }
|
|
216
|
|
217 my $strand1 = (($t[1]&0x0010))? 'R':'F';
|
|
218 my $strand2 = (($t[1]&0x0020))? 'R':'F';
|
|
219 my $order1 = (($t[1]&0x0040))? '1':'2';
|
|
220 my $order2 = (($t[1]&0x0080))? '1':'2';
|
|
221
|
|
222 if($order1 == 2){
|
|
223 ($strand1,$strand2)=($strand2,$strand1);
|
|
224 ($chr1,$chr2)=($chr2,$chr1);
|
|
225 ($pos1,$pos2)=($pos2,$pos1);
|
|
226 ($order1,$order2)=($order2,$order1);
|
|
227 }
|
|
228
|
|
229 my $sense=$strand1.$strand2;
|
|
230
|
|
231 if($chr1 ne "=" && $chr2 ne "="){
|
|
232 $bad=1;
|
|
233 $count{chr}++;
|
|
234 }
|
|
235
|
|
236 if($opts{p}){ #paired-end
|
|
237 if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){
|
|
238 $bad=1;
|
|
239 $count{sense}++;
|
|
240 }
|
|
241 }else{ #mate-pair
|
|
242 if($opts{t}){ #Illumina
|
|
243 if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){
|
|
244 $bad=1;
|
|
245 $count{sense}++;
|
|
246 }
|
|
247 }else{ #SOLiD
|
|
248 if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){
|
|
249 $bad=1;
|
|
250 $count{sense}++;
|
|
251 }
|
|
252 }
|
|
253 }
|
|
254
|
|
255 if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){
|
|
256 $bad=1;
|
|
257 $count{dist}++;
|
|
258 }
|
|
259
|
|
260 if($bad){
|
|
261 print AB;
|
|
262 $count{ab}++;
|
|
263 $prev_bad=$bad;
|
|
264 }else{
|
|
265 print NORM if ($opts{d});
|
|
266 $count{norm}++;
|
|
267 $prev_bad=$bad;
|
|
268 }
|
|
269
|
|
270 $record++;
|
|
271
|
|
272 if($record>$warn){
|
|
273 print LOG "-- $warn pairs analysed\n";
|
|
274 $warn+=100000;
|
|
275 }
|
|
276 }
|
|
277
|
|
278 close AB;
|
|
279 close NORM if($opts{d});
|
|
280
|
|
281 print LOG "-- Total : $record pairs analysed\n";
|
|
282 print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n";
|
|
283 print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n";
|
|
284 print LOG "---- $count{ab} abnormal mapped pairs\n";
|
|
285 print LOG "------ $count{chr} pairs mapped on two different chromosomes\n";
|
|
286 print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n";
|
|
287 print LOG "------ $count{dist} pairs with incorrect insert size distance\n";
|
|
288 print LOG "--- $count{norm} correct mapped pairs\n";
|
|
289
|
|
290 #------------------------------------------------------------------------------#
|
|
291 #------------------------------------------------------------------------------#
|
|
292 #OUTPUT
|
|
293
|
|
294 if($bam){
|
|
295
|
|
296 my $bam_file=$bad_mates_file;
|
|
297 $bam_file=~s/.sam$/.bam/;
|
|
298 print LOG "\# Converting sam to bam for abnormal mapped pairs\n";
|
|
299 system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log");
|
|
300 unlink($bad_mates_file);
|
|
301 print LOG "-- output created: $bam_file\n";
|
|
302
|
|
303 system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY
|
|
304
|
|
305 if($opts{d}){
|
|
306 $bam_file=$good_mates_file;
|
|
307 $bam_file=~s/.sam$/.bam/;
|
|
308 print LOG "\# Converting sam to bam for correct mapped pairs\n";
|
|
309 system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log");
|
|
310 unlink($good_mates_file);
|
|
311 print LOG "-- output created: $bam_file\n";
|
|
312
|
|
313 system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY
|
|
314
|
|
315 }
|
|
316
|
|
317 }
|
|
318
|
|
319 else{
|
|
320 print LOG "-- output created: $bad_mates_file\n";
|
|
321 print LOG "-- output created: $good_mates_file\n" if($opts{d});
|
|
322 }
|
|
323
|
|
324 close LOG;
|
|
325
|
|
326 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY
|
|
327
|
|
328
|
|
329 #------------------------------------------------------------------------------#
|
|
330 #------------------------------------------------------------------------------#
|
|
331 sub decimal{
|
|
332
|
|
333 my $num=shift;
|
|
334 my $digs_to_cut=shift;
|
|
335
|
|
336 $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/);
|
|
337
|
|
338 return $num;
|
|
339 }
|
|
340 #------------------------------------------------------------------------------#
|