comparison rapsodyn/PrepareFastqLight.pl @ 0:442a7c88b886 draft

Uploaded
author mcharles
date Wed, 10 Sep 2014 09:18:15 -0400
parents
children 9074a5104cdd
comparison
equal deleted inserted replaced
-1:000000000000 0:442a7c88b886
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4
5 my $read1 = $ARGV[0];
6 my $read2 = $ARGV[1];
7
8 my $output1 = $ARGV[2];
9 my $output2 = $ARGV[3];
10
11 my $TYPE = $ARGV[4];
12 my $MIN_LENGTH = $ARGV[5];
13 my $MIN_QUALITY = $ARGV[6];
14
15 my $VERBOSE = $ARGV[7];
16
17 if (!$VERBOSE){
18 $VERBOSE ="OFF";
19 }
20
21 open(READ1, $read1) or die ("Can't open $read1\n");
22 open(READ2, $read2) or die ("Can't open $read2\n");
23 open(OUT1, ">$output1") or die ("Can't open $output1\n");
24 open(OUT2, ">$output2") or die ("Can't open $output2\n");
25
26
27 my $error1=0;
28 my $error2=0;
29 my $error3=0;
30 my $error4=0;
31 my $error5=0;
32 my $error6=0;
33 my $error7=0;
34 my $error8=0;
35 my $error9=0;
36 my $error10=0;
37
38 while (my $ligne1_r1 =<READ1>){
39 my $ligne2_r1 =<READ1>;
40 my $ligne3_r1 =<READ1>;
41 my $ligne4_r1 =<READ1>;
42 my $ligne1_r2 =<READ2>;
43 my $ligne2_r2 =<READ2>;
44 my $ligne3_r2 =<READ2>;
45 my $ligne4_r2 =<READ2>;
46
47 #@ 1 sec
48 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
49 if ($VERBOSE eq "ON"){
50 print "Error in file format";
51 if ($ligne1_r1){print $ligne1_r1;}
52 if ($ligne2_r1){print $ligne2_r1;}
53 if ($ligne3_r1){print $ligne3_r1;}
54 if ($ligne4_r1){print $ligne4_r1;}
55 if ($ligne1_r2){print $ligne1_r2;}
56 if ($ligne2_r2){print $ligne2_r2;}
57 if ($ligne3_r2){print $ligne3_r2;}
58 if ($ligne4_r2){print $ligne4_r2;}
59 print "\n";
60 }
61 $error1++;
62 }
63 elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
64 if ($VERBOSE eq "ON"){
65 print "Error in header : format\n";
66 print $ligne1_r1;
67 print $ligne2_r1;
68 print $ligne3_r1;
69 print $ligne4_r1;
70 print $ligne1_r2;
71 print $ligne2_r2;
72 print $ligne3_r2;
73 print $ligne4_r2;
74 print "\n";
75 }
76 $error2++;
77 }
78 #@ 1 - 2 sec
79 else {
80
81 my $length_seq1 = length($ligne2_r1);
82 my $length_qual1 =length($ligne4_r1);
83 my $seq1;
84 my $qual1;
85
86 my $length_seq2 = length($ligne2_r2);
87 my $length_qual2 =length($ligne4_r2);
88 my $seq2;
89 my $qual2;
90 my $header1="";
91 my $header2="";
92 my $repheader1="";
93 my $repheader2="";
94
95 if ($ligne1_r1 =~/^\@(.*?)\#/){
96 $header1 = $1;
97 }
98
99 if ($ligne3_r1 =~/^\+(.*?)\#/){
100 $repheader1 = $1;
101 }
102
103 if ($ligne1_r2 =~/^\@(.*?)\#/){
104 $header2 = $1;
105 }
106
107 if ($ligne3_r2 =~/^\+(.*?)\#/){
108 $repheader2 = $1;
109 }
110 #@ 2 sec
111
112 ### Verification de la coherence sequence /qualité @ 1 sec
113 if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
114 if ($VERBOSE eq "ON"){
115 print "Error in header : empty\n";
116 print $ligne1_r1;
117 print $ligne2_r1;
118 print $ligne3_r1;
119 print $ligne4_r1;
120 print $ligne1_r2;
121 print $ligne2_r2;
122 print $ligne3_r2;
123 print $ligne4_r2;
124 print "\n";
125 }
126 $error3++;
127 }
128 elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
129 if ($VERBOSE eq "ON"){
130 print "Error in header refgsd : empty\n";
131 print $ligne1_r1;
132 print $ligne2_r1;
133 print $ligne3_r1;
134 print $ligne4_r1;
135 print $ligne1_r2;
136 print $ligne2_r2;
137 print $ligne3_r2;
138 print $ligne4_r2;
139 print "\n";
140 }
141 $error3++;
142 }
143 elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
144 if ($VERBOSE eq "ON"){
145 print "Error in header : different\n";
146 print $ligne1_r1;
147 print $ligne2_r1;
148 print $ligne3_r1;
149 print $ligne4_r1;
150 print $ligne1_r2;
151 print $ligne2_r2;
152 print $ligne3_r2;
153 print $ligne4_r2;
154 print "\n";
155 }
156 $error4++;
157 }
158 elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
159 if ($VERBOSE eq "ON"){
160 print "Error in header : different\n";
161 print $ligne1_r1;
162 print $ligne2_r1;
163 print $ligne3_r1;
164 print $ligne4_r1;
165 print $ligne1_r2;
166 print $ligne2_r2;
167 print $ligne3_r2;
168 print $ligne4_r2;
169 print "\n";
170 }
171 $error4++;
172 }
173 elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
174 if ($VERBOSE eq "ON"){
175 print "Error in seq/qual length\n";
176 print $ligne1_r1;
177 print $ligne2_r1;
178 print $ligne3_r1;
179 print $ligne4_r1;
180 print $ligne1_r2;
181 print $ligne2_r2;
182 print $ligne3_r2;
183 print $ligne4_r2;
184 print "\n";
185 }
186 $error5++;
187 }
188 #@ 1 - 2 sec
189 else {
190 ### Parsing sequence & qualité
191 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
192 $seq1 = $1;
193 }
194 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
195 $seq2 = $1;
196 }
197 if ($ligne4_r1 =~ /^(.*)\s*$/i){
198 $qual1 = $1;
199 }
200 if ($ligne4_r2 =~ /^(.*)\s*$/i){
201 $qual2 = $1;
202 }
203 #@ 2 sec
204 ### Verification du parsing et de la coherence sequence /qualité (n°2)
205 if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
206 if ($VERBOSE eq "ON"){
207 print "Error parsing seq / quality \n";
208 print $ligne1_r1;
209 print $ligne2_r1;
210 print $ligne3_r1;
211 print $ligne4_r1;
212 print $ligne1_r2;
213 print $ligne2_r2;
214 print $ligne3_r2;
215 print $ligne4_r2;
216 print "\n";
217 }
218 $error6++;
219 }
220 elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
221 if ($VERBOSE eq "ON"){
222 print "Error in seq/qual length after parsing\n";
223 print $ligne1_r1;
224 print $ligne2_r1;
225 print $ligne3_r1;
226 print $ligne4_r1;
227 print $ligne1_r2;
228 print $ligne2_r2;
229 print $ligne3_r2;
230 print $ligne4_r2;
231 print "\n";
232 }
233 $error7++;
234 }
235 #@ <1 sec
236 else {
237 my $fastq_lines_r1="";
238 my $fastq_lines_r2="";
239 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
240 if ($fastq_lines_r1){
241 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
242 }
243 if ($fastq_lines_r2){
244 print OUT1 $fastq_lines_r1;
245 print OUT2 $fastq_lines_r2;
246 }
247 }
248 }
249
250 # print OUT1 $ligne1_r1;
251 # print OUT1 $ligne2_r1;
252 # print OUT1 $ligne3_r1;
253 # print OUT1 $ligne4_r1;
254 # print OUT2 $ligne1_r2;
255 # print OUT2 $ligne2_r2;
256 # print OUT2 $ligne3_r2;
257 # print OUT2 $ligne4_r2;
258
259 #@ 7 sec
260 }
261 }
262
263
264
265 close (READ1);
266 close (READ2);
267 close (OUT1);
268 close (OUT2);
269
270
271
272
273 sub grooming_and_trimming{
274 my $header = shift;
275 my $seq = shift;
276 my $quality = shift;
277 my $quality_converted="";
278
279 my $startnoN = 0;
280 my $stopnoN = length($quality)-1;
281
282 #print "SEQ :\n$seq\n";
283
284 my $chercheN = $seq;
285 my @bad_position;
286 my $current_index = index($chercheN,"N");
287 my $abs_index = $current_index;
288 while ($current_index >=0){
289 push (@bad_position,$abs_index);
290
291 if ($current_index<length($seq)){
292 $chercheN = substr($chercheN,$current_index+1);
293 $current_index = index($chercheN,"N");
294 $abs_index = $current_index + $bad_position[$#bad_position]+1;
295 }
296 else {
297 last;
298 }
299 }
300
301
302 if ($#bad_position>=0){
303 my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
304 $startnoN = $coord{"start"};
305 $stopnoN = $coord{"stop"};
306 }
307 my $lengthnoN = $stopnoN - $startnoN + 1;
308 my $seqnoN = substr($seq,$startnoN,$lengthnoN);
309 #print "$seqnoN\n";
310
311 if ($lengthnoN >= $MIN_LENGTH){
312 my $startTrim = $startnoN;
313 my $stopTrim = $stopnoN;
314
315 my $quality_converted="";
316 my @bad_position;
317
318 my @q = split(//,$quality);
319 #print "QUALITY\n";
320 #print "$quality\n";
321 for (my $i=0;$i<=$stopnoN;$i++){
322 my $chr = $q[$i];
323 my $num = ord($q[$i]);
324 if ($TYPE eq "illumina"){
325 $num = $num -64+33;
326 $quality_converted .= chr($num);
327 }
328
329 if ($num <$MIN_QUALITY + 64 - 33 ){
330 push(@bad_position,$i+$startnoN);
331 }
332 }
333 if ($quality_converted){$quality = $quality_converted;}
334 #print "$quality\n";
335
336
337
338 if ($#bad_position>=0){
339 # for (my $i=0;$i<=$#bad_position;$i++){
340 # print $bad_position[$i]."\t";
341 # }
342 # print "\n";
343 my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
344 $startTrim = $coord{"start"};
345 $stopTrim = $coord{"stop"};
346 #print "$startTrim .. $stopTrim\n";
347
348 }
349 my $lengthTrim = $stopTrim - $startTrim +1;
350
351
352 my $fastq_lines="";
353
354 if ($lengthTrim >= $MIN_LENGTH){
355 $fastq_lines .= $header;
356 $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
357 $fastq_lines .= "+\n";
358 $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
359 return $fastq_lines;
360 }
361 else {
362 return "";
363 }
364
365
366
367 }
368 else {
369 return "";
370 }
371
372
373 # my @s = split(//,$seq);
374 # my $sanger_quality="";
375
376
377
378
379 # return $sanger_quality;
380 }
381
382 sub extract_longer_string_coordinates_from_bad_position{
383 my $start=shift;
384 my $stop =shift;
385 my $refbad = shift;
386 my @bad_position = @$refbad;
387 my %coord;
388
389 my $current_start = $start;
390 my $current_stop = $bad_position[0]-1;
391 if ($current_stop < $start){$current_stop = $start;}
392
393
394 #debut -> premier N
395 my $current_length = $current_stop - $current_start +1;
396 my $test_length;
397
398 #entre les N
399 for (my $i=1;$i<=$#bad_position;$i++){
400 $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
401 if ( $test_length > $current_length){
402 $current_start = $bad_position[$i-1]+1;
403 $current_stop = $bad_position[$i]-1;
404 $current_length = $current_stop - $current_start +1;
405 }
406 }
407
408 #dernier N -> fin
409 $test_length = $stop-$bad_position[$#bad_position]+1;
410 if ( $test_length > $current_length){
411 $current_start = $bad_position[$#bad_position]+1;
412 if ($current_start > $stop){$current_start=$stop;}
413 $current_stop = $stop;
414 }
415 $coord{"start"}=$current_start;
416 $coord{"stop"}= $current_stop;
417 $coord{"lenght"}=$current_stop-$current_start+1;
418
419 return \%coord;
420 }