comparison rapsodyn/PrepareFastqLight.pl @ 5:b0cbb9d21aa9 draft

Uploaded
author mcharles
date Mon, 22 Sep 2014 10:19:53 -0400
parents 9074a5104cdd
children 3f7b0788a1c4
comparison
equal deleted inserted replaced
4:9074a5104cdd 5:b0cbb9d21aa9
266 close (READ2); 266 close (READ2);
267 close (OUT1); 267 close (OUT1);
268 close (OUT2); 268 close (OUT2);
269 269
270 270
271
272
273 sub grooming_and_trimming{ 271 sub grooming_and_trimming{
274 my $header = shift; 272 my $header = shift;
275 my $seq = shift; 273 my $seq = shift;
276 my $quality = shift; 274 my $quality = shift;
277 my $quality_converted=""; 275 my $quality_converted="";
278 276 my $quality_ori=$quality;
279 my $startnoN = 0; 277
280 my $stopnoN = length($quality)-1; 278 my $lengthseq = length($seq);
281 279 my $startTrim = 0;
282 #print "HEAD:\t$header"; 280 my $stopTrim = length($quality)-1;
283 #print "SEQ:\t$seq\n"; 281 my $startnoN = $startTrim;
282 my $stopnoN = $stopTrim;
283
284 284
285 my $chercheN = $seq; 285 my $chercheN = $seq;
286 my @bad_position; 286 my @bad_position_N;
287 my @bad_position_Q;
287 my $current_index = index($chercheN,"N"); 288 my $current_index = index($chercheN,"N");
288 my $abs_index = $current_index; 289 my $abs_index = $current_index;
289 while ($current_index >=0){ 290 while ($current_index >=0){
290 push (@bad_position,$abs_index); 291 push (@bad_position_N,$abs_index);
291 292
292 if ($current_index<length($seq)){ 293 if ($current_index<length($seq)){
293 $chercheN = substr($chercheN,$current_index+1); 294 $chercheN = substr($chercheN,$current_index+1);
294 $current_index = index($chercheN,"N"); 295 $current_index = index($chercheN,"N");
295 $abs_index = $current_index + $bad_position[$#bad_position]+1; 296 $abs_index = $current_index + $bad_position_N[$#bad_position_N]+1;
296 } 297 }
297 else { 298 else {
298 last; 299 last;
299 } 300 }
300 } 301 }
301 302
303 my @q = split(//,$quality);
304 for (my $i=0;$i<=$#q;$i++){
305 my $chr = $q[$i];
306 my $num = ord($q[$i]);
307 if ($TYPE eq "illumina"){
308 $num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40)
309 $quality_converted .= chr($num);
310 }
311
312 if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger
313 push(@bad_position_Q,$i);
314 }
315 }
316 if ($quality_converted){$quality = $quality_converted;}
317
318 my @bad_position = (@bad_position_N, @bad_position_Q);
302 319
303 if ($#bad_position>=0){ 320 if ($#bad_position>=0){
304 my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)}; 321 @bad_position = sort {$a <=> $b} @bad_position;
305 $startnoN = $coord{"start"}; 322 my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)};
306 $stopnoN = $coord{"stop"}; 323 $startTrim = $coord{"start"};
307 } 324 $stopTrim = $coord{"stop"};
308 my $lengthnoN = $stopnoN - $startnoN + 1; 325 #print "$startTrim .. $stopTrim\n";
309 my $seqnoN = substr($seq,$startnoN,$lengthnoN); 326
310 # print "SEQnoN\t:$seqnoN\n"; 327 }
311 # for (my $i=0;$i<=$#bad_position;$i++){ 328 my $lengthTrim = $stopTrim - $startTrim +1;
312 # print $bad_position[$i]."\t"; 329
330 my $fastq_lines="";
331
332 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
333 # print "HEAD:\t$header";
334 # print "SEQ:\n$seq\n";
335 # print "$quality_ori\n";
336 # print "$quality\n";
337 # for (my $i=0;$i<=$#bad_position;$i++){
338 # print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t";
339 # }
340 # print "\n";
341 # print "$startTrim .. $stopTrim / $lengthTrim \n";
342 # print $fastq_lines;
343 # print "\n";
313 # } 344 # }
314 # print "\n"; 345
315 346 if ($lengthTrim >= $MIN_LENGTH){
316 if ($lengthnoN >= $MIN_LENGTH){ 347 $fastq_lines .= $header;
317 my $startTrim = $startnoN; 348 $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
318 my $stopTrim = $stopnoN; 349 $fastq_lines .= "+\n";
319 350 $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
320 my $quality_converted=""; 351 return $fastq_lines;
321 #my @bad_position;
322
323 my @q = split(//,$quality);
324 #print "QUALITY\n";
325 #print "$quality\n";
326 for (my $i=0;$i<=$stopnoN;$i++){
327 my $chr = $q[$i];
328 my $num = ord($q[$i]);
329 if ($TYPE eq "illumina"){
330 $num = $num -64+33;
331 $quality_converted .= chr($num);
332 }
333
334 if ($num <$MIN_QUALITY + 64 - 33 ){
335 push(@bad_position,$i+$startnoN);
336 }
337 }
338 if ($quality_converted){$quality = $quality_converted;}
339 #print "$quality\n";
340
341
342
343 if ($#bad_position>=0){
344 @bad_position = sort {$a <=> $b} @bad_position;
345 # for (my $i=0;$i<=$#bad_position;$i++){
346 # print $bad_position[$i]."\t";
347 # }
348 # print "\n";
349 my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
350 $startTrim = $coord{"start"};
351 $stopTrim = $coord{"stop"};
352 #print "$startTrim .. $stopTrim\n";
353
354 }
355 my $lengthTrim = $stopTrim - $startTrim +1;
356
357
358 my $fastq_lines="";
359
360 if ($lengthTrim >= $MIN_LENGTH){
361 $fastq_lines .= $header;
362 $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
363 $fastq_lines .= "+\n";
364 $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
365 # print $fastq_lines;
366 return $fastq_lines;
367
368 }
369 else {
370 return "";
371 }
372
373
374 352
375 } 353 }
376 else { 354 else {
355 #print "Insufficient length after trimming\n";
377 return ""; 356 return "";
378 } 357 }
379
380
381 # my @s = split(//,$seq);
382 # my $sanger_quality="";
383
384
385
386
387 # return $sanger_quality;
388 } 358 }
389 359
390 sub extract_longer_string_coordinates_from_bad_position{ 360 sub extract_longer_string_coordinates_from_bad_position{
391 my $start=shift; 361 my $start=shift;
392 my $stop =shift; 362 my $stop =shift;