comparison Paired_fastQ_trimmer.pl @ 1:fdbcc1aa4f01 draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:21:15 -0500
parents
children
comparison
equal deleted inserted replaced
0:548887c4227c 1:fdbcc1aa4f01
1 #!/usr/bin/perl
2
3 # load modules
4 use Getopt::Std;
5 use threads;
6 use Thread::Queue;
7 use threads::shared;
8 use File::Basename;
9 use List::Util qw(max);
10
11 $now = time ;
12
13 $|++;
14 # variables
15 my $paired :shared;
16 $paired = 0;
17 my $fq :shared;
18 my $rq :shared;
19 my $ofq :shared;
20 my $orq :shared;
21 my $trimq :shared;
22 my $failedfile :shared;
23 my $done :shared;
24 $done = 0;
25 my $nrin :shared;
26 $nrin = 0;
27
28 my $rqz :shared;
29 my $fqz :shared;
30 my $rand :shared;
31 my $discarded :shared;
32 $discarded = 0;
33 my $verbose :shared;
34 $verbose = 1;
35 my $trim3 :shared;
36 $trim3 = 1;
37 my $trim5 :shared;
38 $trim5 = 1;
39 # create file lock in the tmp dir
40 $rand = int(rand(10000));
41 while (-e "/tmp/$rand.lock") {
42 $rand = int(rand(10000));
43 }
44 system("touch '/tmp/$rand.lock'");
45 if (!-e "/tmp/sg.write.lock") {
46 system("touch '/tmp/sg.write.lock'");
47 system("chmod 777 '/tmp/sg.write.lock'");
48 }
49
50 # opts
51 # i : path to forward read FASTQ file
52 # I : path to reverse read FASTQ file
53 # q : quality threshold in phred.
54 # n : string representing common read name parts
55 # s : trimming style : simple/bwa
56 # o : output file for forward reads
57 # O : output file for reverse reads
58 # F : output file with fully failed reads pairs.
59 # v : verbose: defaults to on (0/1)
60 # S : Trim on side: 5/3/b(oth)
61 # m : minimal length.
62 getopts('i:I:q:n:s:o:O:F:v:S:m:', \%opts) ;
63
64
65 if (defined($opts{'v'})) {
66 $verbose = $opts{'v'};
67 }
68
69 if (defined($opts{'s'})) {
70 if ($opts{'s'} eq 'bwa') {
71 if ($verbose == 1) {
72 print "Using BWA style trimming\n";
73 }
74 $style = 'bwa';
75 }
76 else {
77 if ($verbose == 1) {
78 print "Using simple 1bp Quality trimming\n";
79 }
80 $style = 'simple';
81 }
82 }
83 else {
84 if ($verbose == 1){
85 print "Using simple 1bp Quality trimming\n";
86 }
87 $style = 'simple';
88 }
89
90 if (!defined($opts{'i'})) { die('Forward reads fastq is mandatory');}
91 $fq = $opts{'i'};
92 $ofq = $opts{'o'};
93
94 ## gzipped infile?
95 my $out = `file $fq`;
96 if ($out =~ m/gzip compressed/) {
97 if ($verbose == 1) {
98 print "Forward reads in gzipped format.\n";
99 }
100 $fqz = 1;
101 }
102 else {
103 if ($verbose == 1) {
104 print "Forward reads in plain fastq format.\n";
105 }
106 $fqz = 0;
107 }
108
109
110 if (defined($opts{'I'})) {
111 if ($verbose == 1) {
112 print "Reverse reads provided. Performing paired-end trimming\n";
113 }
114 $paired = 1;
115 $rq = $opts{'I'};
116 $orq = $opts{'O'};
117 # gzipped ?
118 my $out = `file $rq`;
119 if ($out =~ m/gzip compressed/) {
120 if ($verbose == 1) {
121 print "Reverse reads in gzipped format.\n";
122 }
123 $rqz = 1;
124 }
125 else {
126 if ($verbose == 1) {
127 print "Reverse reads in plain fastq format.\n";
128 }
129 $rqz = 0;
130 }
131 }
132 ## FAILED FILENAME
133 $failedfile = $opts{'F'};
134
135 ## set the sides to trim
136 my $trimside = '';
137 if (defined($opts{'S'})) {
138 if ($opts{'S'} eq '3') {
139 $trimside = "Trimming reads from the 3' end only\n";
140 $trim3 = 1;
141 $trim5 = 0;
142 }
143 elsif ($opts{'S'} eq '5') {
144 $trimside = "Trimming reads from the 5' end only\n";
145 $trim3 = 0;
146 $trim5 = 1;
147 }
148 elsif ($opts{'S'} eq 'b') {
149 $trimside = "Trimming reads both from 5' and 3'\n";
150 $trim3 = 1;
151 $trim5 = 1;
152 }
153 else {
154 $trimside = "Invalid trimming side specification. Setting to both sides trimming\n";
155 $trim3 = 1;
156 $trim5 = 1;
157 }
158 }
159 else {
160 $trimside = "Invalid trimming side specification. Setting to both sides trimming\n";
161 $trim3 = 1;
162 $trim5 = 1;
163 }
164 if ($verbose == 1) {
165 print $trimside;
166 }
167
168 # SET MINIMAL LENGTH
169 my $minlength :shared;
170 if (exists($opts{'m'})) {
171 $minlength = $opts{'m'};
172 }
173 else {
174 $minlength = 0;
175 }
176
177 ## CHECK IF OUTPUT FILESNAMES ARE PRESENT
178 if ($ofq eq '') {
179 $ofq = "$fq.$style.trimmed";
180 if ($verbose == 1) {
181 print "No Forward output name specified. Forward reads will be printed to:\n\t$ofq\n";
182 }
183 }
184 if ($paired == 1 && $orq eq '') {
185 $orq = "$rq.$style.trimmed";
186 if ($verbose == 1) {
187 print "No Reverse output name specified. Reverse reads will be printed to:\n\t$orq\n";
188 }
189 }
190 if ($failedfile eq '') {
191 $failedfile = "$fq.Failed_Reads";
192 if ($verbose == 1) {
193 print "No failed ouput name specified. Failed reads will be printed to:\n\t$failedfile\n";
194 }
195 }
196
197 ## set trimming threshold
198 if (!defined($opts{'q'}) or !($opts{'q'} =~ m/^[0-9]+$/)) {
199 $trimq = 20;
200 }
201 else {
202 $trimq = $opts{'q'};
203 }
204
205 ## set read delimiter
206 if (defined($opts{'n'})) {
207 $indelim = $opts{'n'};
208 # galaxy replaced @ by __at__
209 if (substr($indelim,0,6) eq '__at__') {
210 $indelim = '@'.substr($indelim,6);
211 }
212 }
213 else {
214 print "No indelim defined\n";
215 $indelim = "@"; # has risk, fails if @ is first qual item
216 }
217
218 ## NEW : REPLACE if indelim = @ by the first 5 chars of first line of forward-fastq.
219 if ($indelim eq '@') {
220 if ($fqz == 0) {
221 open IN, $fq;
222 }
223 else {
224 open(IN, "gunzip -c $fq | ");
225 }
226 $head = <IN>;
227 $indelim = substr($head,0,5);
228 print "\n";
229 print "WARNING:\n";
230 print "########\n";
231 print " Setting Delimiter as '\@' is prone to errors\n";
232 print " as it will incorrectly split if '\@' is the firt value in the quality string !!\n";
233 print ' It is recommended to use eg -n \'\@SRR301\''."\n";
234 print " Therefore, the delimiter was changed to the first 5 characters of the first forward fastq file.\n";
235 print " => Delimiter : $indelim\n";
236 print "\n\n";
237 sleep 3;
238 }
239 ## create queues
240 my $totrim = Thread::Queue->new();
241 my $toprint = Thread::Queue->new();
242 my $dummy = Thread::Queue->new();
243 my $failed = Thread::Queue->new();
244
245 # monitor thread
246 my $finish :shared;
247 $finish = 0;
248 $mon = threads->create('monitor');
249
250 # start FASTQ reading thread
251 if ($verbose == 1) {
252 print "start reading file\n";
253 }
254 $reading = threads->create('readfile');
255
256 ## give time to build reading buffer
257 sleep 10;
258
259 # start trimming threads
260 if ($verbose == 1) {
261 print "start trimming\n";
262 }
263 if ($paired == 1) {
264 if ($style eq 'bwa') {
265 $trimming1 = threads->create('bwapaired');
266 $trimming2 = threads->create('bwapaired');
267
268 }
269 else {
270 $trimming1 = threads->create('simplepaired');
271 $trimming2 = threads->create('simplepaired');
272 }
273
274 }
275 else {
276 if ($style eq 'bwa') {
277 #print "not available yet, switching to simple trimming\n";
278 #$style = 'simple';
279 $trimming1 = threads->create('bwasingle');
280 $trimming2 = threads->create('bwasingle');
281
282 }
283 if ($style eq 'simple') {
284 $trimming1 = threads->create('simplesingle');
285 $trimming2 = threads->create('simplesingle');
286 }
287 }
288 # start printing threads
289 $printout = threads->create('printout');
290 $printfail = threads->create('printfailed');
291 # end threads
292 $reading->join();
293 ## all reads are queued for trimming, add undef to trimming queues to end threads.
294 $totrim->enqueue(undef);
295 $totrim->enqueue(undef);
296 $trimming1->join();
297 $trimming2->join();
298 ## all reads are trimmed and queued from printing, send undef to end printing threads.
299 $toprint->enqueue(undef);
300 $failed->enqueue(undef);
301 $printout->join();
302 $printfail->join();
303 if ($verbose == 1) {
304 print "Waiting for status monitor to shut down\n";
305 }
306 $mon->join();
307 ## copy files from /tmp to destination folder
308 if ($verbose == 1) {
309 print "Copying files to destination: Forward..";
310 }
311 if ($fqz == 0) {
312 system("cp -f /tmp/$rand.fq $ofq");
313 unlink("/tmp/$rand.fq");
314 }
315 else {
316 system("gzip -c /tmp/$rand.fq > $ofq");
317 unlink("/tmp/$rand.fq");
318 }
319 if ($paired == 1) {
320 if ($verbose == 1) {
321 print " Reverse..";
322 }
323 if ($rqz == 0) {
324 system("cp -f /tmp/$rand.rq $orq");
325 unlink("/tmp/$rand.rq");
326 }
327 else {
328 system("gzip -c /tmp/$rand.rq > $orq");
329 unlink("/tmp/$rand.rq");
330 }
331 }
332
333 if ($verbose == 1) {
334 print " Failed items..";
335 }
336 system("cp -f /tmp/$rand.failed $failedfile");
337 unlink("/tmp/$rand.failed");
338 if ($verbose == 1) {
339 print " Done\n";
340 }
341
342 # remove lock file
343 unlink("/tmp/$rand.lock");
344
345 ########################
346 ## print run details: ##
347 ########################
348 my $string = sprintf("%.2f",$done/1000000)."M Reads in (each) FASTQ file processed.\n";
349 if ($paired == 1) {
350 $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M read pairs in output FASTQ's.\n";
351 $string .= sprintf("%.2f",$discarded/1000000)."M reads pairs discarded\n";
352 }
353 else {
354 $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M reads in output FASTQ.\n";
355 $string .= sprintf("%.2f",$discarded/1000000)."M reads discarded\n";
356 }
357 print $string;
358 ##################
359 # PRINT RUN-TIME #
360 ##################
361 $now = time - $now;
362 printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60));
363
364
365 exit();
366
367
368
369 ## SUBROUTINE : READ FASTQ FILES
370 ################################
371 sub readfile {
372 if ($paired == 1) {
373 if ($fqz == 0) {
374 open IN1, $fq;
375 }
376 else {
377 open(IN1, "gunzip -c $fq | ");
378 }
379 if ($rqz == 0) {
380 open IN2, $rq;
381 }
382 else {
383 open(IN2, "gunzip -c $rq | ");
384 }
385 $/ = "\n$indelim";
386 $deliml = length($indelim);
387 # compose pack ignore string
388 my $dx = '';
389 for (my $idx = 0; $idx<length($indelim); $idx++) {
390 $dx .= 'X';
391 }
392 my $f = <IN1>;
393 my $r = <IN2>;
394 # the reads contain the delimiter on the end (cf \n on regular file read line endings.)
395 # so we strip teh delimiter using pack, leaving just the \n.
396 # this means that in subsequent reads, we need to add the delimiter to the front of the read.
397 my $read = substr(pack("A*$dx",$f),$deliml) . $r; #pack("A*$dx",$r) ;#. '|';
398 my $counter = 1;
399 my $fr = '';
400 my $rr = '';
401 my $reads = '';
402 my $lastf = '';
403 while ($fr = <IN1>) {
404 $rr = <IN2>;
405 $lastf = $fr;
406 ## add delim to front, trim from back. add our delim.
407 ## done in next iteration, because last is different.
408 # the delimiter from the forward read is used to complete the reverse read.
409 $reads .= $indelim.pack("A*$dx",$read) .'|';
410 # concat reads
411 $read = $fr.$rr;
412 ## add string to queue if 20,000 reads in array
413 if ($counter == 20000) {
414 $nrin = $nrin + 20000;
415 $totrim->enqueue(pack("A*X",$reads)); # pack("A*X") strips last |
416 $reads = '';
417 $counter = 0;
418 }
419 $counter++;
420
421 ## dummy (see monitor).
422 my $d = $dummy->dequeue_nb();
423 }
424 # last reads in files need special care.
425 $reads .= $indelim.$lastf.$indelim.$rr;
426 close IN1;
427 close IN2;
428 # submit last reads to queue
429 if ($reads ne '') {
430 #$totrim->enqueue(pack("A*X",$reads));
431 $totrim->enqueue($reads);
432 }
433 if ($verbose == 1) {
434 print "All reads queued for trimming\n";
435 }
436 }
437 else {
438 if ($fqz == 0) {
439 open IN1, $fq;
440 }
441 else {
442 open(IN1, "gunzip -c $fq | ");
443 }
444 $/ = "\n$indelim";
445 $deliml = length($indelim);
446 # compose pack ignore string
447 my $dx = '';
448 for (my $idx = 0; $idx<length($indelim); $idx++) {
449 $dx .= 'X';
450 }
451 my $f = <IN1>;
452 # the reads contain the delimiter on the end (cf \n on regular file read line endings.)
453 # so we strip teh delimiter using pack, leaving just the \n.
454 # this means that in subsequent reads, we need to add the delimiter to the front of the read.
455 my $read = substr($f,$deliml);
456 my $counter = 1;
457 my $fr = '';
458 my $reads = '';
459 while ($fr = <IN1>) {
460 ## add delim to front, trim from back. add our delim.
461 ## done in next iteration, because last is different.
462 $reads .= $indelim.pack("A*$dx",$read) .'|';
463 # concat reads
464 $read = $fr;
465 ## add string to queue if 10,000 reads in array
466 if ($counter == 40000) {
467 $nrin = $nrin + 40000;
468 $totrim->enqueue(pack("A*X",$reads));
469 $reads = '';
470 $counter = 0;
471 }
472 $counter++;
473
474 ## dummy (see monitor).
475 my $d = $dummy->dequeue_nb();
476 }
477 $reads .= $indelim.$read;
478 close IN1;
479 # submit last reads to queue
480 if ($reads ne '') {
481 #$totrim->enqueue(pack("A*X",$reads));
482 $totrim->enqueue($reads);
483 }
484 if ($verbose == 1) {
485 print "All reads queued for trimming\n";
486 }
487
488 }
489 }
490
491 sub bwapaired {
492 my $tq = $trimq;
493 while ( defined(my $rstring = $totrim->dequeue())) {
494 my @reads = split(/\|/,$rstring); # each item has 20.000 reads
495 my $printstring = '';
496 my $failedstring = '';
497 foreach (@reads) {
498 my $read = $_;
499 my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read);
500 ## skip too short reads
501 if (length($q1) < $minlength || length($q2) < $minlength) {
502 $discarded++;
503 $failedstring .= $read ;#. '|';
504 next;
505 }
506 # initial check for maximal quality value.
507 ## quality arrays
508 my @q1array = unpack("C*",$q1);
509 if (max(@q1array) < $tq + 33) {
510 #print "Discarding on failed forward qualities: ".max(@q1array). " vs $tq + 33)\n$read\n";
511 $discarded++;
512 $failedstring .= $read ;#. '|';
513 next;
514 }
515 my @q2array = unpack("C*",$q2);
516 if (max(@q2array) < $trimq + 33) {
517 #print "Discarding on failed reverse qualities\n$read\n";
518 $discarded++;
519 $failedstring .= $read ;#. '|';
520 next;
521 }
522
523 #############
524 ## Trim 3' ##
525 #############
526 if ($trim3 == 1) {
527 #trim first
528 my $pos = length($q1) - 1;
529 my $maxPos = $pos;
530 ## skip empty read.
531 #if ($pos <= 0) {
532 # $discarded++;
533 # $failedstring .= $read . '|';
534 # #print "skip empty : \n$read\n";
535 # next;
536 #}
537 my $area = 0;
538 my $maxArea = 0;
539 ## unpack qual-string to array of phred scores
540 #my @qarray = unpack("C*",$q1); # use from initial check.
541 while ($pos>0 && $area>=0) {
542 $area += $tq - ($q1array[($pos)] - 33);
543 ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read.
544 if ($area < $maxArea) {
545 $pos--;
546 next;
547 }
548 else {
549 $maxArea = $area;
550 $maxPos = $pos;
551 $pos--;
552 next;
553 }
554 }
555 if ($maxPos==0) { # scanned whole read and didn't integrate to maximum? skip (no output for this read)
556 $discarded++;
557 $failedstring .= $read ;#. '|';
558 next;
559 }
560 # otherwise trim before position where area reached a maximum
561 # $maxPos as length of substr gives positions zero to just before the maxpos.
562 #$s1 = substr($s1,0,$maxPos);
563 #$q1 = substr($q1,0,$maxPos);
564 $s1 = unpack("A$maxPos",$s1);
565 $q1 = unpack("A$maxPos",$q1);
566
567 #trim second
568 $pos = length($q2);
569 $maxPos = $pos;
570 ## skip empty read.
571 #if ($pos <= 0) {
572 # $discarded++;
573 # $failedstring .= $read . '|';
574 # next;
575 #}
576 $area = 0;
577 $maxArea = 0;
578 #@qarray = unpack("C*",$q2);
579 while ($pos>0 && $area>=0) {
580 $area += $tq - ($q2array[($pos)] - 33);
581 if ($area < $maxArea) {
582 $pos--;
583 next;
584 }
585 else {
586 $maxArea = $area;
587 $maxPos = $pos;
588 $pos--;
589 next;
590 }
591 }
592 if ($maxPos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
593 $discarded++;
594 $failedstring .= $read ;#. '|';
595 next;
596 }
597 # Otherwise trim before position where area reached a maximum
598 $s2 = unpack("A$maxPos",$s2);
599 $q2 = unpack("A$maxPos",$q2);
600 #$s2 = substr($s2,0,$maxPos);
601 #$q2 = substr($q2,0,$maxPos);
602
603 }
604 #############
605 ## TRIM 5' ##
606 #############
607 if ($trim5 == 1) {
608 #trim first
609 my $skip = 0; #length($q1);
610 my $maxPos = $skip;
611 my $area = 0;
612 my $maxArea = 0;
613 my @qarray = unpack("C*",$q1);
614 while ($skip<length($q1) && $area>=0) {
615 $area += $tq - ($qarray[($skip)] - 33);
616 if ($area < $maxArea) {
617 $skip++;
618 next;
619 }
620 else {
621 $maxArea = $area;
622 $maxPos = $skip;
623 $skip++;
624 next;
625 }
626 }
627 if ($maxPos==length($q1)- 1) { # => maximal value at end of string => no point of no return found.
628 $discarded++;
629 $failedstring .= $read ;#. '|';
630 next;
631 }
632 # trim after position where area reached a maximum
633 #$s1 = substr($s1,($maxPos+1));
634 #$q1 = substr($q1,($maxPos+1));
635 $s1 = unpack("x$maxPos A*",$s1);
636 $q1 = unpack("x$maxPos A*",$q1);
637
638
639 #trim second
640 $skip = 0 ;#length($q2);
641 $maxPos = $skip;
642 $area = 0;
643 $maxArea = 0;
644 @qarray = unpack("C*",$q2);
645
646 while ($skip < length($q2) && $area>=0) {
647 $area += $tq - ($qarray[($skip)] - 33);
648
649 if ($area < $maxArea) {
650 $skip++;
651 next;
652 }
653 else {
654 $maxArea = $area;
655 $maxPos = $skip;
656 $skip++;
657 next;
658 }
659 }
660 if ($maxPos==(length($q2)-1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
661 $discarded++;
662 $failedstring .= $read ;#. '|';
663 next;
664 }
665 # trim after position where area reached a maximum
666 #$s2 = substr($s2,($maxPos+1));
667 #$q2 = substr($q2,($maxPos+1));
668 $s2 = unpack("x$maxPos A*",$s2);
669 $q2 = unpack("x$maxPos A*",$q2);
670 }
671
672 ## DISCARD IF TOO SHORT ##
673 if (length($s1) < $minlength || length($s2) < $minlength) {
674 $discarded++;
675 $failedstring .= $read ;#. '|';
676 next;
677
678 }
679 # queue for printing
680 my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2);
681 $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator
682 # | seperates pairs of reads; ~ seperates lines of single read entry.
683 #print "$printstring\n";
684 }
685
686 if ($printstring ne '') {
687 $toprint->enqueue(pack("A*X",$printstring)); # removes trailing |
688 #$toprint->enqueue($printstring);
689 }
690 if ($failedstring ne '') {
691 #$failed->enqueue(pack("A*X",$failedstring));
692 $failed->enqueue($failedstring);
693 }
694
695
696 }
697 }
698 sub simplepaired {
699 # local variables speed up threading
700 my $tq = $trimq;
701 while ( defined(my $rstring = $totrim->dequeue())){
702 my @reads = split(/\|/,$rstring);
703 my $printstring = '';
704 my $failedstring = '';
705 foreach (@reads) {
706 my $read = $_;
707 my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read);
708 ## skip too short reads
709 if (length($q1) < $minlength || length($q2) < $minlength) {
710 $discarded++;
711 $failedstring .= $read; # . '|';
712 next;
713 }
714
715 # initial check for maximal quality value.
716 ## quality arrays
717 my @q1array = unpack("C*",$q1);
718 if (max(@q1array) < $trimq + 33) {
719 #print "Discarding on failed forward qualities\n$read\n";
720 $discarded++;
721 $failedstring .= $read;# . '|';
722 next;
723 }
724 my @q2array = unpack("C*",$q2);
725 if (max(@q2array) < $trimq + 33) {
726 #print "Discarding on failed reverse qualities\n$read\n";
727 $discarded++;
728 $failedstring .= $read;# . '|';
729 next;
730 }
731
732 #############
733 ## trim 3' ##
734 #############
735 if ($trim3 == 1) {
736 #trim first
737 my $pos = length($q1) - 1;
738 my $maxPos = $pos;
739 #if ($pos <= 0) {
740 # $discarded++;
741 # $failedstring .= $read . '|';
742 # next;
743 #}
744 #my @qarray = unpack("C*",$q1);
745 while ( $pos>0 ) {
746 if ($tq > $q1array[$pos] - 33) {
747 # score at position is below threshold
748 $pos--;
749 next;
750 }
751 else {
752 # score is at or above threshold
753 $maxPos = $pos;
754 last;
755 }
756 }
757 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
758 $discarded++;
759 $failedstring .= $read ;#. '|';
760 next;
761 }
762 # integrated to zero? trim before position where quality was below threshold
763 # unpack is one based
764 #$maxPos++;
765 #$s1 = substr($s1,0,$maxPos);
766 $s1 = unpack("A$maxPos",$s1);
767 #$q1 = substr($q1,0,$maxPos);
768 $q1 = unpack("A$maxPos",$q1);
769
770 #trim second
771 $pos = length($q2) -1;
772 $maxPos = $pos;
773 if ($pos <= 0) {
774 $discarded++;
775 $failedstring .= $read ;#. '|';
776 next;
777 }
778 #my @qarray = unpack("C*",$q2);
779
780 while ($pos>0) {
781 if ($tq > $q2array[$pos] - 33) {
782 # score at position is below threshold
783 $pos--;
784 next;
785 }
786 else {
787 # score is at or above threshold
788 $maxPos = $pos;
789 last;
790 }
791 }
792 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
793 $discarded++;
794 $failedstring .= $read ;#. '|';
795 next;
796 }
797 # integrated to zero? trim before position where score below threshold
798 #$maxPos++;
799 $s2 = unpack("A$maxPos",$s2);
800 $q2 = unpack("A$maxPos",$q2);
801 #$s2 = substr($s2,0,$maxPos);
802 #$q2 = substr($q2,0,$maxPos);
803
804 }
805 #############
806 ## trim 5' ##
807 #############
808 if ($trim5 == 1) {
809 # first
810 my $skip = 0; #length($q1);
811 #my $maxPos = $skip;
812 my @qarray = unpack("C*",$q1);
813 while ( $skip < length($q1) ) {
814 if ($tq > $qarray[$skip] - 33) {
815 # score at position is below threshold
816 $skip++;
817 next;
818 }
819 else {
820 # score is at or above threshold
821 #$maxPos = $skip;
822 last;
823 }
824 }
825 if ($skip == length($q1)) { # should not fail, as that would have happened on 3' trim
826 $discarded++;
827 $failedstring .= $read ;#. '|';
828 next;
829 }
830 # integrated to zero? trim before position where quality was below threshold
831 # unpack is one based
832 $s1 = unpack("x$skip A*",$s1);
833 $q1 = unpack("x$skip A*",$q1);
834 #$s1 = substr($s1,$skip);
835 #$q1 = substr($q1,$skip);
836
837 # second
838 $skip = 0; #length($q1);
839 my @qarray = unpack("C*",$q2);
840 #$maxPos = $skip;
841 while ( $skip < length($q2) ) {
842 if ($tq > $qarray[$skip] - 33) {
843 # score at position is below threshold
844 $skip++;
845 next;
846 }
847 else {
848 # score is at or above threshold
849 #$maxPos = $skip;
850 last;
851 }
852 }
853 if ($skip == length($q2)) { # should not fail, as that would have happened on 3' trim
854 $discarded++;
855 $failedstring .= $read ;#. '|';
856 next;
857 }
858 # integrated to zero? trim before position where quality was below threshold
859 # unpack is one based
860 $s2 = unpack("x$skip A*",$s2);
861 $q2 = unpack("x$skip A*",$q2);
862 #$s2 = substr($s2,$skip);
863 #$q2 = substr($q2,$skip);
864
865 }
866 ## DISCARD IF TOO SHORT ##
867 if (length($s1) < $minlength || length($s2) < $minlength) {
868 $discarded++;
869 $failedstring .= $read .#. '|';
870 next;
871
872 }
873
874 # queue for printing
875 my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2);
876 $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator
877 # | seperates pairs of reads; ~ seperates lines of single read entry.
878 }
879 if ($printstring ne '') {
880 $toprint->enqueue(pack("A*X",$printstring)); # removes trailing |
881 #$toprint->enqueue($printstring);
882 }
883 if ($failedstring ne '') {
884 #$failed->enqueue(pack("A*X",$failedstring));
885 $failed->enqueue($failedstring);
886 }
887 }
888 }
889
890 sub bwasingle {
891 # local variables speed up threading
892 my $tq = $trimq;
893 while ( defined(my $rstring = $totrim->dequeue())){
894 my @reads = split(/\|/,$rstring);
895 my $printstring = '';
896 my $failedstring = '';
897 foreach (@reads) {
898 my $read = $_;
899 my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read);
900 ## skip too short reads
901 if (length($q1) < $minlength ) {
902
903 $discarded++;
904 $failedstring .= $read ;#. '|';
905 next;
906 }
907 # initial check for maximal quality value.
908 ## quality arrays
909 my @q1array = unpack("C*",$q1);
910 if (max(@q1array) < $tq + 33) {
911 print "Discarding on failed qualities\n$read\n";
912 $discarded++;
913 $failedstring .= $read;# . '|';
914 next;
915 }
916
917 #############
918 ## trim 3' ##
919 #############
920 if ($trim3 == 1) {
921 my $pos = length($q1) - 1;
922 my $maxPos = $pos;
923 ## skip empty read.
924 #if ($pos <= 0) {
925 # $discarded++;
926 # $failedstring .= $read . '|';
927 # next;
928 #}
929 my $area = 0;
930 my $maxArea = 0;
931 #my @qarray = unpack("C*",$q1);
932 while ($pos>0 && $area>=0) {
933 $area += $tq - ($q1array[($pos)] - 33);
934 ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read.
935 if ($area < $maxArea) {
936 $pos--;
937 next;
938 }
939 else {
940 $maxArea = $area;
941 $maxPos = $pos;
942 $pos--;
943 next;
944 }
945 }
946 ## The online perl approach is wrong if the high-quality part is not long enough to reach 0.
947 if ($maxPos == 0) { # maximal badness on position 0 => no point of no return reached.
948 $discarded++;
949 $failedstring .= $read ;#. '|';
950 next;
951 }
952 # otherwise trim before position where area reached a maximum
953 # $maxPos as length of substr gives positions zero to just before the maxpos.
954 $s1 = unpack("A$maxPos",$s1);
955 $q1 = unpack("A$maxPos",$q1);
956
957 }
958 #############
959 ## trim 5' ##
960 #############
961 if ($trim5 == 1) {
962 my $skip = 0; #length($q1);
963 my $maxPos = $skip;
964 my $area = 0;
965 my $maxArea = 0;
966 my @qarray = unpack("C*",$q1);
967 while ($skip<length($q1) && $area>=0) {
968 #$area += $tq - (ord(unpack("x$skip A1",$q1)) - 33);
969 $area += $tq - ($qarray[($skip)] - 33);
970 if ($area < $maxArea) {
971 $skip++;
972 next;
973 }
974 else {
975 $maxArea = $area;
976 $maxPos = $skip;
977 $skip++;
978 next;
979 }
980 }
981 #if ($skip==length($q1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read
982 if ($maxPos == (length($q1)-1)) { # => maximal value at end of string => no point of no return found.
983 $discarded++;
984 $failedstring .= $read ;#. '|';
985 next;
986 }
987 # trim after position where area reached a maximum
988 $s1 = unpack("x$maxPos A*",$s1);
989 $q1 = unpack("x$maxPos A*",$q1);
990 #$s1 = substr($s1,($maxPos+1));
991 #$q1 = substr($q1,($maxPos+1));
992 }
993 ## DISCARD IF TOO SHORT ##
994 if (length($s1) < $minlength) {
995 $discarded++;
996 $failedstring .= $read ;#. '|';
997 next;
998
999 }
1000
1001 # queue for printing
1002 my @arr = ($h1a,$s1,$h1b,$q1);
1003 $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing
1004 }
1005 if ($printstring ne '') {
1006 #$toprint->enqueue(pack("A*X",$printstring)); # removes trailing |
1007 $toprint->enqueue($printstring);
1008 }
1009 if ($failedstring ne '') {
1010 #$failed->enqueue(pack("A*X",$failedstring));
1011 $failed->enqueue($failedstring);
1012 }
1013
1014 }
1015 }
1016
1017
1018 sub simplesingle {
1019 # local variables speed up threading
1020 my $tq = $trimq;
1021 while ( defined(my $rstring = $totrim->dequeue())){
1022 my @reads = split(/\|/,$rstring);
1023 my $printstring = '';
1024 my $failedstring = '';
1025 foreach (@reads) {
1026 my $read = $_;
1027 my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read);
1028 ## skip too short reads
1029 if (length($q1) < $minlength ) {
1030 $discarded++;
1031 $failedstring .= $read ;#. '|';
1032 next;
1033 }
1034 # initial check for maximal quality value.
1035 ## quality arrays
1036 my @q1array = unpack("C*",$q1);
1037 if (max(@q1array) < $trimq + 33) {
1038 #print "Discarding on failed qualities\n$read\n";
1039 $discarded++;
1040 $failedstring .= $read ;#. '|';
1041 next;
1042 }
1043
1044 #############
1045 ## trim 3' ##
1046 #############
1047 if ($trim3 == 1) {
1048 my $pos = length($q1);
1049 my $maxPos = $pos;
1050 #if ($pos <= 0) {
1051 # $discarded++;
1052 # $failedstring .= $read . '|';
1053 # next;
1054 #}
1055 #my @qarray = unpack("C*",$q1);
1056
1057 while ( $pos>0 ) {
1058 if ($tq > $q1array[$pos] - 33) {
1059 # score at position is below threshold
1060 $pos--;
1061 next;
1062 }
1063 else {
1064 # score is at or above threshold
1065 $maxPos = $pos;
1066 last;
1067 }
1068 }
1069 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
1070 $discarded++;
1071 $failedstring .= $read ;#. '|';
1072 next;
1073 }
1074 # integrated to zero? trim before position where quality was below threshold
1075 # unpack is one based
1076 $s1 = unpack("A$maxPos",$s1);
1077 $q1 = unpack("A$maxPos",$q1);
1078 #$s1 = substr($s1,0,$maxPos);
1079 #$q1 = substr($q1,0,$maxPos);
1080
1081 }
1082 #############
1083 ## trim 5' ##
1084 #############
1085 if ($trim5 == 1) {
1086 my $skip = 0; #length($q1);
1087 #my $maxPos = $skip;
1088 my @qarray = unpack("C*",$q1);
1089
1090 while ( $skip < length($q1) ) {
1091 if ($tq > $qarray[$skip] - 33) {
1092 # score at position is below threshold
1093 $skip++;
1094 next;
1095 }
1096 else {
1097 # score is at or above threshold
1098 #$maxPos = $skip;
1099 last;
1100 }
1101 }
1102 if ($skip == length($q1)) {
1103 $discarded++;
1104 $failedstring .= $read ;#. '|';
1105 next;
1106 }
1107 # integrated to zero? trim before position where quality was below threshold
1108 # unpack is one based
1109 #$s1 = substr($s1,$skip);
1110 #$q1 = substr($q1,$skip);
1111 $s1 = unpack("x$skip A*",$s1);
1112 $q1 = unpack("x$skip A*",$q1);
1113
1114
1115 }
1116 ## DISCARD IF TOO SHORT ##
1117 if (length($s1) < $minlength) {
1118 $discarded++;
1119 $failedstring .= $read ;#. '|';
1120 next;
1121
1122 }
1123
1124
1125 # queue for printing
1126 my @arr = ($h1a,$s1,$h1b,$q1);
1127 $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing
1128 }
1129 if ($printstring ne '') {
1130 $toprint->enqueue($printstring); # single reads => no | present, just \n
1131 }
1132 if ($failedstring ne '') {
1133 #$failed->enqueue(pack("A*X",$failedstring));
1134 $failed->enqueue($failedstring);
1135 }
1136
1137 }
1138 }
1139
1140 sub printout {
1141 if ($rand eq '' || !defined($rand)) {
1142 die('Random value not passed on!');
1143 }
1144 if ($paired == 0) {
1145 my $counter = 0;
1146 my $string1 = '';
1147 if (-e "/tmp/$rand.fq") {
1148 unlink("/tmp/$rand.fq");
1149 }
1150 while (defined(my $string = $toprint->dequeue())) { # each item represents 10000 input reads (might be less by trimming)
1151 #my @array = split(/\|/,$printstring);
1152 #$done = $done + scalar(@array);
1153 $done = $done + 10000;
1154 $string1 .= $string;
1155 $counter++;
1156 #print batches of 500,000 reads
1157 if ($counter == 50) {
1158 open OUT1, ">>/tmp/$rand.fq";
1159 print OUT1 $string1;
1160 close OUT1;
1161 $string1 = '';
1162 #$string2 = '';
1163 $counter = 0;
1164 }
1165 }
1166 ## print last
1167 open OUT1, ">>/tmp/$rand.fq";
1168 print OUT1 $string1;
1169 close OUT1;
1170 $finish = 1;
1171
1172 }
1173 else {
1174 my $counter = 0;
1175 my $string1 = '';
1176 my $string2 = '';
1177 if (-e "/tmp/$rand.fq") {
1178 unlink("/tmp/$rand.fq");
1179 }
1180 if (-e "/tmp/$rand.fq") {
1181 unlink("/tmp/$rand.rq");
1182 }
1183 while (defined(my $printstring = $toprint->dequeue())) { # each item represents 10000 input reads
1184 my @array = split(/\|/,$printstring);
1185 $done = $done + scalar(@array);
1186 foreach (@array) {
1187 my @subarr = split(/\~/,$_);
1188 $string1 .= join("\n",@subarr[0..3])."\n";
1189 $string2 .= join("\n",@subarr[4..7])."\n";
1190 }
1191 $counter++;
1192 if ($counter == 50) {
1193 #print "Printing!\n";
1194 flock("/tmp/sg.write.lock",2);
1195 open OUT1, ">>/tmp/$rand.fq";
1196 print OUT1 $string1;
1197 close OUT1;
1198 open OUT2, ">>/tmp/$rand.rq";
1199 print OUT2 $string2;
1200 close OUT2;
1201 flock("/tmp/sg.write.lock",8);
1202 $string1 = '';
1203 $string2 = '';
1204 $counter = 0;
1205 }
1206 }
1207 ## print last
1208 flock("/tmp/sg.write.lock",2);
1209 open OUT1, ">>/tmp/$rand.fq";
1210 print OUT1 $string1;
1211 close OUT1;
1212 open OUT2, ">>/tmp/$rand.rq";
1213 print OUT2 $string2;
1214 close OUT2;
1215 flock("/tmp/sg.write.lock",8);
1216 $finish = 1;
1217 }
1218
1219 }
1220
1221 sub printfailed {
1222 if (-e "/tmp/$rand.failed") {
1223 unlink("/tmp/$rand.failed");
1224 }
1225 #open FAIL, ">/tmp/$rand.failed" or die("could not open file with failed reads, '/tmp/$rand.failed'");
1226 #FAIL->autoflush(1);
1227 my $nr = 0;
1228 my $string = '';
1229 while ( defined(my $rstring = $failed->dequeue())) {
1230 #my @array = split(/\|/,$rstring);
1231 #foreach (@array) {
1232 $string .= $rstring;
1233 #}
1234 $nr++;
1235 if ($nr == 50) {
1236 open FAIL, ">>/tmp/$rand.failed";
1237 print FAIL $string;
1238 close FAIL;
1239 $nr = 0;
1240 $string = '';
1241 }
1242 #close FAIL;
1243 }
1244 open FAIL, ">>/tmp/$rand.failed";
1245 print FAIL $string;
1246 close FAIL;
1247 }
1248 sub monitor {
1249 # checks the file read monitor
1250 my $counter = 0;
1251 while ($finish == 0) {
1252 $counter++;
1253 if ($totrim->pending() > 200) {
1254 lock($dummy);
1255 #print "Putting the file reading to sleep for 10 seconds.\n";
1256 sleep 5;
1257 }
1258 else {
1259 sleep 5;
1260 }
1261 if ($counter == 6) {
1262 if ($verbose == 1) {
1263 my $string = ($done/1000000)."M Reads Passed, ".($discarded/1000000)."M reads discarded (".$totrim->pending().".10^4 pending)\n";
1264 print $string ;
1265 }
1266 $counter = 0;
1267 }
1268
1269 }
1270 }