Mercurial > repos > geert-vandeweyer > fastq_qc_trimmer
changeset 1:fdbcc1aa4f01 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:21:15 -0500 |
parents | 548887c4227c |
children | 16a85df293d2 |
files | Paired_fastQ_trimmer.pl |
diffstat | 1 files changed, 1270 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Paired_fastQ_trimmer.pl Thu Feb 13 08:21:15 2014 -0500 @@ -0,0 +1,1270 @@ +#!/usr/bin/perl + +# load modules +use Getopt::Std; +use threads; +use Thread::Queue; +use threads::shared; +use File::Basename; +use List::Util qw(max); + +$now = time ; + +$|++; +# variables +my $paired :shared; +$paired = 0; +my $fq :shared; +my $rq :shared; +my $ofq :shared; +my $orq :shared; +my $trimq :shared; +my $failedfile :shared; +my $done :shared; + $done = 0; +my $nrin :shared; + $nrin = 0; + +my $rqz :shared; +my $fqz :shared; +my $rand :shared; +my $discarded :shared; + $discarded = 0; +my $verbose :shared; + $verbose = 1; +my $trim3 :shared; + $trim3 = 1; +my $trim5 :shared; + $trim5 = 1; +# create file lock in the tmp dir +$rand = int(rand(10000)); +while (-e "/tmp/$rand.lock") { + $rand = int(rand(10000)); +} +system("touch '/tmp/$rand.lock'"); +if (!-e "/tmp/sg.write.lock") { + system("touch '/tmp/sg.write.lock'"); + system("chmod 777 '/tmp/sg.write.lock'"); +} + +# opts +# i : path to forward read FASTQ file +# I : path to reverse read FASTQ file +# q : quality threshold in phred. +# n : string representing common read name parts +# s : trimming style : simple/bwa +# o : output file for forward reads +# O : output file for reverse reads +# F : output file with fully failed reads pairs. +# v : verbose: defaults to on (0/1) +# S : Trim on side: 5/3/b(oth) +# m : minimal length. +getopts('i:I:q:n:s:o:O:F:v:S:m:', \%opts) ; + + +if (defined($opts{'v'})) { + $verbose = $opts{'v'}; +} + +if (defined($opts{'s'})) { + if ($opts{'s'} eq 'bwa') { + if ($verbose == 1) { + print "Using BWA style trimming\n"; + } + $style = 'bwa'; + } + else { + if ($verbose == 1) { + print "Using simple 1bp Quality trimming\n"; + } + $style = 'simple'; + } +} +else { + if ($verbose == 1){ + print "Using simple 1bp Quality trimming\n"; + } + $style = 'simple'; +} + +if (!defined($opts{'i'})) { die('Forward reads fastq is mandatory');} +$fq = $opts{'i'}; +$ofq = $opts{'o'}; + +## gzipped infile? +my $out = `file $fq`; +if ($out =~ m/gzip compressed/) { + if ($verbose == 1) { + print "Forward reads in gzipped format.\n"; + } + $fqz = 1; +} +else { + if ($verbose == 1) { + print "Forward reads in plain fastq format.\n"; + } + $fqz = 0; +} + + +if (defined($opts{'I'})) { + if ($verbose == 1) { + print "Reverse reads provided. Performing paired-end trimming\n"; + } + $paired = 1; + $rq = $opts{'I'}; + $orq = $opts{'O'}; + # gzipped ? + my $out = `file $rq`; + if ($out =~ m/gzip compressed/) { + if ($verbose == 1) { + print "Reverse reads in gzipped format.\n"; + } + $rqz = 1; + } + else { + if ($verbose == 1) { + print "Reverse reads in plain fastq format.\n"; + } + $rqz = 0; + } +} +## FAILED FILENAME +$failedfile = $opts{'F'}; + +## set the sides to trim +my $trimside = ''; +if (defined($opts{'S'})) { + if ($opts{'S'} eq '3') { + $trimside = "Trimming reads from the 3' end only\n"; + $trim3 = 1; + $trim5 = 0; + } + elsif ($opts{'S'} eq '5') { + $trimside = "Trimming reads from the 5' end only\n"; + $trim3 = 0; + $trim5 = 1; + } + elsif ($opts{'S'} eq 'b') { + $trimside = "Trimming reads both from 5' and 3'\n"; + $trim3 = 1; + $trim5 = 1; + } + else { + $trimside = "Invalid trimming side specification. Setting to both sides trimming\n"; + $trim3 = 1; + $trim5 = 1; + } +} +else { + $trimside = "Invalid trimming side specification. Setting to both sides trimming\n"; + $trim3 = 1; + $trim5 = 1; +} +if ($verbose == 1) { + print $trimside; +} + +# SET MINIMAL LENGTH +my $minlength :shared; +if (exists($opts{'m'})) { + $minlength = $opts{'m'}; +} +else { + $minlength = 0; +} + +## CHECK IF OUTPUT FILESNAMES ARE PRESENT +if ($ofq eq '') { + $ofq = "$fq.$style.trimmed"; + if ($verbose == 1) { + print "No Forward output name specified. Forward reads will be printed to:\n\t$ofq\n"; + } +} +if ($paired == 1 && $orq eq '') { + $orq = "$rq.$style.trimmed"; + if ($verbose == 1) { + print "No Reverse output name specified. Reverse reads will be printed to:\n\t$orq\n"; + } +} +if ($failedfile eq '') { + $failedfile = "$fq.Failed_Reads"; + if ($verbose == 1) { + print "No failed ouput name specified. Failed reads will be printed to:\n\t$failedfile\n"; + } +} + +## set trimming threshold +if (!defined($opts{'q'}) or !($opts{'q'} =~ m/^[0-9]+$/)) { + $trimq = 20; +} +else { + $trimq = $opts{'q'}; +} + +## set read delimiter +if (defined($opts{'n'})) { + $indelim = $opts{'n'}; + # galaxy replaced @ by __at__ + if (substr($indelim,0,6) eq '__at__') { + $indelim = '@'.substr($indelim,6); + } +} +else { + print "No indelim defined\n"; + $indelim = "@"; # has risk, fails if @ is first qual item +} + +## NEW : REPLACE if indelim = @ by the first 5 chars of first line of forward-fastq. +if ($indelim eq '@') { + if ($fqz == 0) { + open IN, $fq; + } + else { + open(IN, "gunzip -c $fq | "); + } + $head = <IN>; + $indelim = substr($head,0,5); + print "\n"; + print "WARNING:\n"; + print "########\n"; + print " Setting Delimiter as '\@' is prone to errors\n"; + print " as it will incorrectly split if '\@' is the firt value in the quality string !!\n"; + print ' It is recommended to use eg -n \'\@SRR301\''."\n"; + print " Therefore, the delimiter was changed to the first 5 characters of the first forward fastq file.\n"; + print " => Delimiter : $indelim\n"; + print "\n\n"; + sleep 3; +} +## create queues +my $totrim = Thread::Queue->new(); +my $toprint = Thread::Queue->new(); +my $dummy = Thread::Queue->new(); +my $failed = Thread::Queue->new(); + +# monitor thread +my $finish :shared; +$finish = 0; +$mon = threads->create('monitor'); + +# start FASTQ reading thread +if ($verbose == 1) { + print "start reading file\n"; +} +$reading = threads->create('readfile'); + +## give time to build reading buffer +sleep 10; + +# start trimming threads +if ($verbose == 1) { + print "start trimming\n"; +} +if ($paired == 1) { + if ($style eq 'bwa') { + $trimming1 = threads->create('bwapaired'); + $trimming2 = threads->create('bwapaired'); + + } + else { + $trimming1 = threads->create('simplepaired'); + $trimming2 = threads->create('simplepaired'); + } + +} +else { + if ($style eq 'bwa') { + #print "not available yet, switching to simple trimming\n"; + #$style = 'simple'; + $trimming1 = threads->create('bwasingle'); + $trimming2 = threads->create('bwasingle'); + + } + if ($style eq 'simple') { + $trimming1 = threads->create('simplesingle'); + $trimming2 = threads->create('simplesingle'); + } +} +# start printing threads +$printout = threads->create('printout'); +$printfail = threads->create('printfailed'); +# end threads +$reading->join(); +## all reads are queued for trimming, add undef to trimming queues to end threads. +$totrim->enqueue(undef); +$totrim->enqueue(undef); +$trimming1->join(); +$trimming2->join(); +## all reads are trimmed and queued from printing, send undef to end printing threads. +$toprint->enqueue(undef); +$failed->enqueue(undef); +$printout->join(); +$printfail->join(); +if ($verbose == 1) { + print "Waiting for status monitor to shut down\n"; +} +$mon->join(); +## copy files from /tmp to destination folder +if ($verbose == 1) { + print "Copying files to destination: Forward.."; +} +if ($fqz == 0) { + system("cp -f /tmp/$rand.fq $ofq"); + unlink("/tmp/$rand.fq"); +} +else { + system("gzip -c /tmp/$rand.fq > $ofq"); + unlink("/tmp/$rand.fq"); +} +if ($paired == 1) { + if ($verbose == 1) { + print " Reverse.."; + } + if ($rqz == 0) { + system("cp -f /tmp/$rand.rq $orq"); + unlink("/tmp/$rand.rq"); + } + else { + system("gzip -c /tmp/$rand.rq > $orq"); + unlink("/tmp/$rand.rq"); + } +} + +if ($verbose == 1) { + print " Failed items.."; +} +system("cp -f /tmp/$rand.failed $failedfile"); +unlink("/tmp/$rand.failed"); +if ($verbose == 1) { + print " Done\n"; +} + +# remove lock file +unlink("/tmp/$rand.lock"); + +######################## +## print run details: ## +######################## +my $string = sprintf("%.2f",$done/1000000)."M Reads in (each) FASTQ file processed.\n"; +if ($paired == 1) { + $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M read pairs in output FASTQ's.\n"; + $string .= sprintf("%.2f",$discarded/1000000)."M reads pairs discarded\n"; +} +else { + $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M reads in output FASTQ.\n"; + $string .= sprintf("%.2f",$discarded/1000000)."M reads discarded\n"; +} +print $string; +################## +# PRINT RUN-TIME # +################## +$now = time - $now; +printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60)); + + +exit(); + + + +## SUBROUTINE : READ FASTQ FILES +################################ +sub readfile { + if ($paired == 1) { + if ($fqz == 0) { + open IN1, $fq; + } + else { + open(IN1, "gunzip -c $fq | "); + } + if ($rqz == 0) { + open IN2, $rq; + } + else { + open(IN2, "gunzip -c $rq | "); + } + $/ = "\n$indelim"; + $deliml = length($indelim); + # compose pack ignore string + my $dx = ''; + for (my $idx = 0; $idx<length($indelim); $idx++) { + $dx .= 'X'; + } + my $f = <IN1>; + my $r = <IN2>; + # the reads contain the delimiter on the end (cf \n on regular file read line endings.) + # so we strip teh delimiter using pack, leaving just the \n. + # this means that in subsequent reads, we need to add the delimiter to the front of the read. + my $read = substr(pack("A*$dx",$f),$deliml) . $r; #pack("A*$dx",$r) ;#. '|'; + my $counter = 1; + my $fr = ''; + my $rr = ''; + my $reads = ''; + my $lastf = ''; + while ($fr = <IN1>) { + $rr = <IN2>; + $lastf = $fr; + ## add delim to front, trim from back. add our delim. + ## done in next iteration, because last is different. + # the delimiter from the forward read is used to complete the reverse read. + $reads .= $indelim.pack("A*$dx",$read) .'|'; + # concat reads + $read = $fr.$rr; + ## add string to queue if 20,000 reads in array + if ($counter == 20000) { + $nrin = $nrin + 20000; + $totrim->enqueue(pack("A*X",$reads)); # pack("A*X") strips last | + $reads = ''; + $counter = 0; + } + $counter++; + + ## dummy (see monitor). + my $d = $dummy->dequeue_nb(); + } + # last reads in files need special care. + $reads .= $indelim.$lastf.$indelim.$rr; + close IN1; + close IN2; + # submit last reads to queue + if ($reads ne '') { + #$totrim->enqueue(pack("A*X",$reads)); + $totrim->enqueue($reads); + } + if ($verbose == 1) { + print "All reads queued for trimming\n"; + } + } + else { + if ($fqz == 0) { + open IN1, $fq; + } + else { + open(IN1, "gunzip -c $fq | "); + } + $/ = "\n$indelim"; + $deliml = length($indelim); + # compose pack ignore string + my $dx = ''; + for (my $idx = 0; $idx<length($indelim); $idx++) { + $dx .= 'X'; + } + my $f = <IN1>; + # the reads contain the delimiter on the end (cf \n on regular file read line endings.) + # so we strip teh delimiter using pack, leaving just the \n. + # this means that in subsequent reads, we need to add the delimiter to the front of the read. + my $read = substr($f,$deliml); + my $counter = 1; + my $fr = ''; + my $reads = ''; + while ($fr = <IN1>) { + ## add delim to front, trim from back. add our delim. + ## done in next iteration, because last is different. + $reads .= $indelim.pack("A*$dx",$read) .'|'; + # concat reads + $read = $fr; + ## add string to queue if 10,000 reads in array + if ($counter == 40000) { + $nrin = $nrin + 40000; + $totrim->enqueue(pack("A*X",$reads)); + $reads = ''; + $counter = 0; + } + $counter++; + + ## dummy (see monitor). + my $d = $dummy->dequeue_nb(); + } + $reads .= $indelim.$read; + close IN1; + # submit last reads to queue + if ($reads ne '') { + #$totrim->enqueue(pack("A*X",$reads)); + $totrim->enqueue($reads); + } + if ($verbose == 1) { + print "All reads queued for trimming\n"; + } + + } +} + +sub bwapaired { + my $tq = $trimq; + while ( defined(my $rstring = $totrim->dequeue())) { + my @reads = split(/\|/,$rstring); # each item has 20.000 reads + my $printstring = ''; + my $failedstring = ''; + foreach (@reads) { + my $read = $_; + my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read); + ## skip too short reads + if (length($q1) < $minlength || length($q2) < $minlength) { + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # initial check for maximal quality value. + ## quality arrays + my @q1array = unpack("C*",$q1); + if (max(@q1array) < $tq + 33) { + #print "Discarding on failed forward qualities: ".max(@q1array). " vs $tq + 33)\n$read\n"; + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + my @q2array = unpack("C*",$q2); + if (max(@q2array) < $trimq + 33) { + #print "Discarding on failed reverse qualities\n$read\n"; + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + + ############# + ## Trim 3' ## + ############# + if ($trim3 == 1) { + #trim first + my $pos = length($q1) - 1; + my $maxPos = $pos; + ## skip empty read. + #if ($pos <= 0) { + # $discarded++; + # $failedstring .= $read . '|'; + # #print "skip empty : \n$read\n"; + # next; + #} + my $area = 0; + my $maxArea = 0; + ## unpack qual-string to array of phred scores + #my @qarray = unpack("C*",$q1); # use from initial check. + while ($pos>0 && $area>=0) { + $area += $tq - ($q1array[($pos)] - 33); + ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read. + if ($area < $maxArea) { + $pos--; + next; + } + else { + $maxArea = $area; + $maxPos = $pos; + $pos--; + next; + } + } + if ($maxPos==0) { # scanned whole read and didn't integrate to maximum? skip (no output for this read) + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # otherwise trim before position where area reached a maximum + # $maxPos as length of substr gives positions zero to just before the maxpos. + #$s1 = substr($s1,0,$maxPos); + #$q1 = substr($q1,0,$maxPos); + $s1 = unpack("A$maxPos",$s1); + $q1 = unpack("A$maxPos",$q1); + + #trim second + $pos = length($q2); + $maxPos = $pos; + ## skip empty read. + #if ($pos <= 0) { + # $discarded++; + # $failedstring .= $read . '|'; + # next; + #} + $area = 0; + $maxArea = 0; + #@qarray = unpack("C*",$q2); + while ($pos>0 && $area>=0) { + $area += $tq - ($q2array[($pos)] - 33); + if ($area < $maxArea) { + $pos--; + next; + } + else { + $maxArea = $area; + $maxPos = $pos; + $pos--; + next; + } + } + if ($maxPos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # Otherwise trim before position where area reached a maximum + $s2 = unpack("A$maxPos",$s2); + $q2 = unpack("A$maxPos",$q2); + #$s2 = substr($s2,0,$maxPos); + #$q2 = substr($q2,0,$maxPos); + + } + ############# + ## TRIM 5' ## + ############# + if ($trim5 == 1) { + #trim first + my $skip = 0; #length($q1); + my $maxPos = $skip; + my $area = 0; + my $maxArea = 0; + my @qarray = unpack("C*",$q1); + while ($skip<length($q1) && $area>=0) { + $area += $tq - ($qarray[($skip)] - 33); + if ($area < $maxArea) { + $skip++; + next; + } + else { + $maxArea = $area; + $maxPos = $skip; + $skip++; + next; + } + } + if ($maxPos==length($q1)- 1) { # => maximal value at end of string => no point of no return found. + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # trim after position where area reached a maximum + #$s1 = substr($s1,($maxPos+1)); + #$q1 = substr($q1,($maxPos+1)); + $s1 = unpack("x$maxPos A*",$s1); + $q1 = unpack("x$maxPos A*",$q1); + + + #trim second + $skip = 0 ;#length($q2); + $maxPos = $skip; + $area = 0; + $maxArea = 0; + @qarray = unpack("C*",$q2); + + while ($skip < length($q2) && $area>=0) { + $area += $tq - ($qarray[($skip)] - 33); + + if ($area < $maxArea) { + $skip++; + next; + } + else { + $maxArea = $area; + $maxPos = $skip; + $skip++; + next; + } + } + if ($maxPos==(length($q2)-1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read) + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # trim after position where area reached a maximum + #$s2 = substr($s2,($maxPos+1)); + #$q2 = substr($q2,($maxPos+1)); + $s2 = unpack("x$maxPos A*",$s2); + $q2 = unpack("x$maxPos A*",$q2); + } + + ## DISCARD IF TOO SHORT ## + if (length($s1) < $minlength || length($s2) < $minlength) { + $discarded++; + $failedstring .= $read ;#. '|'; + next; + + } + # queue for printing + my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2); + $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator + # | seperates pairs of reads; ~ seperates lines of single read entry. + #print "$printstring\n"; + } + + if ($printstring ne '') { + $toprint->enqueue(pack("A*X",$printstring)); # removes trailing | + #$toprint->enqueue($printstring); + } + if ($failedstring ne '') { + #$failed->enqueue(pack("A*X",$failedstring)); + $failed->enqueue($failedstring); + } + + + } +} +sub simplepaired { + # local variables speed up threading + my $tq = $trimq; + while ( defined(my $rstring = $totrim->dequeue())){ + my @reads = split(/\|/,$rstring); + my $printstring = ''; + my $failedstring = ''; + foreach (@reads) { + my $read = $_; + my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read); + ## skip too short reads + if (length($q1) < $minlength || length($q2) < $minlength) { + $discarded++; + $failedstring .= $read; # . '|'; + next; + } + + # initial check for maximal quality value. + ## quality arrays + my @q1array = unpack("C*",$q1); + if (max(@q1array) < $trimq + 33) { + #print "Discarding on failed forward qualities\n$read\n"; + $discarded++; + $failedstring .= $read;# . '|'; + next; + } + my @q2array = unpack("C*",$q2); + if (max(@q2array) < $trimq + 33) { + #print "Discarding on failed reverse qualities\n$read\n"; + $discarded++; + $failedstring .= $read;# . '|'; + next; + } + + ############# + ## trim 3' ## + ############# + if ($trim3 == 1) { + #trim first + my $pos = length($q1) - 1; + my $maxPos = $pos; + #if ($pos <= 0) { + # $discarded++; + # $failedstring .= $read . '|'; + # next; + #} + #my @qarray = unpack("C*",$q1); + while ( $pos>0 ) { + if ($tq > $q1array[$pos] - 33) { + # score at position is below threshold + $pos--; + next; + } + else { + # score is at or above threshold + $maxPos = $pos; + last; + } + } + if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # integrated to zero? trim before position where quality was below threshold + # unpack is one based + #$maxPos++; + #$s1 = substr($s1,0,$maxPos); + $s1 = unpack("A$maxPos",$s1); + #$q1 = substr($q1,0,$maxPos); + $q1 = unpack("A$maxPos",$q1); + + #trim second + $pos = length($q2) -1; + $maxPos = $pos; + if ($pos <= 0) { + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + #my @qarray = unpack("C*",$q2); + + while ($pos>0) { + if ($tq > $q2array[$pos] - 33) { + # score at position is below threshold + $pos--; + next; + } + else { + # score is at or above threshold + $maxPos = $pos; + last; + } + } + if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # integrated to zero? trim before position where score below threshold + #$maxPos++; + $s2 = unpack("A$maxPos",$s2); + $q2 = unpack("A$maxPos",$q2); + #$s2 = substr($s2,0,$maxPos); + #$q2 = substr($q2,0,$maxPos); + + } + ############# + ## trim 5' ## + ############# + if ($trim5 == 1) { + # first + my $skip = 0; #length($q1); + #my $maxPos = $skip; + my @qarray = unpack("C*",$q1); + while ( $skip < length($q1) ) { + if ($tq > $qarray[$skip] - 33) { + # score at position is below threshold + $skip++; + next; + } + else { + # score is at or above threshold + #$maxPos = $skip; + last; + } + } + if ($skip == length($q1)) { # should not fail, as that would have happened on 3' trim + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # integrated to zero? trim before position where quality was below threshold + # unpack is one based + $s1 = unpack("x$skip A*",$s1); + $q1 = unpack("x$skip A*",$q1); + #$s1 = substr($s1,$skip); + #$q1 = substr($q1,$skip); + + # second + $skip = 0; #length($q1); + my @qarray = unpack("C*",$q2); + #$maxPos = $skip; + while ( $skip < length($q2) ) { + if ($tq > $qarray[$skip] - 33) { + # score at position is below threshold + $skip++; + next; + } + else { + # score is at or above threshold + #$maxPos = $skip; + last; + } + } + if ($skip == length($q2)) { # should not fail, as that would have happened on 3' trim + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # integrated to zero? trim before position where quality was below threshold + # unpack is one based + $s2 = unpack("x$skip A*",$s2); + $q2 = unpack("x$skip A*",$q2); + #$s2 = substr($s2,$skip); + #$q2 = substr($q2,$skip); + + } + ## DISCARD IF TOO SHORT ## + if (length($s1) < $minlength || length($s2) < $minlength) { + $discarded++; + $failedstring .= $read .#. '|'; + next; + + } + + # queue for printing + my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2); + $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator + # | seperates pairs of reads; ~ seperates lines of single read entry. + } + if ($printstring ne '') { + $toprint->enqueue(pack("A*X",$printstring)); # removes trailing | + #$toprint->enqueue($printstring); + } + if ($failedstring ne '') { + #$failed->enqueue(pack("A*X",$failedstring)); + $failed->enqueue($failedstring); + } + } +} + +sub bwasingle { + # local variables speed up threading + my $tq = $trimq; + while ( defined(my $rstring = $totrim->dequeue())){ + my @reads = split(/\|/,$rstring); + my $printstring = ''; + my $failedstring = ''; + foreach (@reads) { + my $read = $_; + my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read); + ## skip too short reads + if (length($q1) < $minlength ) { + + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # initial check for maximal quality value. + ## quality arrays + my @q1array = unpack("C*",$q1); + if (max(@q1array) < $tq + 33) { + print "Discarding on failed qualities\n$read\n"; + $discarded++; + $failedstring .= $read;# . '|'; + next; + } + + ############# + ## trim 3' ## + ############# + if ($trim3 == 1) { + my $pos = length($q1) - 1; + my $maxPos = $pos; + ## skip empty read. + #if ($pos <= 0) { + # $discarded++; + # $failedstring .= $read . '|'; + # next; + #} + my $area = 0; + my $maxArea = 0; + #my @qarray = unpack("C*",$q1); + while ($pos>0 && $area>=0) { + $area += $tq - ($q1array[($pos)] - 33); + ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read. + if ($area < $maxArea) { + $pos--; + next; + } + else { + $maxArea = $area; + $maxPos = $pos; + $pos--; + next; + } + } + ## The online perl approach is wrong if the high-quality part is not long enough to reach 0. + if ($maxPos == 0) { # maximal badness on position 0 => no point of no return reached. + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # otherwise trim before position where area reached a maximum + # $maxPos as length of substr gives positions zero to just before the maxpos. + $s1 = unpack("A$maxPos",$s1); + $q1 = unpack("A$maxPos",$q1); + + } + ############# + ## trim 5' ## + ############# + if ($trim5 == 1) { + my $skip = 0; #length($q1); + my $maxPos = $skip; + my $area = 0; + my $maxArea = 0; + my @qarray = unpack("C*",$q1); + while ($skip<length($q1) && $area>=0) { + #$area += $tq - (ord(unpack("x$skip A1",$q1)) - 33); + $area += $tq - ($qarray[($skip)] - 33); + if ($area < $maxArea) { + $skip++; + next; + } + else { + $maxArea = $area; + $maxPos = $skip; + $skip++; + next; + } + } + #if ($skip==length($q1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read + if ($maxPos == (length($q1)-1)) { # => maximal value at end of string => no point of no return found. + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # trim after position where area reached a maximum + $s1 = unpack("x$maxPos A*",$s1); + $q1 = unpack("x$maxPos A*",$q1); + #$s1 = substr($s1,($maxPos+1)); + #$q1 = substr($q1,($maxPos+1)); + } + ## DISCARD IF TOO SHORT ## + if (length($s1) < $minlength) { + $discarded++; + $failedstring .= $read ;#. '|'; + next; + + } + + # queue for printing + my @arr = ($h1a,$s1,$h1b,$q1); + $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing + } + if ($printstring ne '') { + #$toprint->enqueue(pack("A*X",$printstring)); # removes trailing | + $toprint->enqueue($printstring); + } + if ($failedstring ne '') { + #$failed->enqueue(pack("A*X",$failedstring)); + $failed->enqueue($failedstring); + } + + } +} + + +sub simplesingle { + # local variables speed up threading + my $tq = $trimq; + while ( defined(my $rstring = $totrim->dequeue())){ + my @reads = split(/\|/,$rstring); + my $printstring = ''; + my $failedstring = ''; + foreach (@reads) { + my $read = $_; + my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read); + ## skip too short reads + if (length($q1) < $minlength ) { + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # initial check for maximal quality value. + ## quality arrays + my @q1array = unpack("C*",$q1); + if (max(@q1array) < $trimq + 33) { + #print "Discarding on failed qualities\n$read\n"; + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + + ############# + ## trim 3' ## + ############# + if ($trim3 == 1) { + my $pos = length($q1); + my $maxPos = $pos; + #if ($pos <= 0) { + # $discarded++; + # $failedstring .= $read . '|'; + # next; + #} + #my @qarray = unpack("C*",$q1); + + while ( $pos>0 ) { + if ($tq > $q1array[$pos] - 33) { + # score at position is below threshold + $pos--; + next; + } + else { + # score is at or above threshold + $maxPos = $pos; + last; + } + } + if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # integrated to zero? trim before position where quality was below threshold + # unpack is one based + $s1 = unpack("A$maxPos",$s1); + $q1 = unpack("A$maxPos",$q1); + #$s1 = substr($s1,0,$maxPos); + #$q1 = substr($q1,0,$maxPos); + + } + ############# + ## trim 5' ## + ############# + if ($trim5 == 1) { + my $skip = 0; #length($q1); + #my $maxPos = $skip; + my @qarray = unpack("C*",$q1); + + while ( $skip < length($q1) ) { + if ($tq > $qarray[$skip] - 33) { + # score at position is below threshold + $skip++; + next; + } + else { + # score is at or above threshold + #$maxPos = $skip; + last; + } + } + if ($skip == length($q1)) { + $discarded++; + $failedstring .= $read ;#. '|'; + next; + } + # integrated to zero? trim before position where quality was below threshold + # unpack is one based + #$s1 = substr($s1,$skip); + #$q1 = substr($q1,$skip); + $s1 = unpack("x$skip A*",$s1); + $q1 = unpack("x$skip A*",$q1); + + + } + ## DISCARD IF TOO SHORT ## + if (length($s1) < $minlength) { + $discarded++; + $failedstring .= $read ;#. '|'; + next; + + } + + + # queue for printing + my @arr = ($h1a,$s1,$h1b,$q1); + $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing + } + if ($printstring ne '') { + $toprint->enqueue($printstring); # single reads => no | present, just \n + } + if ($failedstring ne '') { + #$failed->enqueue(pack("A*X",$failedstring)); + $failed->enqueue($failedstring); + } + + } +} + +sub printout { + if ($rand eq '' || !defined($rand)) { + die('Random value not passed on!'); + } + if ($paired == 0) { + my $counter = 0; + my $string1 = ''; + if (-e "/tmp/$rand.fq") { + unlink("/tmp/$rand.fq"); + } + while (defined(my $string = $toprint->dequeue())) { # each item represents 10000 input reads (might be less by trimming) + #my @array = split(/\|/,$printstring); + #$done = $done + scalar(@array); + $done = $done + 10000; + $string1 .= $string; + $counter++; + #print batches of 500,000 reads + if ($counter == 50) { + open OUT1, ">>/tmp/$rand.fq"; + print OUT1 $string1; + close OUT1; + $string1 = ''; + #$string2 = ''; + $counter = 0; + } + } + ## print last + open OUT1, ">>/tmp/$rand.fq"; + print OUT1 $string1; + close OUT1; + $finish = 1; + + } + else { + my $counter = 0; + my $string1 = ''; + my $string2 = ''; + if (-e "/tmp/$rand.fq") { + unlink("/tmp/$rand.fq"); + } + if (-e "/tmp/$rand.fq") { + unlink("/tmp/$rand.rq"); + } + while (defined(my $printstring = $toprint->dequeue())) { # each item represents 10000 input reads + my @array = split(/\|/,$printstring); + $done = $done + scalar(@array); + foreach (@array) { + my @subarr = split(/\~/,$_); + $string1 .= join("\n",@subarr[0..3])."\n"; + $string2 .= join("\n",@subarr[4..7])."\n"; + } + $counter++; + if ($counter == 50) { + #print "Printing!\n"; + flock("/tmp/sg.write.lock",2); + open OUT1, ">>/tmp/$rand.fq"; + print OUT1 $string1; + close OUT1; + open OUT2, ">>/tmp/$rand.rq"; + print OUT2 $string2; + close OUT2; + flock("/tmp/sg.write.lock",8); + $string1 = ''; + $string2 = ''; + $counter = 0; + } + } + ## print last + flock("/tmp/sg.write.lock",2); + open OUT1, ">>/tmp/$rand.fq"; + print OUT1 $string1; + close OUT1; + open OUT2, ">>/tmp/$rand.rq"; + print OUT2 $string2; + close OUT2; + flock("/tmp/sg.write.lock",8); + $finish = 1; + } + +} + +sub printfailed { + if (-e "/tmp/$rand.failed") { + unlink("/tmp/$rand.failed"); + } + #open FAIL, ">/tmp/$rand.failed" or die("could not open file with failed reads, '/tmp/$rand.failed'"); + #FAIL->autoflush(1); + my $nr = 0; + my $string = ''; + while ( defined(my $rstring = $failed->dequeue())) { + #my @array = split(/\|/,$rstring); + #foreach (@array) { + $string .= $rstring; + #} + $nr++; + if ($nr == 50) { + open FAIL, ">>/tmp/$rand.failed"; + print FAIL $string; + close FAIL; + $nr = 0; + $string = ''; + } + #close FAIL; + } + open FAIL, ">>/tmp/$rand.failed"; + print FAIL $string; + close FAIL; +} +sub monitor { + # checks the file read monitor + my $counter = 0; + while ($finish == 0) { + $counter++; + if ($totrim->pending() > 200) { + lock($dummy); + #print "Putting the file reading to sleep for 10 seconds.\n"; + sleep 5; + } + else { + sleep 5; + } + if ($counter == 6) { + if ($verbose == 1) { + my $string = ($done/1000000)."M Reads Passed, ".($discarded/1000000)."M reads discarded (".$totrim->pending().".10^4 pending)\n"; + print $string ; + } + $counter = 0; + } + + } +}