changeset 1:fdbcc1aa4f01 draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:21:15 -0500
parents 548887c4227c
children 16a85df293d2
files Paired_fastQ_trimmer.pl
diffstat 1 files changed, 1270 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Paired_fastQ_trimmer.pl	Thu Feb 13 08:21:15 2014 -0500
@@ -0,0 +1,1270 @@
+#!/usr/bin/perl
+
+# load modules
+use Getopt::Std;
+use threads;
+use Thread::Queue;
+use threads::shared;
+use File::Basename;
+use List::Util qw(max);
+
+$now = time ;
+
+$|++;
+# variables 
+my $paired :shared;
+$paired = 0;
+my $fq :shared;
+my $rq :shared;
+my $ofq :shared;
+my $orq :shared;
+my $trimq :shared;
+my $failedfile :shared;
+my $done :shared;
+   $done = 0;
+my $nrin :shared;
+   $nrin = 0;
+
+my $rqz :shared;
+my $fqz :shared;
+my $rand :shared;
+my $discarded :shared;
+   $discarded = 0;
+my $verbose :shared;
+   $verbose = 1;
+my $trim3 :shared;
+   $trim3 = 1;
+my $trim5 :shared;
+   $trim5 = 1;
+# create file lock in the tmp dir
+$rand = int(rand(10000));
+while (-e "/tmp/$rand.lock") {
+	$rand = int(rand(10000));
+}	
+system("touch '/tmp/$rand.lock'");
+if (!-e "/tmp/sg.write.lock") {
+	system("touch '/tmp/sg.write.lock'");
+	system("chmod 777 '/tmp/sg.write.lock'");
+}
+
+# opts
+# i : path to forward read FASTQ file
+# I : path to reverse read FASTQ file
+# q : quality threshold in phred.
+# n : string representing common read name parts
+# s : trimming style : simple/bwa
+# o : output file for forward reads
+# O : output file for reverse reads
+# F : output file with fully failed reads pairs.
+# v : verbose: defaults to on (0/1)
+# S : Trim on side: 5/3/b(oth)
+# m : minimal length. 
+getopts('i:I:q:n:s:o:O:F:v:S:m:', \%opts) ;
+
+
+if (defined($opts{'v'})) {
+	$verbose = $opts{'v'};
+}
+
+if (defined($opts{'s'})) {
+	if ($opts{'s'} eq 'bwa') {
+		if ($verbose == 1) {
+			print "Using BWA style trimming\n";
+		}
+		$style = 'bwa';
+	}
+	else {
+		if ($verbose == 1) {
+			print "Using simple 1bp Quality trimming\n";
+		}
+		$style = 'simple';
+	}
+}
+else {
+	if ($verbose == 1){
+		print "Using simple 1bp Quality trimming\n";
+	}
+	$style = 'simple';
+}
+
+if (!defined($opts{'i'})) { die('Forward reads fastq is mandatory');}
+$fq = $opts{'i'};
+$ofq = $opts{'o'};
+
+## gzipped infile?
+my $out = `file $fq`;
+if ($out =~  m/gzip compressed/) {
+	if ($verbose == 1) {
+		print "Forward reads in gzipped format.\n";
+	}
+	$fqz = 1;
+}
+else {
+	if ($verbose == 1) {	
+		print "Forward reads in plain fastq format.\n";
+	}
+	$fqz = 0;
+}
+
+
+if (defined($opts{'I'})) {
+	if ($verbose == 1) {
+		print "Reverse reads provided. Performing paired-end trimming\n";
+	}
+	$paired = 1;
+	$rq = $opts{'I'};
+	$orq = $opts{'O'};
+	# gzipped ?
+	my $out = `file $rq`;
+	if ($out =~ m/gzip compressed/) {
+		if ($verbose == 1) {
+			print "Reverse reads in gzipped format.\n";
+		}	
+		$rqz = 1;
+	}
+	else {
+		if ($verbose == 1) {
+			print "Reverse reads in plain fastq format.\n";
+		}
+		$rqz = 0;
+	}
+}
+## FAILED FILENAME
+$failedfile = $opts{'F'};
+
+## set the sides to trim
+my $trimside = '';
+if (defined($opts{'S'})) {
+	if ($opts{'S'} eq '3') {
+		$trimside = "Trimming reads from the 3' end only\n";
+		$trim3 = 1;
+		$trim5 = 0;
+	}			
+	elsif ($opts{'S'} eq '5') {
+		$trimside = "Trimming reads from the 5' end only\n";
+		$trim3 = 0;
+		$trim5 = 1;
+	}
+	elsif ($opts{'S'} eq 'b') {
+		$trimside = "Trimming reads both from 5' and 3'\n";
+		$trim3 = 1;
+		$trim5 = 1;
+	}
+	else {
+		$trimside = "Invalid trimming side specification. Setting to both sides trimming\n";
+		$trim3 = 1;
+		$trim5 = 1;
+	}
+}
+else {
+	$trimside = "Invalid trimming side specification. Setting to both sides trimming\n";
+	$trim3 = 1;
+	$trim5 = 1;
+}
+if ($verbose == 1) {
+	print $trimside;
+}
+		
+# SET MINIMAL LENGTH
+my $minlength :shared;
+if (exists($opts{'m'})) {
+	$minlength = $opts{'m'};
+}
+else {
+	$minlength = 0;
+}
+
+## CHECK IF OUTPUT FILESNAMES ARE PRESENT
+if ($ofq eq '') {
+	$ofq = "$fq.$style.trimmed";
+	if ($verbose == 1) {
+		print "No Forward output name specified. Forward reads will be printed to:\n\t$ofq\n";
+	}
+}
+if ($paired == 1 && $orq eq '') {
+	$orq = "$rq.$style.trimmed";
+	if ($verbose == 1) {
+		print "No Reverse output name specified. Reverse reads will be printed to:\n\t$orq\n";
+	}
+}
+if ($failedfile eq '') {
+	$failedfile = "$fq.Failed_Reads";
+	if ($verbose == 1) {
+		print "No failed ouput name specified. Failed reads will be printed to:\n\t$failedfile\n";
+	}
+}
+
+## set trimming threshold
+if (!defined($opts{'q'}) or !($opts{'q'} =~ m/^[0-9]+$/)) {
+	$trimq = 20;
+}
+else {
+	$trimq = $opts{'q'};
+}
+
+## set read delimiter 
+if (defined($opts{'n'})) {
+	$indelim = $opts{'n'};
+	# galaxy replaced @ by __at__
+	if (substr($indelim,0,6) eq '__at__') {
+		$indelim = '@'.substr($indelim,6);
+	}
+}
+else {
+	print "No indelim defined\n";
+	$indelim = "@"; # has risk, fails if @ is first qual item
+}	
+
+## NEW : REPLACE if indelim = @ by the first 5 chars of first line of forward-fastq.
+if ($indelim eq '@') {
+	if ($fqz == 0) {
+		open IN, $fq;
+	}
+	else {
+		open(IN, "gunzip -c $fq | ");
+	}
+	$head = <IN>;
+	$indelim = substr($head,0,5);
+	print "\n";
+	print "WARNING:\n";
+	print "########\n";
+	print " Setting Delimiter as '\@' is prone to errors\n";
+	print " as it will incorrectly split if '\@' is the firt value in the quality string !!\n";
+	print ' It is recommended to use eg -n \'\@SRR301\''."\n";
+	print " Therefore, the delimiter was changed to the first 5 characters of the first forward fastq file.\n";
+	print "    => Delimiter : $indelim\n"; 
+	print "\n\n";
+	sleep 3;
+}
+## create queues
+my $totrim = Thread::Queue->new(); 
+my $toprint = Thread::Queue->new();
+my $dummy = Thread::Queue->new();
+my $failed = Thread::Queue->new();
+
+# monitor thread
+my $finish :shared;
+$finish = 0;
+$mon = threads->create('monitor');
+
+# start FASTQ reading thread
+if ($verbose == 1) {
+	print "start reading file\n";
+}
+$reading = threads->create('readfile');
+
+## give time to build reading buffer
+sleep 10;
+
+# start trimming threads
+if ($verbose == 1) {
+	print "start trimming\n";
+}
+if ($paired == 1) {
+	if ($style eq 'bwa') {
+		$trimming1 = threads->create('bwapaired');
+		$trimming2 = threads->create('bwapaired');
+		
+	}
+	else {
+		$trimming1 = threads->create('simplepaired');
+		$trimming2 = threads->create('simplepaired');
+	}
+	
+}
+else {
+	if ($style eq 'bwa') {
+		#print "not available yet, switching to simple trimming\n";
+		#$style = 'simple';
+		$trimming1 = threads->create('bwasingle');
+		$trimming2 = threads->create('bwasingle');
+
+	}
+	if ($style eq 'simple') {
+		$trimming1 = threads->create('simplesingle');
+		$trimming2 = threads->create('simplesingle');
+	}
+}
+# start printing threads
+$printout = threads->create('printout');
+$printfail = threads->create('printfailed');
+# end threads
+$reading->join();
+## all reads are queued for trimming, add undef to trimming queues to end threads.
+$totrim->enqueue(undef);
+$totrim->enqueue(undef);
+$trimming1->join();
+$trimming2->join();
+## all reads are trimmed and queued from printing, send undef to end printing threads.
+$toprint->enqueue(undef);
+$failed->enqueue(undef);
+$printout->join();
+$printfail->join();
+if ($verbose == 1) {
+	print "Waiting for status monitor to shut down\n";
+}
+$mon->join();
+## copy files from /tmp to destination folder
+if ($verbose == 1) {
+	print "Copying files to destination: Forward..";
+}
+if ($fqz == 0) {
+	system("cp -f /tmp/$rand.fq $ofq");
+	unlink("/tmp/$rand.fq");
+}
+else {
+	system("gzip -c /tmp/$rand.fq > $ofq");
+	unlink("/tmp/$rand.fq");
+}
+if ($paired == 1) {
+	if ($verbose == 1) {
+		print " Reverse..";	
+	}
+	if ($rqz == 0) {
+		system("cp -f /tmp/$rand.rq $orq");
+		unlink("/tmp/$rand.rq");
+	}
+	else {
+		system("gzip -c /tmp/$rand.rq > $orq");
+		unlink("/tmp/$rand.rq");
+	}
+}
+
+if ($verbose == 1) {
+	print " Failed items..";
+}
+system("cp -f /tmp/$rand.failed $failedfile");
+unlink("/tmp/$rand.failed");
+if ($verbose == 1) {
+	print " Done\n";
+}
+
+# remove lock file
+unlink("/tmp/$rand.lock");
+
+########################
+## print run details: ##
+########################
+my $string =  sprintf("%.2f",$done/1000000)."M Reads in (each) FASTQ file processed.\n";
+if ($paired == 1) {
+	$string .= sprintf ("%.2f",($done-$discarded)/1000000)."M read pairs in output FASTQ's.\n";
+	$string .= sprintf("%.2f",$discarded/1000000)."M reads pairs discarded\n";
+}
+else {
+	$string .= sprintf ("%.2f",($done-$discarded)/1000000)."M reads in output FASTQ.\n";
+	$string .= sprintf("%.2f",$discarded/1000000)."M reads discarded\n";
+}
+print $string;
+##################
+# PRINT RUN-TIME #
+##################
+$now = time - $now;
+printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60));
+
+
+exit();
+
+
+
+## SUBROUTINE : READ FASTQ FILES
+################################
+sub readfile {
+	if ($paired == 1) {
+		if ($fqz == 0) {
+			open IN1, $fq;
+		}
+		else {
+			open(IN1, "gunzip -c $fq | ");
+		}
+		if ($rqz == 0) {
+			open IN2, $rq;
+		}
+		else {
+			open(IN2, "gunzip -c $rq | ");
+		}
+		$/ = "\n$indelim";
+		$deliml = length($indelim);
+		# compose pack ignore string
+		my $dx = '';
+		for (my $idx = 0; $idx<length($indelim); $idx++) {
+			$dx .=  'X';
+		}
+		my $f = <IN1>;	
+		my $r = <IN2>;
+		# the reads contain the delimiter on the end (cf \n on regular file read line endings.)
+		# so we strip teh delimiter using pack, leaving just the \n.
+		# this means that in subsequent reads, we need to add the delimiter to the front of the read. 
+		my $read = substr(pack("A*$dx",$f),$deliml) . $r; #pack("A*$dx",$r) ;#. '|';	
+		my $counter = 1;
+		my $fr = '';
+		my $rr = '';
+		my $reads = '';
+		my $lastf = '';
+		while ($fr = <IN1>) {
+			$rr = <IN2>;
+			$lastf = $fr;
+			## add delim to front, trim from back. add our delim.
+			## done in next iteration, because last is different.
+			# the delimiter from the forward read is used to complete the reverse read.
+			$reads .= $indelim.pack("A*$dx",$read) .'|';
+			# concat reads
+			$read = $fr.$rr;
+			## add string to queue if 20,000 reads in array
+			if ($counter == 20000) {
+				$nrin = $nrin + 20000;
+				$totrim->enqueue(pack("A*X",$reads));  # pack("A*X") strips last |
+				$reads = '';
+				$counter = 0;
+			}
+			$counter++;
+			
+			## dummy (see monitor). 
+			my $d = $dummy->dequeue_nb();
+		}
+		# last reads in files need special care.
+		$reads .= $indelim.$lastf.$indelim.$rr;
+		close IN1;
+		close IN2;
+		# submit last reads to queue
+		if ($reads ne '') {
+			#$totrim->enqueue(pack("A*X",$reads));
+			$totrim->enqueue($reads);
+		}
+		if ($verbose == 1) {
+			print "All reads queued for trimming\n";
+		}
+	}
+	else {
+		if ($fqz == 0) {
+			open IN1, $fq;
+		}
+		else {
+			open(IN1, "gunzip -c $fq | ");
+		}
+		$/ = "\n$indelim";
+		$deliml = length($indelim);
+		# compose pack ignore string
+		my $dx = '';
+		for (my $idx = 0; $idx<length($indelim); $idx++) {
+			$dx .=  'X';
+		}
+		my $f = <IN1>;	
+		# the reads contain the delimiter on the end (cf \n on regular file read line endings.)
+		# so we strip teh delimiter using pack, leaving just the \n.
+		# this means that in subsequent reads, we need to add the delimiter to the front of the read. 
+		my $read = substr($f,$deliml); 
+		my $counter = 1;
+		my $fr = '';
+		my $reads = '';
+		while ($fr = <IN1>) {
+			## add delim to front, trim from back. add our delim.
+			## done in next iteration, because last is different.
+			$reads .= $indelim.pack("A*$dx",$read) .'|';
+			# concat reads
+			$read = $fr;
+			## add string to queue if 10,000 reads in array
+			if ($counter == 40000) {
+				$nrin = $nrin + 40000;
+				$totrim->enqueue(pack("A*X",$reads));
+				$reads = '';
+				$counter = 0;
+			}
+			$counter++;
+			
+			## dummy (see monitor). 
+			my $d = $dummy->dequeue_nb();
+		}
+		$reads .= $indelim.$read;	
+		close IN1;
+		# submit last reads to queue
+		if ($reads ne '') {
+			#$totrim->enqueue(pack("A*X",$reads));
+			$totrim->enqueue($reads);
+		}
+		if ($verbose == 1) {
+			print "All reads queued for trimming\n";
+		}
+		
+	}
+}
+
+sub bwapaired {
+	my $tq = $trimq;
+	while ( defined(my $rstring = $totrim->dequeue())) {
+	  my @reads = split(/\|/,$rstring); # each item has 20.000 reads
+          my $printstring = '';
+	  my $failedstring = '';
+          foreach (@reads) {
+		my $read = $_;
+		my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read);
+		## skip too short reads
+		if (length($q1) <  $minlength || length($q2) <  $minlength) {
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+		}
+		# initial check for maximal quality value. 
+		## quality arrays 
+		my @q1array = unpack("C*",$q1);
+		if (max(@q1array) < $tq + 33) {
+			#print "Discarding on failed forward qualities: ".max(@q1array). " vs $tq + 33)\n$read\n";
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+		}
+		my @q2array = unpack("C*",$q2);
+		if (max(@q2array) < $trimq + 33) {
+			#print "Discarding on failed reverse qualities\n$read\n";
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+		}
+
+		#############
+		## Trim 3' ##
+		#############
+		if ($trim3 == 1) {
+			#trim first
+			my $pos = length($q1) - 1;
+			my $maxPos = $pos;
+			## skip empty read.
+			#if ($pos <= 0) {   
+			#	$discarded++;
+			#	$failedstring .= $read . '|';
+			#	#print "skip empty : \n$read\n";	
+			#	next;
+			#} 
+			my $area = 0;
+			my $maxArea = 0;
+			## unpack qual-string to array of phred scores
+			#my @qarray = unpack("C*",$q1);	# use from initial check.
+			while ($pos>0 && $area>=0) {
+				$area += $tq - ($q1array[($pos)] - 33);
+				## not <=  for more aggressive trimming : if multiple equal maxima => use shortest read.
+				if ($area < $maxArea) {
+					$pos--;
+					next;
+				}
+				else {
+					$maxArea = $area;
+					$maxPos = $pos;
+					$pos--;
+					next;
+				}
+			}
+			if ($maxPos==0) { 		# scanned whole read and didn't integrate to maximum? skip (no output for this read)
+				$discarded++;
+				$failedstring .= $read ;#. '|';
+				next;
+			}  
+			# otherwise trim before position where area reached a maximum 
+			# $maxPos as length of substr gives positions zero to just before the maxpos.
+			#$s1 = substr($s1,0,$maxPos);
+			#$q1 = substr($q1,0,$maxPos);
+			$s1 = unpack("A$maxPos",$s1);
+			$q1 = unpack("A$maxPos",$q1);
+	
+			#trim second
+			$pos = length($q2);
+			$maxPos = $pos;
+			## skip empty read.
+			#if ($pos <= 0) {   
+			#	$discarded++;
+			#	$failedstring .= $read . '|';	
+			#	next;
+			#} 
+			$area = 0;
+			$maxArea = 0;
+			#@qarray = unpack("C*",$q2);
+			while ($pos>0 && $area>=0) {
+				$area += $tq - ($q2array[($pos)] - 33);
+				if ($area < $maxArea) {
+					$pos--;
+					next;
+				}
+				else {
+					$maxArea = $area;
+					$maxPos = $pos;
+					$pos--;
+					next;
+				}
+			}
+			if ($maxPos==0) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
+				$discarded++;
+				$failedstring .= $read ;#. '|';
+				next;
+			}  
+			# Otherwise trim before position where area reached a maximum 
+			$s2 = unpack("A$maxPos",$s2);
+			$q2 = unpack("A$maxPos",$q2);
+			#$s2 = substr($s2,0,$maxPos);
+			#$q2 = substr($q2,0,$maxPos);
+
+		}
+		#############
+		## TRIM 5' ##
+		#############
+		if ($trim5 == 1) {
+			#trim first
+			my $skip = 0; #length($q1);
+			my $maxPos = $skip;
+			my $area = 0;
+			my $maxArea = 0;
+			my @qarray = unpack("C*",$q1);
+			while ($skip<length($q1) && $area>=0) {
+				$area += $tq - ($qarray[($skip)] - 33);
+				if ($area < $maxArea) {
+					$skip++;
+					next;
+				}
+				else {
+					$maxArea = $area;
+					$maxPos = $skip;
+					$skip++;
+					next;
+				}
+			}
+			if ($maxPos==length($q1)- 1) { 	# => maximal value at end of string => no point of no return found.
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			}  
+			#  trim after position where area reached a maximum 
+			#$s1 = substr($s1,($maxPos+1));
+			#$q1 = substr($q1,($maxPos+1));
+			$s1 = unpack("x$maxPos A*",$s1);
+			$q1 = unpack("x$maxPos A*",$q1); 
+
+
+			#trim second
+			$skip = 0 ;#length($q2);
+			$maxPos = $skip;
+			$area = 0;
+			$maxArea = 0;
+			@qarray = unpack("C*",$q2);
+			
+			while ($skip < length($q2) && $area>=0) {
+				$area += $tq - ($qarray[($skip)] - 33);
+
+				if ($area < $maxArea) {
+					$skip++;
+					next;
+				}
+				else {
+					$maxArea = $area;
+					$maxPos = $skip;
+					$skip++;
+					next;
+				}
+			}
+			if ($maxPos==(length($q2)-1)) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
+				$discarded++;
+				$failedstring .= $read ;#. '|';
+				next;
+			}  
+			#  trim after position where area reached a maximum 
+			#$s2 = substr($s2,($maxPos+1));
+			#$q2 = substr($q2,($maxPos+1));
+			$s2 = unpack("x$maxPos A*",$s2);
+			$q2 = unpack("x$maxPos A*",$q2);
+		}
+
+		## DISCARD IF TOO SHORT ##	
+		if (length($s1) < $minlength || length($s2) < $minlength) {
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+
+		}
+		# queue for printing
+		my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2);
+		$printstring .= join('~',@arr) . '|';  # these two are not in quality string, and can thus be used as seperator
+							# | seperates pairs of reads; ~ seperates lines of single read entry.
+		#print "$printstring\n";
+	   }
+
+	   if ($printstring ne '') {
+		   $toprint->enqueue(pack("A*X",$printstring));  # removes trailing |
+		   #$toprint->enqueue($printstring);
+	   }
+	   if ($failedstring ne '') {
+		   #$failed->enqueue(pack("A*X",$failedstring));
+		   $failed->enqueue($failedstring);	
+	   }
+ 
+
+	}
+}
+sub simplepaired {
+	# local variables speed up threading
+	my $tq = $trimq;
+	while ( defined(my $rstring = $totrim->dequeue())){
+	  my @reads = split(/\|/,$rstring);
+          my $printstring = '';
+	  my $failedstring = '';
+          foreach (@reads) {
+		my $read = $_;
+		my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read);
+		## skip too short reads
+		if (length($q1) <  $minlength || length($q2) <  $minlength) {
+			$discarded++;
+			$failedstring .= $read; # . '|';
+			next;
+		}
+
+		# initial check for maximal quality value. 
+		## quality arrays 
+		my @q1array = unpack("C*",$q1);
+		if (max(@q1array) < $trimq + 33) {
+			#print "Discarding on failed forward qualities\n$read\n";
+			$discarded++;
+			$failedstring .= $read;# . '|';
+			next;
+		}
+		my @q2array = unpack("C*",$q2);
+		if (max(@q2array) < $trimq + 33) {
+			#print "Discarding on failed reverse qualities\n$read\n";
+			$discarded++;
+			$failedstring .= $read;# . '|';
+			next;
+		}
+
+		#############
+		## trim 3' ##
+		#############
+		if ($trim3 == 1) {
+			#trim first
+			my $pos = length($q1) - 1;
+			my $maxPos = $pos;
+			#if ($pos <= 0) {   
+			#	$discarded++;
+			#	$failedstring .= $read . '|';	
+			#	next;
+			#} 
+			#my @qarray = unpack("C*",$q1);
+			while ( $pos>0 ) {
+				if ($tq > $q1array[$pos] - 33) {
+					# score at position is below threshold
+					$pos--;
+					next;
+				}
+				else {
+					# score is at or above threshold
+					$maxPos = $pos;
+					last;
+				}
+			}
+			if ($pos==0) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			}  
+			# integrated to zero?  trim before position where quality was below threshold 
+	 		# unpack is one based
+			#$maxPos++;
+			#$s1 = substr($s1,0,$maxPos);
+			$s1 = unpack("A$maxPos",$s1);
+			#$q1 = substr($q1,0,$maxPos);
+			$q1 = unpack("A$maxPos",$q1);
+			
+			#trim second
+			$pos = length($q2) -1;
+			$maxPos = $pos;
+			if ($pos <= 0) {   
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			} 
+			#my @qarray = unpack("C*",$q2);
+
+			while ($pos>0) {
+				if ($tq > $q2array[$pos] - 33) {
+					# score at position is below threshold
+					$pos--;
+					next;
+				}
+				else {
+					# score is at or above threshold
+					$maxPos = $pos;
+					last;
+				}
+			}
+			if ($pos==0) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
+				$discarded++;
+				$failedstring .= $read ;#. '|';
+				next;
+			}  
+			# integrated to zero?  trim before position where score below threshold  
+			#$maxPos++;
+			$s2 = unpack("A$maxPos",$s2);
+			$q2 = unpack("A$maxPos",$q2);
+			#$s2 = substr($s2,0,$maxPos);
+			#$q2 = substr($q2,0,$maxPos);
+
+		}
+		#############
+		## trim 5' ##
+		#############
+		if ($trim5 == 1) {
+			# first
+			my $skip = 0; #length($q1);
+			#my $maxPos = $skip;
+			my @qarray = unpack("C*",$q1);
+			while ( $skip < length($q1) ) {
+				if ($tq > $qarray[$skip] - 33) {
+					# score at position is below threshold
+					$skip++;
+					next;
+				}
+				else {
+					# score is at or above threshold
+					#$maxPos = $skip;
+					last;
+				}
+			}
+			if ($skip == length($q1)) { 		# should not fail, as that would have happened on 3' trim
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			}  
+			# integrated to zero?  trim before position where quality was below threshold 
+	 		# unpack is one based
+			$s1 = unpack("x$skip A*",$s1);
+			$q1 = unpack("x$skip A*",$q1);
+			#$s1 = substr($s1,$skip);
+			#$q1 = substr($q1,$skip);
+
+			# second 
+			$skip = 0; #length($q1);
+			my @qarray = unpack("C*",$q2);
+			#$maxPos = $skip;
+			while ( $skip < length($q2) ) {
+				if ($tq > $qarray[$skip] - 33) {
+					# score at position is below threshold
+					$skip++;
+					next;
+				}
+				else {
+					# score is at or above threshold
+					#$maxPos = $skip;
+					last;
+				}
+			}
+			if ($skip == length($q2)) { 		# should not fail, as that would have happened on 3' trim
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			}  
+			# integrated to zero?  trim before position where quality was below threshold 
+ 			# unpack is one based
+			$s2 = unpack("x$skip A*",$s2);
+			$q2 = unpack("x$skip A*",$q2);
+			#$s2 = substr($s2,$skip);
+			#$q2 = substr($q2,$skip);
+
+		}
+		## DISCARD IF TOO SHORT ##	
+		if (length($s1) < $minlength || length($s2) < $minlength) {
+			$discarded++;
+			$failedstring .= $read .#. '|';
+			next;
+
+		}
+
+		# queue for printing
+		my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2);
+		$printstring .= join('~',@arr) . '|';  # these two are not in quality string, and can thus be used as seperator
+							# | seperates pairs of reads; ~ seperates lines of single read entry.
+	   }
+	   if ($printstring ne '') {
+		   $toprint->enqueue(pack("A*X",$printstring));  # removes trailing |
+		   #$toprint->enqueue($printstring);
+	   }
+	   if ($failedstring ne '') {
+		   #$failed->enqueue(pack("A*X",$failedstring));
+		   $failed->enqueue($failedstring);	
+	   }
+	}
+}
+
+sub bwasingle {
+	# local variables speed up threading
+	my $tq = $trimq;
+	while ( defined(my $rstring = $totrim->dequeue())){
+	  my @reads = split(/\|/,$rstring);
+          my $printstring = '';
+	  my $failedstring = '';
+          foreach (@reads) {
+		my $read = $_;
+		my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read);
+		## skip too short reads
+		if (length($q1) <  $minlength ) {
+			
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+		}
+		# initial check for maximal quality value. 
+		## quality arrays 
+		my @q1array = unpack("C*",$q1);
+		if (max(@q1array) < $tq + 33) {
+			print "Discarding on failed qualities\n$read\n";
+			$discarded++;
+			$failedstring .= $read;# . '|';
+			next;
+		}
+
+		#############
+		## trim 3' ##
+		#############
+		if ($trim3 == 1) {
+			my $pos = length($q1) - 1;
+			my $maxPos = $pos;
+			## skip empty read.
+			#if ($pos <= 0) {   
+			#	$discarded++;
+			#	$failedstring .= $read . '|';	
+			#	next;
+			#}  
+			my $area = 0;
+			my $maxArea = 0;
+			#my @qarray = unpack("C*",$q1);
+			while ($pos>0 && $area>=0) {
+				$area += $tq - ($q1array[($pos)] - 33);
+				## not <=  for more aggressive trimming : if multiple equal maxima => use shortest read.
+				if ($area < $maxArea) {
+					$pos--;
+					next;
+				}
+				else {
+					$maxArea = $area;
+					$maxPos = $pos;
+					$pos--;
+					next;
+				}
+			}
+			## The online perl approach is wrong if the high-quality part is not long enough to reach 0.
+			if ($maxPos == 0) {  # maximal badness on position 0 => no point of no return reached. 
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			}  
+			# otherwise trim before position where area reached a maximum 
+			# $maxPos as length of substr gives positions zero to just before the maxpos.
+			$s1 = unpack("A$maxPos",$s1);
+			$q1 = unpack("A$maxPos",$q1);
+
+		}
+		#############
+		## trim 5' ##
+		#############
+		if ($trim5 == 1) {
+			my $skip = 0; #length($q1);
+			my $maxPos = $skip;
+			my $area = 0;
+			my $maxArea = 0;
+			my @qarray = unpack("C*",$q1);
+			while ($skip<length($q1) && $area>=0) {
+				#$area += $tq - (ord(unpack("x$skip A1",$q1)) - 33);
+				$area += $tq - ($qarray[($skip)] - 33);
+				if ($area < $maxArea) {
+					$skip++;
+					next;
+				}
+				else {
+					$maxArea = $area;
+					$maxPos = $skip;
+					$skip++;
+					next;
+				}
+			}
+			#if ($skip==length($q1)) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read
+			if ($maxPos == (length($q1)-1)) { # => maximal value at end of string => no point of no return found.
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			}  
+			# trim after position where area reached a maximum
+			$s1 = unpack("x$maxPos A*",$s1);
+			$q1 = unpack("x$maxPos A*",$q1); 
+			#$s1 = substr($s1,($maxPos+1));
+			#$q1 = substr($q1,($maxPos+1));
+		}
+		## DISCARD IF TOO SHORT ##	
+		if (length($s1) < $minlength) {
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+
+		}
+
+		# queue for printing
+		my @arr = ($h1a,$s1,$h1b,$q1);
+		$printstring .= join("\n",@arr) . "\n";  # join components by newline character for printing 
+	   }
+	   if ($printstring ne '') {
+		   #$toprint->enqueue(pack("A*X",$printstring));  # removes trailing |
+		   $toprint->enqueue($printstring);
+	   }
+	   if ($failedstring ne '') {
+		   #$failed->enqueue(pack("A*X",$failedstring));
+		   $failed->enqueue($failedstring);	
+	   }
+
+	}
+}
+
+
+sub simplesingle {
+	# local variables speed up threading
+	my $tq = $trimq;
+	while ( defined(my $rstring = $totrim->dequeue())){
+	  my @reads = split(/\|/,$rstring);
+          my $printstring = '';
+	  my $failedstring = '';
+          foreach (@reads) {
+		my $read = $_;
+		my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read);
+		## skip too short reads
+		if (length($q1) <  $minlength ) {
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+		}
+		# initial check for maximal quality value. 
+		## quality arrays 
+		my @q1array = unpack("C*",$q1);
+		if (max(@q1array) < $trimq + 33) {
+			#print "Discarding on failed qualities\n$read\n";
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+		}
+
+		#############
+		## trim 3' ##
+		#############
+		if ($trim3 == 1) {
+			my $pos = length($q1);
+			my $maxPos = $pos;
+			#if ($pos <= 0) {   
+			#	$discarded++;
+			#	$failedstring .= $read . '|';	
+			#	next;
+			#} 
+			#my @qarray = unpack("C*",$q1);
+
+			while ( $pos>0 ) {
+				if ($tq >  $q1array[$pos] - 33) {
+					# score at position is below threshold
+					$pos--;
+					next;
+				}
+				else {
+					# score is at or above threshold
+					$maxPos = $pos;
+					last;
+				}
+			}
+			if ($pos==0) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			}  
+			# integrated to zero?  trim before position where quality was below threshold 
+	 		# unpack is one based
+			$s1 = unpack("A$maxPos",$s1);
+			$q1 = unpack("A$maxPos",$q1);
+			#$s1 = substr($s1,0,$maxPos);
+			#$q1 = substr($q1,0,$maxPos);
+
+		}
+		#############
+		## trim 5' ##
+		#############
+		if ($trim5 == 1) {
+			my $skip = 0; #length($q1);
+			#my $maxPos = $skip;
+			my @qarray = unpack("C*",$q1);
+
+			while ( $skip < length($q1) ) {
+				if ($tq >  $qarray[$skip] - 33) {
+					# score at position is below threshold
+					$skip++;
+					next;
+				}
+				else {
+					# score is at or above threshold
+					#$maxPos = $skip;
+					last;
+				}
+			}
+			if ($skip == length($q1)) { 		
+				$discarded++;
+				$failedstring .= $read ;#. '|';	
+				next;
+			}  
+			# integrated to zero?  trim before position where quality was below threshold 
+	 		# unpack is one based
+			#$s1 = substr($s1,$skip);
+			#$q1 = substr($q1,$skip);
+			$s1 = unpack("x$skip A*",$s1);
+			$q1 = unpack("x$skip A*",$q1);
+
+
+		}
+		## DISCARD IF TOO SHORT ##	
+		if (length($s1) < $minlength) {
+			$discarded++;
+			$failedstring .= $read ;#. '|';
+			next;
+
+		}
+
+
+		# queue for printing
+		my @arr = ($h1a,$s1,$h1b,$q1);
+		$printstring .= join("\n",@arr) . "\n";  # join components by newline character for printing 
+	   }
+	   if ($printstring ne '') {
+		   $toprint->enqueue($printstring);  # single reads => no | present, just \n
+	   }
+	   if ($failedstring ne '') {
+		   #$failed->enqueue(pack("A*X",$failedstring));
+		   $failed->enqueue($failedstring);	
+	   }
+
+	}
+}
+
+sub printout {
+	if ($rand eq '' || !defined($rand)) {
+		die('Random value not passed on!');
+	}
+	if ($paired == 0) {
+		my $counter = 0;
+		my $string1 = '';
+		if (-e "/tmp/$rand.fq") {
+			unlink("/tmp/$rand.fq");
+		}
+		while (defined(my $string = $toprint->dequeue())) {	# each item represents 10000 input reads (might be less by trimming)
+			#my @array = split(/\|/,$printstring);
+			#$done = $done + scalar(@array);
+			$done = $done + 10000;
+			$string1 .= $string;
+			$counter++;
+			#print batches of 500,000 reads
+			if ($counter == 50) {
+				open OUT1, ">>/tmp/$rand.fq";
+				print OUT1 $string1;
+				close OUT1;
+				$string1 = '';
+				#$string2 = '';
+				$counter = 0;
+			}
+		}
+		## print last
+		open OUT1, ">>/tmp/$rand.fq";
+		print OUT1 $string1;
+		close OUT1;
+		$finish = 1;
+
+  	}
+	else {
+		my $counter = 0;
+		my $string1 = '';
+		my $string2 = '';
+		if (-e "/tmp/$rand.fq") {
+			unlink("/tmp/$rand.fq");
+		}
+		if (-e "/tmp/$rand.fq") {
+			unlink("/tmp/$rand.rq");
+		}
+  		while (defined(my $printstring = $toprint->dequeue())) {	# each item represents 10000 input reads
+			my @array = split(/\|/,$printstring);
+			$done = $done + scalar(@array);
+			foreach (@array) {
+				my @subarr = split(/\~/,$_);
+				$string1 .= join("\n",@subarr[0..3])."\n";
+				$string2 .= join("\n",@subarr[4..7])."\n";
+			}
+			$counter++;
+			if ($counter == 50) {
+				#print "Printing!\n";
+				flock("/tmp/sg.write.lock",2);
+				open OUT1, ">>/tmp/$rand.fq";
+				print OUT1 $string1;
+				close OUT1;
+				open OUT2, ">>/tmp/$rand.rq";
+				print OUT2 $string2;
+				close OUT2;
+				flock("/tmp/sg.write.lock",8);
+				$string1 = '';
+				$string2 = '';
+				$counter = 0;
+			}
+		}
+		## print last
+		flock("/tmp/sg.write.lock",2);
+		open OUT1, ">>/tmp/$rand.fq";
+		print OUT1 $string1;
+		close OUT1;
+		open OUT2, ">>/tmp/$rand.rq";
+		print OUT2 $string2;
+		close OUT2;
+		flock("/tmp/sg.write.lock",8);
+		$finish = 1;
+	}
+
+}
+
+sub printfailed {
+	if (-e "/tmp/$rand.failed") {
+		unlink("/tmp/$rand.failed");
+	}
+	#open FAIL, ">/tmp/$rand.failed" or die("could not open file with failed reads, '/tmp/$rand.failed'");
+	#FAIL->autoflush(1);
+	my $nr = 0;
+	my $string = '';
+	while ( defined(my $rstring = $failed->dequeue())) {
+		#my @array = split(/\|/,$rstring);
+		#foreach (@array) {
+			$string .= $rstring;
+		#}
+		$nr++;
+		if ($nr == 50) {
+			open FAIL, ">>/tmp/$rand.failed";
+			print FAIL $string;
+			close FAIL;
+			$nr = 0;
+			$string = '';
+		}
+		#close FAIL;
+	}
+	open FAIL, ">>/tmp/$rand.failed";
+	print FAIL $string;
+	close FAIL;	
+}
+sub monitor {
+	# checks the file read monitor
+	my $counter = 0;
+	while ($finish == 0) {
+		$counter++;
+		if ($totrim->pending() > 200) {
+			lock($dummy);
+			#print "Putting the file reading to sleep for 10 seconds.\n";
+			sleep 5;
+		}
+		else {
+			sleep 5;
+		}
+		if ($counter == 6) {
+			if ($verbose == 1) {
+				my $string =  ($done/1000000)."M Reads Passed, ".($discarded/1000000)."M reads discarded (".$totrim->pending().".10^4 pending)\n";
+				print $string ;
+			}
+			$counter = 0;
+		}
+		
+	}
+}