view Paired_fastQ_trimmer.pl @ 3:cba6282b5dc8 draft default tip

Uploaded
author geert-vandeweyer
date Tue, 21 Oct 2014 05:07:06 -0400
parents fdbcc1aa4f01
children
line wrap: on
line source

#!/usr/bin/perl

# load modules
use Getopt::Std;
use threads;
use Thread::Queue;
use threads::shared;
use File::Basename;
use List::Util qw(max);

$now = time ;

$|++;
# variables 
my $paired :shared;
$paired = 0;
my $fq :shared;
my $rq :shared;
my $ofq :shared;
my $orq :shared;
my $trimq :shared;
my $failedfile :shared;
my $done :shared;
   $done = 0;
my $nrin :shared;
   $nrin = 0;

my $rqz :shared;
my $fqz :shared;
my $rand :shared;
my $discarded :shared;
   $discarded = 0;
my $verbose :shared;
   $verbose = 1;
my $trim3 :shared;
   $trim3 = 1;
my $trim5 :shared;
   $trim5 = 1;
# create file lock in the tmp dir
$rand = int(rand(10000));
while (-e "/tmp/$rand.lock") {
	$rand = int(rand(10000));
}	
system("touch '/tmp/$rand.lock'");
if (!-e "/tmp/sg.write.lock") {
	system("touch '/tmp/sg.write.lock'");
	system("chmod 777 '/tmp/sg.write.lock'");
}

# opts
# i : path to forward read FASTQ file
# I : path to reverse read FASTQ file
# q : quality threshold in phred.
# n : string representing common read name parts
# s : trimming style : simple/bwa
# o : output file for forward reads
# O : output file for reverse reads
# F : output file with fully failed reads pairs.
# v : verbose: defaults to on (0/1)
# S : Trim on side: 5/3/b(oth)
# m : minimal length. 
getopts('i:I:q:n:s:o:O:F:v:S:m:', \%opts) ;


if (defined($opts{'v'})) {
	$verbose = $opts{'v'};
}

if (defined($opts{'s'})) {
	if ($opts{'s'} eq 'bwa') {
		if ($verbose == 1) {
			print "Using BWA style trimming\n";
		}
		$style = 'bwa';
	}
	else {
		if ($verbose == 1) {
			print "Using simple 1bp Quality trimming\n";
		}
		$style = 'simple';
	}
}
else {
	if ($verbose == 1){
		print "Using simple 1bp Quality trimming\n";
	}
	$style = 'simple';
}

if (!defined($opts{'i'})) { die('Forward reads fastq is mandatory');}
$fq = $opts{'i'};
$ofq = $opts{'o'};

## gzipped infile?
my $out = `file $fq`;
if ($out =~  m/gzip compressed/) {
	if ($verbose == 1) {
		print "Forward reads in gzipped format.\n";
	}
	$fqz = 1;
}
else {
	if ($verbose == 1) {	
		print "Forward reads in plain fastq format.\n";
	}
	$fqz = 0;
}


if (defined($opts{'I'})) {
	if ($verbose == 1) {
		print "Reverse reads provided. Performing paired-end trimming\n";
	}
	$paired = 1;
	$rq = $opts{'I'};
	$orq = $opts{'O'};
	# gzipped ?
	my $out = `file $rq`;
	if ($out =~ m/gzip compressed/) {
		if ($verbose == 1) {
			print "Reverse reads in gzipped format.\n";
		}	
		$rqz = 1;
	}
	else {
		if ($verbose == 1) {
			print "Reverse reads in plain fastq format.\n";
		}
		$rqz = 0;
	}
}
## FAILED FILENAME
$failedfile = $opts{'F'};

## set the sides to trim
my $trimside = '';
if (defined($opts{'S'})) {
	if ($opts{'S'} eq '3') {
		$trimside = "Trimming reads from the 3' end only\n";
		$trim3 = 1;
		$trim5 = 0;
	}			
	elsif ($opts{'S'} eq '5') {
		$trimside = "Trimming reads from the 5' end only\n";
		$trim3 = 0;
		$trim5 = 1;
	}
	elsif ($opts{'S'} eq 'b') {
		$trimside = "Trimming reads both from 5' and 3'\n";
		$trim3 = 1;
		$trim5 = 1;
	}
	else {
		$trimside = "Invalid trimming side specification. Setting to both sides trimming\n";
		$trim3 = 1;
		$trim5 = 1;
	}
}
else {
	$trimside = "Invalid trimming side specification. Setting to both sides trimming\n";
	$trim3 = 1;
	$trim5 = 1;
}
if ($verbose == 1) {
	print $trimside;
}
		
# SET MINIMAL LENGTH
my $minlength :shared;
if (exists($opts{'m'})) {
	$minlength = $opts{'m'};
}
else {
	$minlength = 0;
}

## CHECK IF OUTPUT FILESNAMES ARE PRESENT
if ($ofq eq '') {
	$ofq = "$fq.$style.trimmed";
	if ($verbose == 1) {
		print "No Forward output name specified. Forward reads will be printed to:\n\t$ofq\n";
	}
}
if ($paired == 1 && $orq eq '') {
	$orq = "$rq.$style.trimmed";
	if ($verbose == 1) {
		print "No Reverse output name specified. Reverse reads will be printed to:\n\t$orq\n";
	}
}
if ($failedfile eq '') {
	$failedfile = "$fq.Failed_Reads";
	if ($verbose == 1) {
		print "No failed ouput name specified. Failed reads will be printed to:\n\t$failedfile\n";
	}
}

## set trimming threshold
if (!defined($opts{'q'}) or !($opts{'q'} =~ m/^[0-9]+$/)) {
	$trimq = 20;
}
else {
	$trimq = $opts{'q'};
}

## set read delimiter 
if (defined($opts{'n'})) {
	$indelim = $opts{'n'};
	# galaxy replaced @ by __at__
	if (substr($indelim,0,6) eq '__at__') {
		$indelim = '@'.substr($indelim,6);
	}
}
else {
	print "No indelim defined\n";
	$indelim = "@"; # has risk, fails if @ is first qual item
}	

## NEW : REPLACE if indelim = @ by the first 5 chars of first line of forward-fastq.
if ($indelim eq '@') {
	if ($fqz == 0) {
		open IN, $fq;
	}
	else {
		open(IN, "gunzip -c $fq | ");
	}
	$head = <IN>;
	$indelim = substr($head,0,5);
	print "\n";
	print "WARNING:\n";
	print "########\n";
	print " Setting Delimiter as '\@' is prone to errors\n";
	print " as it will incorrectly split if '\@' is the firt value in the quality string !!\n";
	print ' It is recommended to use eg -n \'\@SRR301\''."\n";
	print " Therefore, the delimiter was changed to the first 5 characters of the first forward fastq file.\n";
	print "    => Delimiter : $indelim\n"; 
	print "\n\n";
	sleep 3;
}
## create queues
my $totrim = Thread::Queue->new(); 
my $toprint = Thread::Queue->new();
my $dummy = Thread::Queue->new();
my $failed = Thread::Queue->new();

# monitor thread
my $finish :shared;
$finish = 0;
$mon = threads->create('monitor');

# start FASTQ reading thread
if ($verbose == 1) {
	print "start reading file\n";
}
$reading = threads->create('readfile');

## give time to build reading buffer
sleep 10;

# start trimming threads
if ($verbose == 1) {
	print "start trimming\n";
}
if ($paired == 1) {
	if ($style eq 'bwa') {
		$trimming1 = threads->create('bwapaired');
		$trimming2 = threads->create('bwapaired');
		
	}
	else {
		$trimming1 = threads->create('simplepaired');
		$trimming2 = threads->create('simplepaired');
	}
	
}
else {
	if ($style eq 'bwa') {
		#print "not available yet, switching to simple trimming\n";
		#$style = 'simple';
		$trimming1 = threads->create('bwasingle');
		$trimming2 = threads->create('bwasingle');

	}
	if ($style eq 'simple') {
		$trimming1 = threads->create('simplesingle');
		$trimming2 = threads->create('simplesingle');
	}
}
# start printing threads
$printout = threads->create('printout');
$printfail = threads->create('printfailed');
# end threads
$reading->join();
## all reads are queued for trimming, add undef to trimming queues to end threads.
$totrim->enqueue(undef);
$totrim->enqueue(undef);
$trimming1->join();
$trimming2->join();
## all reads are trimmed and queued from printing, send undef to end printing threads.
$toprint->enqueue(undef);
$failed->enqueue(undef);
$printout->join();
$printfail->join();
if ($verbose == 1) {
	print "Waiting for status monitor to shut down\n";
}
$mon->join();
## copy files from /tmp to destination folder
if ($verbose == 1) {
	print "Copying files to destination: Forward..";
}
if ($fqz == 0) {
	system("cp -f /tmp/$rand.fq $ofq");
	unlink("/tmp/$rand.fq");
}
else {
	system("gzip -c /tmp/$rand.fq > $ofq");
	unlink("/tmp/$rand.fq");
}
if ($paired == 1) {
	if ($verbose == 1) {
		print " Reverse..";	
	}
	if ($rqz == 0) {
		system("cp -f /tmp/$rand.rq $orq");
		unlink("/tmp/$rand.rq");
	}
	else {
		system("gzip -c /tmp/$rand.rq > $orq");
		unlink("/tmp/$rand.rq");
	}
}

if ($verbose == 1) {
	print " Failed items..";
}
system("cp -f /tmp/$rand.failed $failedfile");
unlink("/tmp/$rand.failed");
if ($verbose == 1) {
	print " Done\n";
}

# remove lock file
unlink("/tmp/$rand.lock");

########################
## print run details: ##
########################
my $string =  sprintf("%.2f",$done/1000000)."M Reads in (each) FASTQ file processed.\n";
if ($paired == 1) {
	$string .= sprintf ("%.2f",($done-$discarded)/1000000)."M read pairs in output FASTQ's.\n";
	$string .= sprintf("%.2f",$discarded/1000000)."M reads pairs discarded\n";
}
else {
	$string .= sprintf ("%.2f",($done-$discarded)/1000000)."M reads in output FASTQ.\n";
	$string .= sprintf("%.2f",$discarded/1000000)."M reads discarded\n";
}
print $string;
##################
# PRINT RUN-TIME #
##################
$now = time - $now;
printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60));


exit();



## SUBROUTINE : READ FASTQ FILES
################################
sub readfile {
	if ($paired == 1) {
		if ($fqz == 0) {
			open IN1, $fq;
		}
		else {
			open(IN1, "gunzip -c $fq | ");
		}
		if ($rqz == 0) {
			open IN2, $rq;
		}
		else {
			open(IN2, "gunzip -c $rq | ");
		}
		$/ = "\n$indelim";
		$deliml = length($indelim);
		# compose pack ignore string
		my $dx = '';
		for (my $idx = 0; $idx<length($indelim); $idx++) {
			$dx .=  'X';
		}
		my $f = <IN1>;	
		my $r = <IN2>;
		# the reads contain the delimiter on the end (cf \n on regular file read line endings.)
		# so we strip teh delimiter using pack, leaving just the \n.
		# this means that in subsequent reads, we need to add the delimiter to the front of the read. 
		my $read = substr(pack("A*$dx",$f),$deliml) . $r; #pack("A*$dx",$r) ;#. '|';	
		my $counter = 1;
		my $fr = '';
		my $rr = '';
		my $reads = '';
		my $lastf = '';
		while ($fr = <IN1>) {
			$rr = <IN2>;
			$lastf = $fr;
			## add delim to front, trim from back. add our delim.
			## done in next iteration, because last is different.
			# the delimiter from the forward read is used to complete the reverse read.
			$reads .= $indelim.pack("A*$dx",$read) .'|';
			# concat reads
			$read = $fr.$rr;
			## add string to queue if 20,000 reads in array
			if ($counter == 20000) {
				$nrin = $nrin + 20000;
				$totrim->enqueue(pack("A*X",$reads));  # pack("A*X") strips last |
				$reads = '';
				$counter = 0;
			}
			$counter++;
			
			## dummy (see monitor). 
			my $d = $dummy->dequeue_nb();
		}
		# last reads in files need special care.
		$reads .= $indelim.$lastf.$indelim.$rr;
		close IN1;
		close IN2;
		# submit last reads to queue
		if ($reads ne '') {
			#$totrim->enqueue(pack("A*X",$reads));
			$totrim->enqueue($reads);
		}
		if ($verbose == 1) {
			print "All reads queued for trimming\n";
		}
	}
	else {
		if ($fqz == 0) {
			open IN1, $fq;
		}
		else {
			open(IN1, "gunzip -c $fq | ");
		}
		$/ = "\n$indelim";
		$deliml = length($indelim);
		# compose pack ignore string
		my $dx = '';
		for (my $idx = 0; $idx<length($indelim); $idx++) {
			$dx .=  'X';
		}
		my $f = <IN1>;	
		# the reads contain the delimiter on the end (cf \n on regular file read line endings.)
		# so we strip teh delimiter using pack, leaving just the \n.
		# this means that in subsequent reads, we need to add the delimiter to the front of the read. 
		my $read = substr($f,$deliml); 
		my $counter = 1;
		my $fr = '';
		my $reads = '';
		while ($fr = <IN1>) {
			## add delim to front, trim from back. add our delim.
			## done in next iteration, because last is different.
			$reads .= $indelim.pack("A*$dx",$read) .'|';
			# concat reads
			$read = $fr;
			## add string to queue if 10,000 reads in array
			if ($counter == 40000) {
				$nrin = $nrin + 40000;
				$totrim->enqueue(pack("A*X",$reads));
				$reads = '';
				$counter = 0;
			}
			$counter++;
			
			## dummy (see monitor). 
			my $d = $dummy->dequeue_nb();
		}
		$reads .= $indelim.$read;	
		close IN1;
		# submit last reads to queue
		if ($reads ne '') {
			#$totrim->enqueue(pack("A*X",$reads));
			$totrim->enqueue($reads);
		}
		if ($verbose == 1) {
			print "All reads queued for trimming\n";
		}
		
	}
}

sub bwapaired {
	my $tq = $trimq;
	while ( defined(my $rstring = $totrim->dequeue())) {
	  my @reads = split(/\|/,$rstring); # each item has 20.000 reads
          my $printstring = '';
	  my $failedstring = '';
          foreach (@reads) {
		my $read = $_;
		my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read);
		## skip too short reads
		if (length($q1) <  $minlength || length($q2) <  $minlength) {
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;
		}
		# initial check for maximal quality value. 
		## quality arrays 
		my @q1array = unpack("C*",$q1);
		if (max(@q1array) < $tq + 33) {
			#print "Discarding on failed forward qualities: ".max(@q1array). " vs $tq + 33)\n$read\n";
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;
		}
		my @q2array = unpack("C*",$q2);
		if (max(@q2array) < $trimq + 33) {
			#print "Discarding on failed reverse qualities\n$read\n";
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;
		}

		#############
		## Trim 3' ##
		#############
		if ($trim3 == 1) {
			#trim first
			my $pos = length($q1) - 1;
			my $maxPos = $pos;
			## skip empty read.
			#if ($pos <= 0) {   
			#	$discarded++;
			#	$failedstring .= $read . '|';
			#	#print "skip empty : \n$read\n";	
			#	next;
			#} 
			my $area = 0;
			my $maxArea = 0;
			## unpack qual-string to array of phred scores
			#my @qarray = unpack("C*",$q1);	# use from initial check.
			while ($pos>0 && $area>=0) {
				$area += $tq - ($q1array[($pos)] - 33);
				## not <=  for more aggressive trimming : if multiple equal maxima => use shortest read.
				if ($area < $maxArea) {
					$pos--;
					next;
				}
				else {
					$maxArea = $area;
					$maxPos = $pos;
					$pos--;
					next;
				}
			}
			if ($maxPos==0) { 		# scanned whole read and didn't integrate to maximum? skip (no output for this read)
				$discarded++;
				$failedstring .= $read ;#. '|';
				next;
			}  
			# otherwise trim before position where area reached a maximum 
			# $maxPos as length of substr gives positions zero to just before the maxpos.
			#$s1 = substr($s1,0,$maxPos);
			#$q1 = substr($q1,0,$maxPos);
			$s1 = unpack("A$maxPos",$s1);
			$q1 = unpack("A$maxPos",$q1);
	
			#trim second
			$pos = length($q2);
			$maxPos = $pos;
			## skip empty read.
			#if ($pos <= 0) {   
			#	$discarded++;
			#	$failedstring .= $read . '|';	
			#	next;
			#} 
			$area = 0;
			$maxArea = 0;
			#@qarray = unpack("C*",$q2);
			while ($pos>0 && $area>=0) {
				$area += $tq - ($q2array[($pos)] - 33);
				if ($area < $maxArea) {
					$pos--;
					next;
				}
				else {
					$maxArea = $area;
					$maxPos = $pos;
					$pos--;
					next;
				}
			}
			if ($maxPos==0) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
				$discarded++;
				$failedstring .= $read ;#. '|';
				next;
			}  
			# Otherwise trim before position where area reached a maximum 
			$s2 = unpack("A$maxPos",$s2);
			$q2 = unpack("A$maxPos",$q2);
			#$s2 = substr($s2,0,$maxPos);
			#$q2 = substr($q2,0,$maxPos);

		}
		#############
		## TRIM 5' ##
		#############
		if ($trim5 == 1) {
			#trim first
			my $skip = 0; #length($q1);
			my $maxPos = $skip;
			my $area = 0;
			my $maxArea = 0;
			my @qarray = unpack("C*",$q1);
			while ($skip<length($q1) && $area>=0) {
				$area += $tq - ($qarray[($skip)] - 33);
				if ($area < $maxArea) {
					$skip++;
					next;
				}
				else {
					$maxArea = $area;
					$maxPos = $skip;
					$skip++;
					next;
				}
			}
			if ($maxPos==length($q1)- 1) { 	# => maximal value at end of string => no point of no return found.
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			}  
			#  trim after position where area reached a maximum 
			#$s1 = substr($s1,($maxPos+1));
			#$q1 = substr($q1,($maxPos+1));
			$s1 = unpack("x$maxPos A*",$s1);
			$q1 = unpack("x$maxPos A*",$q1); 


			#trim second
			$skip = 0 ;#length($q2);
			$maxPos = $skip;
			$area = 0;
			$maxArea = 0;
			@qarray = unpack("C*",$q2);
			
			while ($skip < length($q2) && $area>=0) {
				$area += $tq - ($qarray[($skip)] - 33);

				if ($area < $maxArea) {
					$skip++;
					next;
				}
				else {
					$maxArea = $area;
					$maxPos = $skip;
					$skip++;
					next;
				}
			}
			if ($maxPos==(length($q2)-1)) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
				$discarded++;
				$failedstring .= $read ;#. '|';
				next;
			}  
			#  trim after position where area reached a maximum 
			#$s2 = substr($s2,($maxPos+1));
			#$q2 = substr($q2,($maxPos+1));
			$s2 = unpack("x$maxPos A*",$s2);
			$q2 = unpack("x$maxPos A*",$q2);
		}

		## DISCARD IF TOO SHORT ##	
		if (length($s1) < $minlength || length($s2) < $minlength) {
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;

		}
		# queue for printing
		my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2);
		$printstring .= join('~',@arr) . '|';  # these two are not in quality string, and can thus be used as seperator
							# | seperates pairs of reads; ~ seperates lines of single read entry.
		#print "$printstring\n";
	   }

	   if ($printstring ne '') {
		   $toprint->enqueue(pack("A*X",$printstring));  # removes trailing |
		   #$toprint->enqueue($printstring);
	   }
	   if ($failedstring ne '') {
		   #$failed->enqueue(pack("A*X",$failedstring));
		   $failed->enqueue($failedstring);	
	   }
 

	}
}
sub simplepaired {
	# local variables speed up threading
	my $tq = $trimq;
	while ( defined(my $rstring = $totrim->dequeue())){
	  my @reads = split(/\|/,$rstring);
          my $printstring = '';
	  my $failedstring = '';
          foreach (@reads) {
		my $read = $_;
		my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read);
		## skip too short reads
		if (length($q1) <  $minlength || length($q2) <  $minlength) {
			$discarded++;
			$failedstring .= $read; # . '|';
			next;
		}

		# initial check for maximal quality value. 
		## quality arrays 
		my @q1array = unpack("C*",$q1);
		if (max(@q1array) < $trimq + 33) {
			#print "Discarding on failed forward qualities\n$read\n";
			$discarded++;
			$failedstring .= $read;# . '|';
			next;
		}
		my @q2array = unpack("C*",$q2);
		if (max(@q2array) < $trimq + 33) {
			#print "Discarding on failed reverse qualities\n$read\n";
			$discarded++;
			$failedstring .= $read;# . '|';
			next;
		}

		#############
		## trim 3' ##
		#############
		if ($trim3 == 1) {
			#trim first
			my $pos = length($q1) - 1;
			my $maxPos = $pos;
			#if ($pos <= 0) {   
			#	$discarded++;
			#	$failedstring .= $read . '|';	
			#	next;
			#} 
			#my @qarray = unpack("C*",$q1);
			while ( $pos>0 ) {
				if ($tq > $q1array[$pos] - 33) {
					# score at position is below threshold
					$pos--;
					next;
				}
				else {
					# score is at or above threshold
					$maxPos = $pos;
					last;
				}
			}
			if ($pos==0) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			}  
			# integrated to zero?  trim before position where quality was below threshold 
	 		# unpack is one based
			#$maxPos++;
			#$s1 = substr($s1,0,$maxPos);
			$s1 = unpack("A$maxPos",$s1);
			#$q1 = substr($q1,0,$maxPos);
			$q1 = unpack("A$maxPos",$q1);
			
			#trim second
			$pos = length($q2) -1;
			$maxPos = $pos;
			if ($pos <= 0) {   
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			} 
			#my @qarray = unpack("C*",$q2);

			while ($pos>0) {
				if ($tq > $q2array[$pos] - 33) {
					# score at position is below threshold
					$pos--;
					next;
				}
				else {
					# score is at or above threshold
					$maxPos = $pos;
					last;
				}
			}
			if ($pos==0) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
				$discarded++;
				$failedstring .= $read ;#. '|';
				next;
			}  
			# integrated to zero?  trim before position where score below threshold  
			#$maxPos++;
			$s2 = unpack("A$maxPos",$s2);
			$q2 = unpack("A$maxPos",$q2);
			#$s2 = substr($s2,0,$maxPos);
			#$q2 = substr($q2,0,$maxPos);

		}
		#############
		## trim 5' ##
		#############
		if ($trim5 == 1) {
			# first
			my $skip = 0; #length($q1);
			#my $maxPos = $skip;
			my @qarray = unpack("C*",$q1);
			while ( $skip < length($q1) ) {
				if ($tq > $qarray[$skip] - 33) {
					# score at position is below threshold
					$skip++;
					next;
				}
				else {
					# score is at or above threshold
					#$maxPos = $skip;
					last;
				}
			}
			if ($skip == length($q1)) { 		# should not fail, as that would have happened on 3' trim
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			}  
			# integrated to zero?  trim before position where quality was below threshold 
	 		# unpack is one based
			$s1 = unpack("x$skip A*",$s1);
			$q1 = unpack("x$skip A*",$q1);
			#$s1 = substr($s1,$skip);
			#$q1 = substr($q1,$skip);

			# second 
			$skip = 0; #length($q1);
			my @qarray = unpack("C*",$q2);
			#$maxPos = $skip;
			while ( $skip < length($q2) ) {
				if ($tq > $qarray[$skip] - 33) {
					# score at position is below threshold
					$skip++;
					next;
				}
				else {
					# score is at or above threshold
					#$maxPos = $skip;
					last;
				}
			}
			if ($skip == length($q2)) { 		# should not fail, as that would have happened on 3' trim
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			}  
			# integrated to zero?  trim before position where quality was below threshold 
 			# unpack is one based
			$s2 = unpack("x$skip A*",$s2);
			$q2 = unpack("x$skip A*",$q2);
			#$s2 = substr($s2,$skip);
			#$q2 = substr($q2,$skip);

		}
		## DISCARD IF TOO SHORT ##	
		if (length($s1) < $minlength || length($s2) < $minlength) {
			$discarded++;
			$failedstring .= $read .#. '|';
			next;

		}

		# queue for printing
		my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2);
		$printstring .= join('~',@arr) . '|';  # these two are not in quality string, and can thus be used as seperator
							# | seperates pairs of reads; ~ seperates lines of single read entry.
	   }
	   if ($printstring ne '') {
		   $toprint->enqueue(pack("A*X",$printstring));  # removes trailing |
		   #$toprint->enqueue($printstring);
	   }
	   if ($failedstring ne '') {
		   #$failed->enqueue(pack("A*X",$failedstring));
		   $failed->enqueue($failedstring);	
	   }
	}
}

sub bwasingle {
	# local variables speed up threading
	my $tq = $trimq;
	while ( defined(my $rstring = $totrim->dequeue())){
	  my @reads = split(/\|/,$rstring);
          my $printstring = '';
	  my $failedstring = '';
          foreach (@reads) {
		my $read = $_;
		my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read);
		## skip too short reads
		if (length($q1) <  $minlength ) {
			
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;
		}
		# initial check for maximal quality value. 
		## quality arrays 
		my @q1array = unpack("C*",$q1);
		if (max(@q1array) < $tq + 33) {
			print "Discarding on failed qualities\n$read\n";
			$discarded++;
			$failedstring .= $read;# . '|';
			next;
		}

		#############
		## trim 3' ##
		#############
		if ($trim3 == 1) {
			my $pos = length($q1) - 1;
			my $maxPos = $pos;
			## skip empty read.
			#if ($pos <= 0) {   
			#	$discarded++;
			#	$failedstring .= $read . '|';	
			#	next;
			#}  
			my $area = 0;
			my $maxArea = 0;
			#my @qarray = unpack("C*",$q1);
			while ($pos>0 && $area>=0) {
				$area += $tq - ($q1array[($pos)] - 33);
				## not <=  for more aggressive trimming : if multiple equal maxima => use shortest read.
				if ($area < $maxArea) {
					$pos--;
					next;
				}
				else {
					$maxArea = $area;
					$maxPos = $pos;
					$pos--;
					next;
				}
			}
			## The online perl approach is wrong if the high-quality part is not long enough to reach 0.
			if ($maxPos == 0) {  # maximal badness on position 0 => no point of no return reached. 
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			}  
			# otherwise trim before position where area reached a maximum 
			# $maxPos as length of substr gives positions zero to just before the maxpos.
			$s1 = unpack("A$maxPos",$s1);
			$q1 = unpack("A$maxPos",$q1);

		}
		#############
		## trim 5' ##
		#############
		if ($trim5 == 1) {
			my $skip = 0; #length($q1);
			my $maxPos = $skip;
			my $area = 0;
			my $maxArea = 0;
			my @qarray = unpack("C*",$q1);
			while ($skip<length($q1) && $area>=0) {
				#$area += $tq - (ord(unpack("x$skip A1",$q1)) - 33);
				$area += $tq - ($qarray[($skip)] - 33);
				if ($area < $maxArea) {
					$skip++;
					next;
				}
				else {
					$maxArea = $area;
					$maxPos = $skip;
					$skip++;
					next;
				}
			}
			#if ($skip==length($q1)) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read
			if ($maxPos == (length($q1)-1)) { # => maximal value at end of string => no point of no return found.
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			}  
			# trim after position where area reached a maximum
			$s1 = unpack("x$maxPos A*",$s1);
			$q1 = unpack("x$maxPos A*",$q1); 
			#$s1 = substr($s1,($maxPos+1));
			#$q1 = substr($q1,($maxPos+1));
		}
		## DISCARD IF TOO SHORT ##	
		if (length($s1) < $minlength) {
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;

		}

		# queue for printing
		my @arr = ($h1a,$s1,$h1b,$q1);
		$printstring .= join("\n",@arr) . "\n";  # join components by newline character for printing 
	   }
	   if ($printstring ne '') {
		   #$toprint->enqueue(pack("A*X",$printstring));  # removes trailing |
		   $toprint->enqueue($printstring);
	   }
	   if ($failedstring ne '') {
		   #$failed->enqueue(pack("A*X",$failedstring));
		   $failed->enqueue($failedstring);	
	   }

	}
}


sub simplesingle {
	# local variables speed up threading
	my $tq = $trimq;
	while ( defined(my $rstring = $totrim->dequeue())){
	  my @reads = split(/\|/,$rstring);
          my $printstring = '';
	  my $failedstring = '';
          foreach (@reads) {
		my $read = $_;
		my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read);
		## skip too short reads
		if (length($q1) <  $minlength ) {
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;
		}
		# initial check for maximal quality value. 
		## quality arrays 
		my @q1array = unpack("C*",$q1);
		if (max(@q1array) < $trimq + 33) {
			#print "Discarding on failed qualities\n$read\n";
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;
		}

		#############
		## trim 3' ##
		#############
		if ($trim3 == 1) {
			my $pos = length($q1);
			my $maxPos = $pos;
			#if ($pos <= 0) {   
			#	$discarded++;
			#	$failedstring .= $read . '|';	
			#	next;
			#} 
			#my @qarray = unpack("C*",$q1);

			while ( $pos>0 ) {
				if ($tq >  $q1array[$pos] - 33) {
					# score at position is below threshold
					$pos--;
					next;
				}
				else {
					# score is at or above threshold
					$maxPos = $pos;
					last;
				}
			}
			if ($pos==0) { 		# scanned whole read and didn't integrate to zero? skip (no output for this read)
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			}  
			# integrated to zero?  trim before position where quality was below threshold 
	 		# unpack is one based
			$s1 = unpack("A$maxPos",$s1);
			$q1 = unpack("A$maxPos",$q1);
			#$s1 = substr($s1,0,$maxPos);
			#$q1 = substr($q1,0,$maxPos);

		}
		#############
		## trim 5' ##
		#############
		if ($trim5 == 1) {
			my $skip = 0; #length($q1);
			#my $maxPos = $skip;
			my @qarray = unpack("C*",$q1);

			while ( $skip < length($q1) ) {
				if ($tq >  $qarray[$skip] - 33) {
					# score at position is below threshold
					$skip++;
					next;
				}
				else {
					# score is at or above threshold
					#$maxPos = $skip;
					last;
				}
			}
			if ($skip == length($q1)) { 		
				$discarded++;
				$failedstring .= $read ;#. '|';	
				next;
			}  
			# integrated to zero?  trim before position where quality was below threshold 
	 		# unpack is one based
			#$s1 = substr($s1,$skip);
			#$q1 = substr($q1,$skip);
			$s1 = unpack("x$skip A*",$s1);
			$q1 = unpack("x$skip A*",$q1);


		}
		## DISCARD IF TOO SHORT ##	
		if (length($s1) < $minlength) {
			$discarded++;
			$failedstring .= $read ;#. '|';
			next;

		}


		# queue for printing
		my @arr = ($h1a,$s1,$h1b,$q1);
		$printstring .= join("\n",@arr) . "\n";  # join components by newline character for printing 
	   }
	   if ($printstring ne '') {
		   $toprint->enqueue($printstring);  # single reads => no | present, just \n
	   }
	   if ($failedstring ne '') {
		   #$failed->enqueue(pack("A*X",$failedstring));
		   $failed->enqueue($failedstring);	
	   }

	}
}

sub printout {
	if ($rand eq '' || !defined($rand)) {
		die('Random value not passed on!');
	}
	if ($paired == 0) {
		my $counter = 0;
		my $string1 = '';
		if (-e "/tmp/$rand.fq") {
			unlink("/tmp/$rand.fq");
		}
		while (defined(my $string = $toprint->dequeue())) {	# each item represents 10000 input reads (might be less by trimming)
			#my @array = split(/\|/,$printstring);
			#$done = $done + scalar(@array);
			$done = $done + 10000;
			$string1 .= $string;
			$counter++;
			#print batches of 500,000 reads
			if ($counter == 50) {
				open OUT1, ">>/tmp/$rand.fq";
				print OUT1 $string1;
				close OUT1;
				$string1 = '';
				#$string2 = '';
				$counter = 0;
			}
		}
		## print last
		open OUT1, ">>/tmp/$rand.fq";
		print OUT1 $string1;
		close OUT1;
		$finish = 1;

  	}
	else {
		my $counter = 0;
		my $string1 = '';
		my $string2 = '';
		if (-e "/tmp/$rand.fq") {
			unlink("/tmp/$rand.fq");
		}
		if (-e "/tmp/$rand.fq") {
			unlink("/tmp/$rand.rq");
		}
  		while (defined(my $printstring = $toprint->dequeue())) {	# each item represents 10000 input reads
			my @array = split(/\|/,$printstring);
			$done = $done + scalar(@array);
			foreach (@array) {
				my @subarr = split(/\~/,$_);
				$string1 .= join("\n",@subarr[0..3])."\n";
				$string2 .= join("\n",@subarr[4..7])."\n";
			}
			$counter++;
			if ($counter == 50) {
				#print "Printing!\n";
				flock("/tmp/sg.write.lock",2);
				open OUT1, ">>/tmp/$rand.fq";
				print OUT1 $string1;
				close OUT1;
				open OUT2, ">>/tmp/$rand.rq";
				print OUT2 $string2;
				close OUT2;
				flock("/tmp/sg.write.lock",8);
				$string1 = '';
				$string2 = '';
				$counter = 0;
			}
		}
		## print last
		flock("/tmp/sg.write.lock",2);
		open OUT1, ">>/tmp/$rand.fq";
		print OUT1 $string1;
		close OUT1;
		open OUT2, ">>/tmp/$rand.rq";
		print OUT2 $string2;
		close OUT2;
		flock("/tmp/sg.write.lock",8);
		$finish = 1;
	}

}

sub printfailed {
	if (-e "/tmp/$rand.failed") {
		unlink("/tmp/$rand.failed");
	}
	#open FAIL, ">/tmp/$rand.failed" or die("could not open file with failed reads, '/tmp/$rand.failed'");
	#FAIL->autoflush(1);
	my $nr = 0;
	my $string = '';
	while ( defined(my $rstring = $failed->dequeue())) {
		#my @array = split(/\|/,$rstring);
		#foreach (@array) {
			$string .= $rstring;
		#}
		$nr++;
		if ($nr == 50) {
			open FAIL, ">>/tmp/$rand.failed";
			print FAIL $string;
			close FAIL;
			$nr = 0;
			$string = '';
		}
		#close FAIL;
	}
	open FAIL, ">>/tmp/$rand.failed";
	print FAIL $string;
	close FAIL;	
}
sub monitor {
	# checks the file read monitor
	my $counter = 0;
	while ($finish == 0) {
		$counter++;
		if ($totrim->pending() > 200) {
			lock($dummy);
			#print "Putting the file reading to sleep for 10 seconds.\n";
			sleep 5;
		}
		else {
			sleep 5;
		}
		if ($counter == 6) {
			if ($verbose == 1) {
				my $string =  ($done/1000000)."M Reads Passed, ".($discarded/1000000)."M reads discarded (".$totrim->pending().".10^4 pending)\n";
				print $string ;
			}
			$counter = 0;
		}
		
	}
}