Mercurial > repos > fcaramia > custom_amplicon_alignment

diff alignCustomAmplicon/alignCustomAmplicon.pl @ 0:d32bddcff685 draft
Uploaded
author: fcaramia
date: Wed, 09 Jan 2013 00:24:09 -0500
children: 22f35f830f48
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alignCustomAmplicon/alignCustomAmplicon.pl	Wed Jan 09 00:24:09 2013 -0500
@@ -0,0 +1,1825 @@
+#!/usr/bin/perl
+
+# script to convert paired end reads to a concensus sequence
+
+# Created by Jason Ellul, Feb 2012
+# Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012
+# 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly)
+#     Added missalignment penalty
+#     Read 1 and 2 are now aligned first then they are aligned to the reference
+# 0.5 Added ability to name amplicons by includinfo the info in the primers file
+#     Added the ability to output the alignments in a particular region using the new flag -a
+#     Added some extra stats to the output
+
+use strict;
+use warnings;
+use Getopt::Std;
+use File::Basename;
+use File::Find;
+use File::Copy;
+use File::MMagic;
+use File::Temp qw/ tempfile tempdir /;
+use Cwd 'abs_path';
+use threads;
+use threads::shared;
+use List::Util qw(sum);
+use Thread::Queue;
+use String::Approx 'adistr';
+use POSIX qw(floor ceil);
+String::Approx::cache_max(25000);
+String::Approx::cache_purge(0.25);
+use constant false => 0;
+use constant true  => 1;
+use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"';
+use Inline CPP => <<'END_CPP';
+
+
+
+
+int levenshtein(char* seq1, char* seq2)
+{
+	// Returns edit distance of 2 sequences
+	string sam1(seq1);
+	string sam2(seq2);
+	return levenshteinDistance(sam1, sam2);
+}
+
+double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug ) 
+{
+	// Returns Sequence alignment of 2 sequences
+	// res1, res2 contain the alignment
+	//seq1: reference sequence
+	//seq2: amplicon sequence
+	//matchScore: reward for matching a nucleotide 
+	//gapPenOpen: penalty for opening a gap
+	//gapPenext: penalty for extending a gap
+	//noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment
+	//noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment
+	//debug: prints scores, alignments, etc...
+	
+	string sam1(seq1);
+	string sam2(seq2);
+	double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);	
+	//Setting the return alignments
+	sv_setpvn(res1, (char*)sam1.c_str(), sam1.length());
+	sv_setpvn(res2, (char*)sam2.c_str(), sam2.length());	
+	return ret;
+}
+
+void adjustSequence(int refSize, string& seq, char dir)
+{
+
+	int adjust = refSize - seq.length();
+	if(dir == 'r')
+		for(int i=0; i<adjust; ++i) seq = '-' + seq; 
+	else
+		for(int i=0; i<adjust; ++i) seq += '-'; 
+}
+
+double align3(char* ref, char* right,char* left , SV* res1, SV* res2, SV* res3, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool leftAndRightGapPen, bool debug ) 
+{
+	// Returns Sequence alignment of 3 sequences
+	// res1, res2, res3 contain the alignment
+	//ref: reference sequence
+	//left: left amplicon sequence
+	//right: right amplicon sequence
+	//matchScore: reward for matching a nucleotide 
+	//gapPenOpen: penalty for opening a gap
+	//gapPenext: penalty for extending a gap
+	//leftAndRightGapPen: Smart way to penalise according to the amplicon
+	//debug: prints scores, alignments, etc...
+	
+	
+	string leftSeq(left);
+	string rightSeq(right);
+	
+	bool noFrontGapPenalty = false;
+	bool noEndGapPenalty = false;
+	
+	//Align right with left reads
+	if(leftAndRightGapPen)
+	{
+		noFrontGapPenalty = true;
+		noEndGapPenalty = true;
+	}
+	
+	//first alignment the 2 reads
+	double ret = nw_align2(rightSeq, leftSeq, matchScore, missmatch_penalty*8, gapPenOpen*5, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);	
+		
+	//Setting the return alignments
+	vector <string> msa;
+	vector <string> partial;
+	vector <string> seq;
+
+	partial.push_back(rightSeq);
+	partial.push_back(leftSeq);
+	seq.push_back(ref);
+		
+	msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug);
+	sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length());
+	sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length());
+	sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length());
+	return ret;
+}
+
+END_CPP
+
+$| = 1;
+
+# Grab and set all options
+my $version = "0.5";
+my %OPTIONS = (p => 1);
+getopts('a:dl:L:o:p:rsv', \%OPTIONS);
+
+die qq(version $version
+Usage:   alignCustomAmplicon.pl [OPTIONS] <ref> <fastq[.gz] 1> <fastq[.gz] 2> <primers>
+
+OPTIONS:		-a	STR	print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop]
+			-d		print debug info
+			-l	STR	filename of the log file [null]
+			-L	STR	append to an existing log file [null]
+			-o	STR	filename for bam file
+			-p	INT	number of threads [$OPTIONS{p}]
+			-r		do not create stats report
+			-s		sort the bam file
+			-v		verbose progress
+) if(@ARGV != 4);
+
+my $Script = 'alignCustomAmplicon';
+my $ref = shift @ARGV;
+my $fastq1 = shift @ARGV;
+my $fastq2 = shift @ARGV;
+my $primers = shift @ARGV;
+my $now;
+
+#create log file
+if(defined $OPTIONS{l}) {
+	my $logfile = $OPTIONS{l};
+	open (FH, ">$logfile") or die 'Couldn\'t create log file!\n';
+	close (FH);
+	# Open the log file and redirect output to it
+	open (STDERR, ">>$logfile");
+	open (STDOUT, ">>$logfile");
+	my $now = localtime time;
+	print "Log File Created $now\n\n";
+} elsif(defined $OPTIONS{L}) {
+	#append to a log file
+	my $logfile = $OPTIONS{L};
+	# Open the log file and redirect output to it
+	open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n';
+	open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n';
+	my $now = localtime time;
+	print "Appending To Log File $now\n\n";
+}
+
+# print version of this script
+print "Using $Script version $version \n\n";
+
+if(defined $OPTIONS{d}) {
+	print "Setting threads to 1 as using debug\n";
+	$OPTIONS{p} = 1;
+}
+
+# grab sample name, set output file name and open some files
+my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz'));
+my $sampleName :shared = $name;
+$sampleName =~ s/^(.*?_.*?)_.*/$1/;
+open(PRIM, $primers) or die "can't open $primers: $!\n";
+
+# set samfile name
+my $bamfile_out = "${sampleName}_aligned.bam";
+if(defined $OPTIONS{o}) {
+	$bamfile_out = $OPTIONS{o};
+}
+my $bam_to_sort = $bamfile_out;
+if(defined $OPTIONS{s}) {
+	$bam_to_sort .= '.tmp';
+}
+
+my $stats = $bamfile_out;
+if(!defined $OPTIONS{r}) {
+	$stats =~ s/\..*?$//;
+	open(STATS, ">${stats}_stats.csv");
+}
+
+# read in primer lengths
+my %refFrags;
+$refFrags{min} = 5000000;
+while(my $line = <PRIM>) {
+	chomp $line;
+	my @line_sp = split("\t", $line);
+	$refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1];
+	$refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2];
+	$refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2];
+	$refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1];
+	$refFrags{name}{$line_sp[0]} = $line_sp[3];
+	$refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]);
+}
+close(PRIM);
+my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} };
+
+my %regions;
+if(defined $OPTIONS{a}) {
+	my ($chr, $start, $end) = split_region($OPTIONS{a});
+	foreach my $reg (@OrderedContigs) {
+		my ($chr_reg, $start_reg, $end_reg) = split_region($reg);
+		$regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start;
+	}
+}
+
+# grab fasta file
+my %refSeq;
+read_fasta_file($ref, \%refSeq);
+
+# calculate read length and read group
+my $readGroup :shared;
+my $readLength;
+my $mm = new File::MMagic;
+my $filetype = $mm->checktype_filename(abs_path($fastq1));
+if($filetype eq "application/x-gzip") {
+	my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`;
+	chomp $sampleRead;
+	$readLength = length($sampleRead);
+	$readGroup = `gunzip -c $fastq1 | head -n 1`;
+	chomp $readGroup;
+	$readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
+} elsif($filetype eq "text/plain") {
+	my $sampleRead = `head -n 2 $fastq1 | tail -n 1`;
+	chomp $sampleRead;
+	$readLength = length($sampleRead);
+	$readGroup = `head $fastq1 -n 1`;
+	chomp $readGroup;
+	$readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
+} else {
+	die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n");
+}
+
+# create hash to keep track of maxdiff and readlengths
+my %maxdiff :shared;
+
+# create hash to keep track of stats
+my %stats :shared;
+$stats{Amb} = 0;
+$stats{PoorQualMates} = 0;
+$stats{PoorMapQual} = 0;
+$stats{PoorQual1} = 0;
+$stats{PoorQual2} = 0;
+
+# setup other shared variables and caches
+my %cache :shared;
+my $cacheHits :shared = 0;
+my %cacheAlign :shared;
+$cacheAlign{Conc} = &share( {} );
+my $cacheAlignHits :shared = 0;
+my %refLength :shared;
+
+# extract primers and fragments on forward and reverse strand and initialize counts
+foreach my $key (@OrderedContigs) {
+	push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length});
+	push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length});
+	push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length}));
+	push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length}));
+	push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length});
+	push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length});
+	push @{ $refFrags{reg} }, ($key, $key);
+	push @{ $refFrags{strand} }, ('F', 'R');
+		
+	# calculate new contig name with primers removed
+	my ($chr, $start, $stop) = split(":|-", $key);
+	$start += $refFrags{primers}{$key}{U}{F}{length};
+	$stop -= $refFrags{primers}{$key}{D}{F}{length};
+	$refLength{$key} = &share( {} );
+	$refLength{$key}{newReg} = "$chr:$start-$stop";
+	$refLength{$key}{length} = $stop - $start + 1;
+
+	$cacheAlign{$key} = &share( {} );
+	$cacheAlign{$key}{R1} = &share( {} );
+	$cacheAlign{$key}{R2} = &share( {} );
+
+	$stats{$key} = &share( {} );
+	$stats{$key}{Reads} = 0;
+	$stats{$key}{Bases} = 0;
+	$stats{$key}{Quality} = 0;
+	$stats{$key}{Mapping} = 0;
+	$stats{$key}{Read1} = 0;
+	$stats{$key}{Read2} = 0;
+	$stats{$key}{ConcMapping} = 0;
+	$stats{$key}{ConcReads} = 0;
+	$stats{$key}{NumberForward} = 0;
+	$stats{$key}{NumberReverse} = 0;
+	$stats{$key}{ConcErrorsBases} = 0;
+	$stats{$key}{ConcCorrectBases} = 0;
+	$stats{$key}{MinLength} = 100000;
+	$stats{$key}{MaxLength} = 0;
+	$stats{$key}{TotalA} = 0;
+	$stats{$key}{TotalC} = 0;
+	$stats{$key}{TotalG} = 0;
+	$stats{$key}{TotalT} = 0;
+	$stats{$key}{TotalN} = 0;
+	$stats{$key}{PoorMapQual} = 0;
+}
+
+# start up worker threads
+my $FindContig = Thread::Queue->new();
+my $ToAlign = Thread::Queue->new();
+my $Aligned = Thread::Queue->new();
+my $Samfile = Thread::Queue->new();
+my $WriteBam = Thread::Queue->new();
+my @threadArray;
+for(my $index = 1; $index <= $OPTIONS{p}; $index++) {
+	push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam);
+	push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]);
+	push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]);
+}
+push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions);
+push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam);
+
+# read in each sequence and its mate from the fastq file 
+my $read1;
+my $read2;
+my $seq1;
+my $seq2;
+my $qual1;
+my $qual2;
+my $desc;
+my @strand;
+my $cont = 1;
+
+# open fastq files
+my $gz1;
+my $gz2;
+my $f1gz = 0;
+my $f2gz = 0;
+if($filetype eq "application/x-gzip") {
+	open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n";
+} else {
+	open(FQ1, $fastq1) or die "$fastq1: $!";
+}
+
+$filetype = $mm->checktype_filename(abs_path($fastq2));;
+if($filetype eq "application/x-gzip") {
+	open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n";
+} elsif($filetype eq "text/plain") {
+	open(FQ2, $fastq2) or die "$fastq2: $!";
+} else {
+	die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n");
+}
+
+# add reads to the queue
+my $readNo = 0;
+while ($cont) {
+	if($read1 = <FQ1>) { # read 1 and 2 read name
+		$seq1 = <FQ1>; # read 1 sequence
+		$desc = <FQ1>; # read 1 desc
+		$qual1 = <FQ1>; # read 1 quality
+	} else {
+		last;
+	}
+
+	if($read2 = <FQ2>) { # read 2 read name
+		$seq2 = <FQ2>; # read 2 sequence
+		$desc = <FQ2>; # read 2 sdesc
+		$qual2 = <FQ2>; # read 2 quality
+	} else {
+		last;
+	}
+	
+	# read 1 the forward read
+	chomp $read1;
+	chomp $seq1;
+	chomp $qual1;
+	$read1 =~ s/^\@(.*)\s?.*/$1/;
+	
+	# read 2 the reverse read
+	chomp $read2;
+	chomp $seq2;
+	$seq2 = revcompl($seq2);
+	chomp $qual2;
+	$qual2 = reverse($qual2);
+
+	$read2 =~ s/^\@(.*)\s?.*/$1/;
+	$readNo++;
+
+	# keep at most 20 reads in buffer
+	while(1) {
+		if(defined $OPTIONS{v}) {
+			my $percAmb = $stats{Amb} / $readNo * 100;
+			my $percPoor = $stats{PoorQualMates} / $readNo * 100;
+			my $percPoorMap = $stats{PoorMapQual} / $readNo * 100;
+			my $percPoor1 = $stats{PoorQual1} / $readNo * 100;
+			my $percPoor2 = $stats{PoorQual2} / $readNo * 100;
+			$percAmb = sprintf("%.2f", $percAmb);
+			$percPoor = sprintf("%.2f", $percPoor);
+			$percPoorMap = sprintf("%.2f", $percPoorMap);
+			$percPoor1 = sprintf("%.2f", $percPoor1);
+			$percPoor2 = sprintf("%.2f", $percPoor2);
+			print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()."        ";
+		}
+		last if $FindContig->pending() <= 100;
+	}
+	my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2);
+	{
+		lock($FindContig); # Keep other threads from changing the queue's contents
+		$FindContig->enqueue(\@readArray);
+	}
+}
+close(FQ1);
+close(FQ2);
+
+# wait for all records to be processed and close down threads
+$FindContig->enqueue( (undef) x $OPTIONS{p} );
+while($FindContig->pending() > 0 ) {
+	next;
+}
+$ToAlign->enqueue( (undef) );
+while($ToAlign->pending() > 0 ) {
+	next;
+}
+$Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
+while($Aligned->pending() > 0 ) {
+	next;
+}
+$Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
+while($Samfile->pending() > 0 ) {
+	next;
+}
+$WriteBam->enqueue( undef );
+
+$_->join() for @threadArray;
+
+# sort the bam file
+if(defined $OPTIONS{s}) {
+	system("MergeSamFiles.sh  MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true");
+	unlink $bam_to_sort;
+}
+
+if(!defined $OPTIONS{r}) {
+	# print out statistics
+	# setup variables
+	my $totalRegionLengths = 0;
+	my $totalReads = 0;
+	my $totalFwd = 0;
+	my $totalRev = 0;
+	my $totalMatch = 0;
+	my $totalMiss = 0;
+	my $totalBases = 0;
+	my $totalRead1 = 0;
+	my $totalRead2 = 0;
+	my $totalA = 0;
+	my $totalC = 0;
+	my $totalG = 0;
+	my $totalT = 0;
+	my $totalN = 0;
+	my $totalConc = 0;
+	my $totalQuality = 0;
+	my $AllmaxRead = 0;
+	my $AllminRead = 100000;
+	my $avequal = 'NA';
+	my $aveRead = 'NA';
+	my $perA = 'NA';
+	my $perC = 'NA';
+	my $perG = 'NA';
+	my $perT = 'NA';
+	my $perN = 'NA';
+	my $readsUsed = 0;
+	my @coverage;
+	my @off_coverage;
+
+	# print column headers
+	print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,';
+	print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n";
+
+	# for each region
+	foreach my $key (@OrderedContigs){
+		# if we have some reads calculate averages
+		if($stats{$key}{Reads} > 0) {
+			$avequal = $stats{$key}{Quality} / $stats{$key}{Bases};
+			$avequal = sprintf("%.4f", $avequal);
+			$aveRead = $stats{$key}{Bases} / $stats{$key}{Reads};
+			$aveRead = sprintf("%.4f", $aveRead);
+			$perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100;
+			$perA = sprintf("%.2f", $perA);
+			$perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100;
+			$perC  = sprintf("%.2f", $perC);
+			$perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100;
+			$perG  = sprintf("%.2f", $perG);
+			$perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100;
+			$perT = sprintf("%.2f", $perT);
+			$perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100;
+			$perN = sprintf("%.2f", $perN);
+			$AllminRead = min([$AllminRead, $stats{$key}{MinLength}]);
+			$AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]);
+			$readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads};
+		} else {
+			$stats{$key}{MaxLength} = 'NA';
+			$stats{$key}{MinLength} = 'NA';
+		}
+	
+		# if we have reads which overlapped calculate an error rate
+		my $errRate = 'NA';
+		if($stats{$key}{ConcReads} > 0) {
+			$errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases});
+		}
+
+		# print the stats for this region
+		print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key};
+		print STATS "Unknown," unless defined $refFrags{name}{$key};
+		print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},";
+		print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},";
+		print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n";
+
+		# add stats to grand totals
+		$totalRegionLengths += $refLength{$key}{length};
+		$totalQuality += $stats{$key}{Quality};
+		$totalReads += $stats{$key}{Reads};
+		$totalFwd += $stats{$key}{NumberForward};
+		$totalRev += $stats{$key}{NumberReverse};
+		$totalBases += $stats{$key}{Bases};
+		$totalRead1 += $stats{$key}{Read1};
+		$totalRead2 += $stats{$key}{Read2};
+		$totalA += $stats{$key}{TotalA};
+		$totalC += $stats{$key}{TotalC};
+		$totalT += $stats{$key}{TotalT};
+		$totalG += $stats{$key}{TotalG};
+		$totalN += $stats{$key}{TotalN};
+		$totalMatch += $stats{$key}{ConcCorrectBases};
+		$totalMiss += $stats{$key}{ConcErrorsBases};
+		$totalConc += $stats{$key}{ConcReads};
+	
+		# reset ave scores
+		$avequal = 'NA';
+		$aveRead = 'NA';
+		$perA = 'NA';
+		$perC = 'NA';
+		$perG = 'NA';
+		$perT = 'NA';
+		$perN = 'NA';
+		$readsUsed = 0;
+		
+		if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) {
+			push @off_coverage, $stats{$key}{Reads};
+		} else {
+			push @coverage, $stats{$key}{Reads};
+		}
+	}
+
+	# if we have any reads calculate the averages
+	if($totalReads > 0) {
+		$avequal = $totalQuality / $totalBases;
+		$avequal = sprintf("%.4f", $avequal);
+		$aveRead = $totalBases / $totalReads;
+		$aveRead = sprintf("%.4f", $aveRead);
+		$perA = $totalA / $totalBases * 100;
+		$perA = sprintf("%.2f", $perA);
+		$perC = $totalC / $totalBases * 100;
+		$perC  = sprintf("%.2f", $perC);
+		$perG = $totalG / $totalBases * 100;
+		$perG  = sprintf("%.2f", $perG);
+		$perT = $totalT / $totalBases * 100;
+		$perT = sprintf("%.2f", $perT);
+		$perN = $totalN / $totalBases * 100;
+		$perN = sprintf("%.2f", $perN);
+		$readsUsed = $totalReads + $totalConc;
+	} else {
+		$AllmaxRead = 'NA';
+		$AllminRead = 'NA';
+	}
+
+	# if we had any reads which overlapped calculate error rate
+	my $errRate = 'NA';
+	if($totalConc > 0) {
+		$errRate = $totalMiss / ($totalMiss + $totalMatch);
+	}
+
+	# print out totals
+	my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100);
+	my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100);
+	my $TotalReads = $readNo * 2;
+	my $Unmapped = $TotalReads - $readsUsed;
+	my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100);
+	my $avCov = sprintf("%.0f", sum(@coverage)/@coverage);
+	my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage;
+	my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage);
+	my $pcgreater = sprintf("%.2f", $greater/@coverage * 100);
+
+	print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,";
+	print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n";
+	print STATS "\nNo mate pairs:,$readNo\n";
+	print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n";
+	print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n";
+	print STATS "Total Number Of Reads:,$TotalReads\n";
+	print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n";
+	print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n";
+	print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n";
+	print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage;
+	print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n";
+	$greater = scalar (grep { $_ < 200 } @coverage);
+	print STATS "$greater,";
+	$greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage);
+	print STATS "$greater,";
+	$greater = scalar (grep { $_ >= 1000 } @coverage);
+	print STATS "$greater\n";
+	
+	print "Cache hits align contig: $cacheHits\n";
+	print "Alignment cache hits: $cacheAlignHits\n";
+	close(STATS);
+}
+
+# close the log file
+if(defined $OPTIONS{l} || defined $OPTIONS{L}) {
+	my $now = localtime time;
+	print "\nLog File Closed $now\n";
+	close(STDERR);
+	close(STDOUT);
+}
+
+# thread to find matching amplicon
+sub thread_find_contig {
+	my $FindContig = shift;
+	my $refFrags = shift;
+	my $refSeq = shift;
+	my $ToAlign = shift;
+	my $Aligned = shift;
+	my $WriteBam = shift;
+	my $readArrayRef;
+	{
+		lock($FindContig); # Keep other threads from changing the queue's contents
+		$readArrayRef = $FindContig->dequeue();
+	}
+
+	while(defined $readArrayRef) {
+		my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef };
+		my $contig = '';
+		my $readRef = '';
+		my $strand = '';	
+		
+		# trim the reads
+		my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1);	
+		my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2);
+
+		# if we have already mapped the reads use the results in the cache
+		if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
+			my @array;
+			{ lock(%cacheAlign);  @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
+			$array[0] = $read1;
+			$array[3] = $qtrim1;
+			$array[6] = $qtrim2;
+			{ lock($cacheAlignHits); $cacheAlignHits++ };
+					
+			# add aligned reads to queue
+			{
+				lock($Aligned); # Keep other threads from changing the queue's contents
+				$Aligned->enqueue(\@array);
+			}
+		} else {
+			# Use approx match to find most similar amplicon primer to reads
+			my @index1;
+			my @index2;
+			my $index_read1 = -2;
+			my $index_read2 = -2;
+			my $index_found = 0;
+		
+			if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) {
+				# we have at least 1 good read
+				# find closest matching amplicon for read 1
+				my $string = substr($seq1, 0, $$refFrags{min});
+				if(defined $cache{"$string.1"}) {
+					$cacheHits++;
+					{ lock(%cache); @index1 = @{ $cache{"$string.1"} } };
+				} else {
+					@index1 = amatch($string, \@{ $$refFrags{seqU} });
+					{
+						lock(%cache);
+						$cache{"$string.1"} = &share( [] );
+						@{ $cache{"$string.1"} } = @index1;
+					}
+				}
+
+				# find closest matching amplicon for read 2
+				$string = substr($seq2, -$$refFrags{min});
+				if(defined $cache{"$string.2"}) {
+					$cacheHits++;
+					{ lock(%cache); @index2 = @{ $cache{"$string.2"} } };
+				} else {
+					@index2 = amatch($string, \@{ $$refFrags{seqD} });
+					{
+						lock(%cache);
+						$cache{"$string.2"} = &share( [] );
+						@{ $cache{"$string.2"} } = @index2;
+					}
+				}
+						
+				if($#index1 > 0 || $#index2 > 0) {
+					# if one read maps to multiple amplicons and the other read matches to just one of them use that
+					my %in_array2 = map { $_ => 1 } @index2;
+					my @common = grep { $in_array2{$_} } @index1;
+					if($#common == 0) {
+						@index1 = @common;
+						@index2 = @common;
+					} elsif($#common > 0) {
+						if($#index1 > 0) {
+							# matched to multiple primers so try and use whole read
+							my @refs;
+							foreach(@index1) {
+								if($$refFrags{strand}[$_] eq "F") {
+									push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
+								} else {
+									push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
+								}
+							}
+							my @index1b = amatch($seq1, \@refs);
+							if($#index1b >= 0) {
+								@index1 = @index1[@index1b];
+							}
+						}
+						if($#index2 > 0) {
+							# matched to multiple primers so try and use whole read
+							my @refs;
+							foreach(@index2) {
+								if($$refFrags{strand}[$_] eq "F") {
+									push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
+								} else {
+									push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
+								}
+							}
+							my @index2b = amatch($seq2, \@refs);
+							if($#index2b >= 0)  {
+								@index2 = @index2[@index2b];
+							}
+						}
+				
+						# if one read still maps to multiple amplicons and the other read matches to just one of them use that
+						%in_array2 = map { $_ => 1 } @index2;
+						@common = grep { $in_array2{$_} } @index1;
+						if($#common == 0) {
+							@index1 = @common;
+							@index2 = @common;
+						}
+					}
+				}
+		
+				# if read 1 and read 2 match to the same amplicon assign it
+				if($#index1 == 0 && @index1 ~~ @index2 ) {
+					$index_read1 = $index1[0];
+					$index_read2 = $index2[0];
+				} else {
+					$index_read1 = -1;
+					$index_read2 = -1;
+				} 	
+		
+				# if any read after trimming is less than half the original length discard
+				if(length($trimmed1) < length($seq1)/2) {
+					$index_read1 = -2;				
+				}
+			
+				if(length($trimmed2) < length($seq2)/2) {
+					$index_read2 = -2;
+				}
+			}
+		
+			if($index_read1 > -1 || $index_read2 > -1) {
+				my $conc = 0;
+				my %align;
+			
+				# at least 1 read matched to an amplicon
+				if($index_read1 > -1) {
+					$contig = $$refFrags{reg}[$index_read1];
+					$readRef = $$refSeq{$contig};
+					$strand = $$refFrags{strand}[$index_read1];
+				} elsif($index_read2 > -1) {
+					$contig = $$refFrags{reg}[$index_read2];
+					$readRef = $$refSeq{$contig};
+					$strand = $$refFrags{strand}[$index_read2];
+				}
+			
+				if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) {
+					# reads must overlap so can use concensus
+					$conc = 1;
+				} else {
+					my $substr1;
+					my $substr2;
+					for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) {
+						$substr1 = substr($trimmed1, -$length);
+						$substr2 = substr($trimmed2, 0, $length);
+						if($substr1 eq $substr2) {
+							$conc = 1;
+							last;
+						} else {
+							# calculate max diff allowed
+							if(!defined $maxdiff{$length}) {
+								{ lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); }
+							}
+					
+							my $diff = levenshtein($substr1, $substr2 );
+							if($diff < $maxdiff{$length}) {
+								$conc = 1;
+								last;
+							}
+						}
+					}
+				}
+			
+				if($conc) {
+					# remove any trailing Ns
+					$trimmed1 = $seq1;
+					$trimmed2 = $seq2;
+					$trimmed1 =~ s/N*$//;
+					$trimmed2 =~ s/^N*//;
+					$qtrim1 = $qual1;
+					$qtrim2 = $qual2;
+					$qtrim1 = substr($qtrim1, 0, length($trimmed1));
+					$qtrim2 = substr($qtrim2 , -length($trimmed2));
+				}
+		
+				if($strand eq "R") {
+					my $trimmed1_tmp = revcompl($trimmed1);
+					my $qtrim1_tmp = reverse($qtrim1);
+					$trimmed1 = revcompl($trimmed2);
+					$qtrim1 = reverse($qtrim2);
+					$trimmed2 = $trimmed1_tmp;
+					$qtrim2 = $qtrim1_tmp;
+					my $index_read_tmp = $index_read1;
+					$index_read1 = $index_read2;
+					$index_read2 = $index_read_tmp;
+				}
+			
+				if($conc) {
+					my $align;
+					my $search1 = $trimmed1;
+					$search1 =~ s/N/./g;
+					my $search2 = $trimmed2;
+					$search2 =~ s/N/./g;
+					if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) {
+						# the read matches exactly to the reference
+						$align{Ref1} = $$refSeq{$contig};
+						$align{Read1} = $$refSeq{$contig};
+						$align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
+						$align{Read1} =~ s/[^X]/-/g;
+						$align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ });
+						$align{Read2} = $$refSeq{$contig};
+						$align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/;
+						$align{Read2} =~ s/[^X]/-/g;
+						$align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ });
+						
+						# add aligned reads to queue
+						my @array;
+						{
+							lock($Aligned); # Keep other threads from changing the queue's contents
+							@array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
+							$Aligned->enqueue(\@array);
+						}
+					
+						# add alignment to cache
+						{
+							lock(%cacheAlign);
+							$cacheAlign{Conc}{$name} = &share( [] );
+							@{ $cacheAlign{Conc}{$name} } = @array;
+						}
+					} elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
+						my @array;
+						{ lock(%cacheAlign);  @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
+						$array[0] = $read1;
+						$array[3] = $qtrim1;
+						$array[6] = $qtrim2;
+						{ lock($cacheAlignHits); $cacheAlignHits++ };
+					
+						# add aligned reads to queue
+						{
+							lock($Aligned); # Keep other threads from changing the queue's contents
+							$Aligned->enqueue(\@array);
+						}
+					} else {
+					
+						# align reads using NW2
+						{
+							lock($ToAlign); # Keep other threads from changing the queue's contents
+							my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc);					
+							#my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2");
+							$ToAlign->enqueue(\@array);
+						}
+					}
+				}
+			
+				if(!$conc && $index_read1 > -1) {
+					my $search1 = $trimmed1;
+					$search1 =~ s/N/./g;
+					if($$refSeq{$contig} =~ /$search1/) {
+						# the read matches exactly to the reference
+						$align{Ref1} = $$refSeq{$contig};
+						$align{Read1} = $$refSeq{$contig};
+						$align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
+						$align{Read1} =~ s/[^X]/-/g;
+						$align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ });
+					
+						# add aligned reads to queue
+						{
+							lock($Aligned); # Keep other threads from changing the queue's contents
+							my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
+							$Aligned->enqueue(\@array);
+						}
+					} elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) {
+						my @array;
+						{ lock(%cacheAlign);  @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } };
+						$array[0] = $read1;
+						$array[3] = $qtrim1;
+						{ lock($cacheAlignHits); $cacheAlignHits++ };
+					
+						# add aligned reads to queue
+						{
+							lock($Aligned); # Keep other threads from changing the queue's contents
+							$Aligned->enqueue(\@array);
+						}
+					} else {
+						# align read using NW2
+						{
+							lock($ToAlign); # Keep other threads from changing the queue's contents
+							my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc);
+							#my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1);
+							$ToAlign->enqueue(\@array);
+						}
+					}
+				}
+			
+				if(!$conc && $index_read2 > -1) {
+					my $search2 = $trimmed2;
+					$search2 =~ s/N/./g;
+					if($$refSeq{$contig} =~ /$search2/) {
+						# the read matches exactly to the reference
+						$align{Ref2} = $$refSeq{$contig};
+						$align{Read2} = $$refSeq{$contig};
+						$align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/;
+						$align{Read2} =~ s/[^X]/-/g;
+						$align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ });
+
+						# add aligned reads to queue
+						{
+							lock($Aligned); # Keep other threads from changing the queue's contents
+							my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
+							$Aligned->enqueue(\@array);
+						}
+					} elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) {
+						my @array;
+						{ lock(%cacheAlign);  @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } };
+						$array[0] = $read2;
+						$array[6] = $qtrim2;
+						{ lock($cacheAlignHits); $cacheAlignHits++ };
+
+						# add aligned reads to queue
+						{
+							lock($Aligned); # Keep other threads from changing the queue's contents
+							$Aligned->enqueue(\@array);
+						}
+					} else {
+						# align read using NW2
+						{
+							lock($ToAlign); # Keep other threads from changing the queue's contents
+							my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc);
+							#my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2);
+							$ToAlign->enqueue(\@array);
+						}
+					}
+				}
+			}
+		
+			if($index_read1 < 0) {
+				# discard read 1
+				# send to writebamfile thread	
+				{
+					lock($WriteBam); # Keep other threads from changing the queue's contents
+					$WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n");
+				}
+				{ lock(%stats); $stats{PoorQual1}++ };	
+			}
+			
+			if($index_read2 < 0) {
+				# discard read 2			
+				# send to writebamfile thread	
+				{
+					lock($WriteBam); # Keep other threads from changing the queue's contents
+					$WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n");
+				}			
+				{ lock(%stats); $stats{PoorQual2}++ };	
+			}
+		
+			if($index_read1 == -1 && $index_read2 == -1) {
+				{ lock(%stats); $stats{Amb}++ };
+			} elsif($index_read1 == -2 && $index_read2 == -2) {
+				{ lock(%stats); $stats{PoorQualMates}++ };
+			}
+		}
+		
+		# grab the next reads in queue
+		{
+			lock($FindContig); # Keep other threads from changing the queue's contents
+			$readArrayRef = $FindContig->dequeue();
+		}
+	}
+	print "Closing Find Amplicon Thread\n";
+	return;
+}
+
+sub writebamfile {
+	my $bamfile = shift;
+	my $WriteBam = shift;
+
+	# write samfile header
+	open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n";
+	foreach my $key (@OrderedContigs) {
+		print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n";
+	}
+	print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n";
+
+	my $read;
+	{
+		lock($WriteBam); # Keep other threads from changing the queue's contents
+		$read = $WriteBam->dequeue();
+	}
+	while(defined $read) {
+	
+		# write read to bamfile
+		print PIPE_TO_SAMTOOLS $read;
+		
+		# grab next item in queue
+		{
+			lock($WriteBam); # Keep other threads from changing the queue's contents
+			$read  = $WriteBam->dequeue();
+		}
+	}
+	print "Closing Bam Writer\n";
+	close PIPE_TO_SAMTOOLS or die "pipe could not be closed";
+	return;
+}
+
+sub thread_process_align {
+	my $Aligned = shift;
+	my $refFrags = shift;
+	my $refSeq = shift;
+	my $Samfile = shift;
+	my $WriteBam = shift;
+	
+	my $readArrayRef;
+	{
+		lock($Aligned); # Keep other threads from changing the queue's contents
+		$readArrayRef = $Aligned->dequeue();
+	}
+	while(defined $readArrayRef) {
+		# grab the alignment
+		my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
+		# grab primer lengths
+		my $PU = $$refFrags{primers}{$contig}{U}{F}{length};
+		my $PD = $$refFrags{primers}{$contig}{D}{F}{length};	
+		my %readStats;
+		my $orig_read1 = $read1;
+		my $orig_read2 = $read2;
+		my $orig_qtrim1 = $qtrim1;
+		my $orig_qtrim2 = $qtrim2;
+		$orig_read1 =~ s/-//g if defined $orig_read1;
+		$orig_read2 =~ s/-//g if defined $orig_read2;
+
+		if(defined $OPTIONS{d}) {
+			# debug info
+			print "Alignment With Primers\n";
+			print "$ref1\n$read1\n" if defined $read1;
+			print "$ref2\n$read2\n" if defined $read2;
+			print "\nFinal Alignment:\n";
+		}
+			
+		# calculate where primers begin and end
+		if(defined $read1) {
+			if($strand eq 'F') {
+				$readStats{$contig}{Read1} = 1;
+			} else {
+				$readStats{$contig}{Read2} = 1;
+			}
+						
+			# determine the primer locations
+			my $offset = 0;
+			for(my $occurance = 0; $occurance < $PU; $occurance++) {
+				substr($ref1, $offset) =~ /^.*?([^-])/;
+				$offset += $-[1] + 1;
+			}
+			my $length = 0;
+			for(my $occurance = 0; $occurance < $PD; $occurance++) {
+				substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/;
+				$length = length($ref1) - $-[1];
+			}
+			$length = length($ref1) - $length - $offset;
+		
+			# remove primers and adjust quality scores
+			$ref1 = substr($ref1, $offset, $length);
+			my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// );
+			$read1 = substr($read1, $offset, $length);
+			$length = ( $read1 =~ tr/ACTGN// );
+			$qtrim1 = substr($qtrim1, $qualOffset, $length);
+		}
+
+		if(defined $read2) {
+			if($strand eq 'F') {
+				$readStats{$contig}{Read2} = 1;
+			} else {
+				$readStats{$contig}{Read1} = 1;
+			}
+			
+			# determine the primer locations
+			my $offset = 0;
+			for(my $occurance = 0; $occurance < $PU; $occurance++) {
+				substr($ref2, $offset) =~ /^.*?([^-])/;
+				$offset += $-[1] + 1;
+			}
+			my $length = 0;
+			for(my $occurance = 0; $occurance < $PD; $occurance++) {
+				substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/;
+				$length = length($ref2) - $-[1];
+			}
+			$length = length($ref2) - $length - $offset;
+		
+			# remove primers and adjust quality scores
+			$ref2 = substr($ref2, $offset, $length);
+			my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// );
+			$read2 = substr($read2, $offset, $length);
+			$length = ( $read2 =~ tr/ACTGN// );
+			$qtrim2 = substr($qtrim2, $qualOffset, $length);
+		}	
+		
+		# generate a concensus sequence if we have both read1 and read2 and they overlap
+		if($conc) {
+			# the following are 0 based indexes
+			my $R1qual = 0; # index for sanger qual read 1
+			my $R2qual = 0; # index for sanger qual read 2
+			my $aindex = 0; # index for alignment		
+			my $q1;
+			my $q2;
+			my $concensus;
+			my $concqual;
+			
+			$readStats{$contig}{ConcReads} = 1;
+			$readStats{$contig}{ConcCorrectBases} = 0;
+			$readStats{$contig}{ConcErrorsBases} = 0;					
+			while($aindex < length($ref1)) {
+				# base1 is current base from read 1
+				# base2 is current base from read 2
+				my $base1 = substr($read1, $aindex, 1);
+				my $base2 = substr($read2, $aindex, 1);
+					
+				# grab the next quality score
+				if($R1qual < length($qtrim1)) {
+					$q1 = substr($qtrim1, $R1qual, 1);
+				}
+				if($R2qual < length($qtrim2)) {
+					$q2 = substr($qtrim2, $R2qual, 1);
+				}
+				
+				# if we have a base in either
+				if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) {				
+					if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') {
+						# we are only in read 1 or read 2 is a N
+						$concensus .= $base1;
+						$concqual .= $q1;
+						$R1qual++;
+						if($base2 eq 'N') {
+							$R2qual++;
+						}
+					} elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') {
+						# we are only in read 2 or read 1 is a N
+						$concensus .= $base2;
+						$concqual .= $q2;
+						$R2qual++;
+						if($base1 eq 'N') {
+							$R1qual++;
+						}
+					} else {
+						# we are in the overlap region
+						my $maxqual = sangerMax($q1, $q2);
+						if($base1 eq $base2) {
+							# both reads agree
+							$readStats{$contig}{ConcCorrectBases}++;
+							$concensus .= $base1;
+							$concqual .= $maxqual;
+						} elsif($q1 eq $q2) {
+							# reads have different bases but the same score
+							$readStats{$contig}{ConcErrorsBases}++;
+							$concensus .= 'N';
+							$concqual .= '#';
+						} elsif($q1 eq $maxqual) {
+							# reads have different bases but read 1 has the best score
+							$concensus .= $base1;
+							$concqual .= $maxqual;
+							$readStats{$contig}{ConcErrorsBases}++;
+						} else {
+							# reads have different bases but read 2 has the best score
+							$concensus .= $base2;
+							$concqual .= $maxqual;
+							$readStats{$contig}{ConcErrorsBases}++;
+						}
+						$R1qual++;
+						$R2qual++;
+					}
+				} elsif($base1 eq 'N' || $base2 eq 'N') {
+					if($base2 eq '-') {
+						# one is a deletion the other is an N
+						$concensus .= $base1;
+						$concqual .= $q1;
+						$R1qual++;
+					} elsif($base1 eq '-') {
+						# one is a deletion the other is an N
+						$concensus .= $base2;
+						$concqual .= $q2;
+						$R2qual++;
+					} elsif($base1 eq 'N' && $base2 eq 'N') {
+						# both reads are an N
+						$concensus .= 'N';
+						$concqual .= sangerMax($q1, $q2);
+						$R1qual++;
+						$R2qual++;
+					}
+				} else {
+					$concensus .= '-';
+				}
+				$aindex++;
+			}
+			$read1 = $concensus;
+			$qtrim1 = $concqual;
+			$read2 = undef;
+			$qtrim2 = undef;
+		}
+		
+		if(defined $read1) {
+			# remove any trailing - and send to samfile_line
+			$read1 =~ s/^-*?([^-].*)/$1/;
+			$ref1 = substr($ref1, -length($read1));
+			$read1 =~ s/(.*[^-])-*?$/$1/;
+			$ref1 = substr($ref1, 0, length($read1));
+			
+			my $tmp = $ref1;
+			$tmp =~ s/-//g;
+			my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1;
+
+			# generate sam file line
+			{
+				lock($Samfile); # Keep other threads from changing the queue's contents
+				my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
+				$Samfile->enqueue(\@array);
+			}
+		}
+		
+		if(defined $read2) {
+			# remove any trailing - and send to samfile_line
+			$read2 =~ s/^-*?([^-].*)/$1/;
+			$ref2 = substr($ref2, -length($read2));
+			$read2 =~ s/(.*[^-])-*?$/$1/;
+			$ref2 = substr($ref2, 0, length($read2));
+			
+			my $tmp = $ref2;
+			$tmp =~ s/-//g;
+			my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1;
+
+			# generate sam file line
+			{
+				lock($Samfile); # Keep other threads from changing the queue's contents
+				my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
+				$Samfile->enqueue(\@array);
+			}
+		}
+
+		if(defined $OPTIONS{d}) {
+			# debug info
+			print "$ref1\n$read1\n" if defined $read1;
+			print "$ref2\n$read2\n" if defined $read2;
+			print "$qtrim1\n" if defined $read1;		
+			print "$qtrim2\n" if defined $read2;		
+			print "\n\n";
+		}
+
+		# grab next item in queue
+		{
+			lock($Aligned); # Keep other threads from changing the queue's contents
+			$readArrayRef = $Aligned->dequeue();
+		}
+	}
+	print "Closing Alignment Processor\n";
+	return;
+}
+
+# function to take reverse compliment
+sub revcompl {
+  my $string = shift @_;
+  my $revcomp = reverse($string);
+  $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/;
+  return $revcomp;
+}
+
+# function to convert sanger fastq quality scores to phred scores
+sub sanger2phred {
+  my $sang = shift @_;
+  my @sanger = split("", $sang);
+  my @phred;
+  foreach my $q (@sanger) {
+  	my $Q = ord($q) - 33;
+  	push @phred, $Q;
+  }
+  return @phred;
+}
+
+# function to return the largest sanger quality score
+sub sangerMax {
+	my ($s1, $s2) = @_;
+	my $Q1 = ord($s1) - 33;
+	my $Q2 = ord($s2) - 33;
+	if($Q1 > $Q2) {
+		return $s1;
+	} else {
+		return $s2;
+	} 
+}
+
+# function to read a fasta file as a hash
+sub read_fasta_file {
+   my ($fasta_file, $seqs) = @_;
+   my $header;
+   my $seq = "";
+   open (IN, $fasta_file) or die "can't open $fasta_file: $!\n";
+   while (my $line = <IN>) {
+      chomp $line;
+      if (grep />/, $line) {
+         if ($seq ne "") {    
+            $seqs->{$header} = $seq;
+         }
+	 $header = $line;          
+
+         $header =~ s/^>//; # remove ">"
+         $header =~ s/\s+$//; # remove trailing whitespace
+         $seq = ""; # clear out old sequence
+      } else {    
+         $line =~ s/\s+//g; # remove whitespace
+         $seq .= $line; # add sequence
+      }         
+   }    
+   close IN; # finished with file
+
+   if ($seq ne "") { # handle last sequence
+      $seqs->{$header} = $seq;
+   }    
+
+   return;
+}
+
+# minimal element of a list
+#
+sub min
+{
+    my @list = @{$_[0]};
+    my $min = $list[0];
+
+    foreach my $i (@list)
+    {
+        $min = $i if ($i < $min);
+    }
+
+    return $min;
+}
+
+# maximal element of a list
+#
+sub max
+{
+    my @list = @{$_[0]};
+    my $max = $list[0];
+
+    foreach my $i (@list)
+    {
+        $max = $i if ($i > $max);
+    }
+
+    return $max;
+}
+
+# trim sequences using the BWA algorithm
+sub trim {
+	
+	my ($s, $q, $threshold, $read) = @_;  # read sequence and quality scores
+	if($read == 2) {
+		$q = reverse($q);
+		$s = reverse($s);
+	}
+
+	my @array  = sanger2phred($q);
+	my $length = scalar @array;
+	
+	# only calculate if quality fails near end
+	if( $array[$#array] >= $threshold ){
+		if($read == 2) {
+			$q = reverse($q);
+			$s = reverse($s);
+		}
+		return ($s, $q);
+	}
+	
+	# run bwa equation
+	my @arg;
+	for( my $i = 0; $i < $length - 1; $i++ ){
+		my $x = $i + 1;
+		for( my $j = $x; $j < $length; $j++ ){	
+			$arg[$x] += $threshold - $array[$j];
+		}
+	}
+	
+	# find number of 5' bases to retain
+	my $index = 0;
+	my $maxval = 0;
+	for ( 1 .. $#arg ){
+		if ( $maxval < $arg[$_] ){
+			$index = $_;
+			$maxval = $arg[$_];
+    		}
+	}
+	
+	# trim reads
+	$s = substr($s,0,$index);
+	$q = substr($q,0,$index);
+	if($read == 2) {
+		$s = reverse($s);
+		$q = reverse($q);
+	}
+
+	# return seq and quality
+	return ($s, $q);
+}
+
+
+sub samfile_line {
+	# grab the variables
+	my ($Samfile, $WriteBam, $refSeq) = @_;
+	my %SamStats;
+	my %minLength;
+	my %maxLength;
+	my $readArrayRef;
+	{
+		lock($Samfile); # Keep other threads from changing the queue's contents
+		$readArrayRef = $Samfile->dequeue();
+	}
+	
+	while(defined $readArrayRef) {
+		my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef };
+		my $seq = $align;
+		$seq =~ s/-//g;
+
+		# calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1
+		my $sam;
+		my $s1;
+		my $s2;
+		my @s1 = split("", $align);
+		my @s2 = split("", $ref);
+		
+		my $indel = 0;
+		for(my $i = 0; $i <= $#s1; $i++) {
+			if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) {
+				$s1 .= $s1[$i];
+				$s2 .= $s2[$i];
+				$indel = 0;
+				if($s1[$i] eq "-" || $s2[$i] eq "-") {
+					$indel = 1;
+				}
+			}
+		}
+		my $mq = levenshtein($s1, $s2);
+		
+		my $cl = length($seq);
+		# calculate max diff allowed
+		if(!defined $maxdiff{$cl}) {
+			{ lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); }
+		}
+		
+		if($mq > $maxdiff{$cl} || $cl == 0) {
+			{ lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; }
+			if(defined $orig_read1) {
+				$sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n";
+				# send to writebamfile thread	
+				{
+					lock($WriteBam); # Keep other threads from changing the queue's contents
+					$WriteBam->enqueue($sam);
+				}
+			}
+			if(defined $orig_read2) {
+				$read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/;
+				$sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n";
+				# send to writebamfile thread	
+				{
+					lock($WriteBam); # Keep other threads from changing the queue's contents
+					$WriteBam->enqueue($sam);
+				}
+			}
+		} else {
+			$mq = 60;
+
+			# add any stats from earlier
+			foreach my $key1 (keys %$readStats) {
+				foreach my $key2 (keys % { $$readStats{$key1} }) {
+					$SamStats{$key1}{$key2} += $$readStats{$key1}{$key2};
+				}
+			}
+					
+			# keep track of quality scores
+			$SamStats{$reg}{Bases} += $cl;
+			$SamStats{$reg}{Quality} += $_ for sanger2phred($qual);
+	
+			# keep track of read lengths
+			if(defined $minLength{$reg}) {
+				$minLength{$reg} = min([$cl, $minLength{$reg}]);
+				$maxLength{$reg} = max([$cl, $maxLength{$reg}]);
+			} else {
+				$minLength{$reg} = $cl;
+				$maxLength{$reg} = $cl;
+			}
+			
+			# keep track of strand and bases seen
+			if($strand eq 'F') {
+				$SamStats{$reg}{NumberForward}++;
+			} else {
+				$SamStats{$reg}{NumberReverse}++;
+			}
+			$SamStats{$reg}{TotalA} += ($seq =~ tr/A//);
+			$SamStats{$reg}{TotalC} += ($seq =~ tr/C//);
+			$SamStats{$reg}{TotalG} += ($seq =~ tr/G//);
+			$SamStats{$reg}{TotalT} += ($seq =~ tr/T//);
+			$SamStats{$reg}{TotalN} += ($seq =~ tr/N//);
+
+			# keep count the number of reads for each contig
+			$SamStats{$reg}{Reads}++;
+
+			# calculate strand info
+			if($strand eq 'F') {
+				$strand = 0;
+			} else {
+				$strand = 16;
+			}
+			# calculate cigar
+			my $cigar = "";
+			my $M = 0;
+			my $D = 0;
+			my $I = 0;
+			my $Subs = 0;
+			for(my $i = 0; $i < length($align); $i++) {
+				if(substr($align, $i, 1) eq "-") {
+					$D++;
+					if($M) {
+						$cigar .= "${M}M";
+						$M = 0;
+					} elsif($I) {
+						$cigar .= "${I}I";
+						$I = 0;
+					}
+				} elsif(substr($ref, $i, 1) eq "-") {
+					$I++;
+					if($M) {
+						$cigar .= "${M}M";
+						$M = 0;
+					} elsif($D) {
+						$cigar .= "${D}D";
+						$D = 0;
+					}
+				} else {
+					$M++;
+					if($D) {
+						$cigar .= "${D}D";
+						$D = 0;
+					} elsif($I) {
+						$cigar .= "${I}I";
+						$I = 0;
+					}
+					$Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1);
+				}
+			}
+			if($M) {
+				$cigar .= "${M}M";
+			} elsif($I) {
+				$cigar .= "${I}I";
+			} elsif($D) {
+				$cigar .= "${D}D";
+			}
+		
+			# construct sam file line
+			$sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n";
+
+			# send to writebamfile thread	
+			{
+				lock($WriteBam); # Keep other threads from changing the queue's contents
+				$WriteBam->enqueue($sam);
+			}
+		}
+				
+		#grab the next read	
+		{
+			lock($Samfile); # Keep other threads from changing the queue's contents
+			$readArrayRef = $Samfile->dequeue();
+		}
+	}
+	
+	{
+		# add stats from earlier
+		lock(%stats);					
+		foreach my $key1 (keys %SamStats) {
+			$stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]);
+			$stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]);
+			foreach my $key2 (keys % { $SamStats{$key1} }) {
+				$stats{$key1}{$key2} += $SamStats{$key1}{$key2};
+			}
+		}	
+
+	}
+	print "Closing Sam File Line Generator\n";
+
+	return;
+}
+
+# perform the alignment
+sub nwAlign {
+	my ($ToAlign, $Aligned, $regions) = @_;
+	my $readArrayRef;
+	{
+		lock($ToAlign); # Keep other threads from changing the queue's contents
+		$readArrayRef = $ToAlign->dequeue();
+	}
+	
+	while(defined $readArrayRef) {
+		# we have picked a contig
+		# check strand and take rev comp of ref if necessary
+		my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
+		my %align;
+		my $name;
+		#! Parameters
+		#! ClustalW Parameters 
+		my $matchScore = 1.9;
+		my $missmatch_penalty = -0.4;
+		my $gapPenOpen = -10.0;
+		my $gapPenExt = -0.4;
+		my $leftAndRightGapPen = true;
+		my $debug = false;
+		my $cons = "";
+		my $align1= "";
+		my $align2= "";
+		my $align3= "";
+		my $score = 0.0;
+		my $singleAlign = false; #aligning 2 or 3 sequences
+		if($conc)
+		{
+			#got to concatenate seqs and align
+			$score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug);
+			$name = "$left.$right";
+			print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig};
+		}
+		elsif(!defined $right)
+		{
+			#align with left
+			$score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug);
+			$singleAlign = true;
+			$name = $left;
+			print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
+		}
+		elsif(!defined $left)
+		{
+			#align with right
+			$score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug);
+			$singleAlign = true;
+			$name = $right;
+			print "$contig\n$align1\n$align2\n\n"  if defined $$regions{$contig};
+		}
+		
+		# add aligned reads to queue
+		# cache results
+		my $type;
+		{
+			lock($Aligned); # Keep other threads from changing the queue's contents
+			
+			my @array;			
+			if($singleAlign && !defined $right)
+			{
+				@array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
+				{
+					lock(%cacheAlign);
+					$cacheAlign{$contig}{R1}{$name} = &share( [] );
+					@{ $cacheAlign{$contig}{R1}{$name} } = @array;
+				}
+			}
+			elsif($singleAlign && !defined $left )
+			{
+				@array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc);
+				{
+					lock(%cacheAlign);
+					$cacheAlign{$contig}{R2}{$name} = &share( [] );
+					@{ $cacheAlign{$contig}{R2}{$name} } = @array;
+				}
+			}
+			else
+			{
+				@array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc);
+				{
+					lock(%cacheAlign);
+					$cacheAlign{Conc}{$name} = &share( [] );
+					@{ $cacheAlign{Conc}{$name} } = @array;
+				}
+			}
+			$Aligned->enqueue(\@array);
+		}
+
+		#grab the next read	
+		{
+			lock($ToAlign); # Keep other threads from changing the queue's contents
+			$readArrayRef = $ToAlign->dequeue();
+		}
+	}
+	# clean up
+	print "Closing NW2 Aligner\n";
+	return;
+}
+
+# function to calculat the minimum edit distance threshold (taken from bwa)
+sub maxdiff {
+	my ($l, $err, $thres) = @_;
+	my $lambda = exp(-$l*$err);
+	my $y = 1.0; 
+	my $x = 1;
+	my $sum = $lambda;
+	for(my $k = 1; $k < 1000; $k++) {
+		$y *= $l * $err;
+		$x *= $k;
+		$sum += $lambda * $y / $x;
+		if(1.0 - $sum < $thres) {
+			return $k;
+		} 
+	} 
+	return 2;
+}
+
+# using approx matching return the positions in $list of those most closely matching $string
+sub amatch {
+	my ($string, $list) = @_;
+	my @d = adistr($string, @$list);
+        my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list;
+        
+        return if $d[$d_index[0]] > 0.25;
+        
+	my @pos = ($d_index[0]);
+	my $i = 1;    
+        while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) {
+        	push @pos, $d_index[$i];
+         	$i++;
+        }
+ 	return @pos;
+}
+
+# a sorting algorithm to sort the chromosomes
+sub sortChrom {
+	my ($achr, $astart, $aend) = split(":|-", $a);
+	my ($bchr, $bstart, $bend) = split(":|-", $b);
+	$achr =~ s/X/23/;
+	$achr =~ s/Y/24/;
+	$achr =~ s/MT?/25/;
+	$bchr =~ s/X/23/;
+	$bchr =~ s/Y/24/;
+	$bchr =~ s/MT?/25/;
+	if($achr < $bchr) {
+		return -1;
+	} elsif($bchr < $achr) {
+		return 1;
+	} elsif($astart < $bstart) {
+		return -1;
+	} elsif($bstart < $astart) {
+		return 1;
+	} elsif($aend < $bend) {
+		return -1;
+	} elsif($bend < $aend) {
+		return 1;
+	} else {
+		return 0;
+	}
+}
+
+# pass a code reference
+sub modify {
+  my($text, $code) = @_;
+  $code->($text);
+  return $text;
+}
+
+sub split_region {
+	my ($chr, $start, $end) = split(":|-", shift);
+	$end = $start unless defined $end;
+	return($chr, $start, $end);
+}
+
+sub error_rate {
+	my ($x, $y) = @_;
+	my @x = split("", $x);
+	my @y = split("", $y);
+	my $s = 0;
+	my $l = 0;
+	for(my $i = 0; $i <= $#x; $i++) {
+		if($x[$i] ne "-" && $y[$i] ne "-") {
+			$s++ if $x[$i] ne $y[$i];
+			$l++;
+		}
+	}
+	return($s / $l);
+}
+
+
author	fcaramia
date	Wed, 09 Jan 2013 00:24:09 -0500
parents
children	22f35f830f48