Mercurial > repos > fcaramia > custom_amplicon_alignment

#!/usr/bin/perl

# script to convert paired end reads to a concensus sequence

# Created by Jason Ellul, Feb 2012
# Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012
# 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly)
#     Added missalignment penalty
#     Read 1 and 2 are now aligned first then they are aligned to the reference
# 0.5 Added ability to name amplicons by includinfo the info in the primers file
#     Added the ability to output the alignments in a particular region using the new flag -a
#     Added some extra stats to the output

use strict;
use warnings;
use Getopt::Std;
use File::Basename;
use File::Find;
use File::Copy;
use File::MMagic;
use File::Temp qw/ tempfile tempdir /;
use Cwd 'abs_path';
use threads;
use threads::shared;
use List::Util qw(sum);
use Thread::Queue;
use String::Approx 'adistr';
use POSIX qw(floor ceil);
String::Approx::cache_max(25000);
String::Approx::cache_purge(0.25);
use constant false => 0;
use constant true  => 1;
use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"';
use Inline CPP => <<'END_CPP';


int levenshtein(char* seq1, char* seq2)
{
	// Returns edit distance of 2 sequences
	string sam1(seq1);
	string sam2(seq2);
	return levenshteinDistance(sam1, sam2);
}

double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug )
{
	// Returns Sequence alignment of 2 sequences
	// res1, res2 contain the alignment
	//seq1: reference sequence
	//seq2: amplicon sequence
	//matchScore: reward for matching a nucleotide
	//gapPenOpen: penalty for opening a gap
	//gapPenext: penalty for extending a gap
	//noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment
	//noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment
	//debug: prints scores, alignments, etc...

	string sam1(seq1);
	string sam2(seq2);
	double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
	//Setting the return alignments
	sv_setpvn(res1, (char*)sam1.c_str(), sam1.length());
	sv_setpvn(res2, (char*)sam2.c_str(), sam2.length());
	return ret;
}

void adjustSequence(int refSize, string& seq, char dir)
{

	int adjust = refSize - seq.length();
	if(dir == 'r')
		for(int i=0; i<adjust; ++i) seq = '-' + seq;
	else
		for(int i=0; i<adjust; ++i) seq += '-';
}

double align3(char* ref, char* right,char* left , SV* res1, SV* res2, SV* res3, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool leftAndRightGapPen, bool debug )
{
	// Returns Sequence alignment of 3 sequences
	// res1, res2, res3 contain the alignment
	//ref: reference sequence
	//left: left amplicon sequence
	//right: right amplicon sequence
	//matchScore: reward for matching a nucleotide
	//gapPenOpen: penalty for opening a gap
	//gapPenext: penalty for extending a gap
	//leftAndRightGapPen: Smart way to penalise according to the amplicon
	//debug: prints scores, alignments, etc...


	string leftSeq(left);
	string rightSeq(right);

	bool noFrontGapPenalty = false;
	bool noEndGapPenalty = false;

	//Align right with left reads
	if(leftAndRightGapPen)
	{
		noFrontGapPenalty = true;
		noEndGapPenalty = true;
	}

	//first alignment the 2 reads
	double ret = nw_align2(rightSeq, leftSeq, matchScore, missmatch_penalty*8, gapPenOpen*5, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);

	//Setting the return alignments
	vector <string> msa;
	vector <string> partial;
	vector <string> seq;

	partial.push_back(rightSeq);
	partial.push_back(leftSeq);
	seq.push_back(ref);

	msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug);
	sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length());
	sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length());
	sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length());
	return ret;
}

END_CPP

$| = 1;

# Grab and set all options
my $version = "0.5";
my %OPTIONS = (p => 1);
getopts('a:dl:L:o:p:rsv', \%OPTIONS);

die qq(version $version
Usage:   alignCustomAmplicon.pl [OPTIONS] <ref> <fastq[.gz] 1> <fastq[.gz] 2> <primers>

OPTIONS:		-a	STR	print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop]
			-d		print debug info
			-l	STR	filename of the log file [null]
			-L	STR	append to an existing log file [null]
			-o	STR	filename for bam file
			-p	INT	number of threads [$OPTIONS{p}]
			-r		do not create stats report
			-s		sort the bam file
			-v		verbose progress
			-j	STR	picard jar
) if(@ARGV != 4);

my $Script = 'alignCustomAmplicon';
my $ref = shift @ARGV;
my $fastq1 = shift @ARGV;
my $fastq2 = shift @ARGV;
my $primers = shift @ARGV;
my $now;

#create log file
if(defined $OPTIONS{l}) {
	my $logfile = $OPTIONS{l};
	open (FH, ">$logfile") or die 'Couldn\'t create log file!\n';
	close (FH);
	# Open the log file and redirect output to it
	open (STDERR, ">>$logfile");
	open (STDOUT, ">>$logfile");
	my $now = localtime time;
	print "Log File Created $now\n\n";
} elsif(defined $OPTIONS{L}) {
	#append to a log file
	my $logfile = $OPTIONS{L};
	# Open the log file and redirect output to it
	open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n';
	open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n';
	my $now = localtime time;
	print "Appending To Log File $now\n\n";
}

if (defined $OPTIONS{j})
{
	my $JAR = $OPTIONS{j};

}

# print version of this script
print "Using $Script version $version \n\n";

if(defined $OPTIONS{d}) {
	print "Setting threads to 1 as using debug\n";
	$OPTIONS{p} = 1;
}

# grab sample name, set output file name and open some files
my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz'));
my $sampleName :shared = $name;
$sampleName =~ s/^(.*?_.*?)_.*/$1/;
open(PRIM, $primers) or die "can't open $primers: $!\n";

# set samfile name
my $bamfile_out = "${sampleName}_aligned.bam";
if(defined $OPTIONS{o}) {
	$bamfile_out = $OPTIONS{o};
}
my $bam_to_sort = $bamfile_out;
if(defined $OPTIONS{s}) {
	$bam_to_sort .= '.tmp';
}

my $stats = $bamfile_out;
if(!defined $OPTIONS{r}) {
	$stats =~ s/\..*?$//;
	open(STATS, ">${stats}_stats.csv");
}

# read in primer lengths
my %refFrags;
$refFrags{min} = 5000000;
while(my $line = <PRIM>) {
	chomp $line;
	my @line_sp = split("\t", $line);
	$refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1];
	$refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2];
	$refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2];
	$refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1];
	$refFrags{name}{$line_sp[0]} = $line_sp[3];
	$refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]);
}
close(PRIM);
my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} };

my %regions;
if(defined $OPTIONS{a}) {
	my ($chr, $start, $end) = split_region($OPTIONS{a});
	foreach my $reg (@OrderedContigs) {
		my ($chr_reg, $start_reg, $end_reg) = split_region($reg);
		$regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start;
	}
}

# grab fasta file
my %refSeq;
read_fasta_file($ref, \%refSeq);

# calculate read length and read group
my $readGroup :shared;
my $readLength;
my $mm = new File::MMagic;
my $filetype = $mm->checktype_filename(abs_path($fastq1));
if($filetype eq "application/x-gzip") {
	my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`;
	chomp $sampleRead;
	$readLength = length($sampleRead);
	$readGroup = `gunzip -c $fastq1 | head -n 1`;
	chomp $readGroup;
	$readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
} elsif($filetype eq "text/plain") {
	my $sampleRead = `head -n 2 $fastq1 | tail -n 1`;
	chomp $sampleRead;
	$readLength = length($sampleRead);
	$readGroup = `head $fastq1 -n 1`;
	chomp $readGroup;
	$readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
} else {
	die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n");
}

# create hash to keep track of maxdiff and readlengths
my %maxdiff :shared;

# create hash to keep track of stats
my %stats :shared;
$stats{Amb} = 0;
$stats{PoorQualMates} = 0;
$stats{PoorMapQual} = 0;
$stats{PoorQual1} = 0;
$stats{PoorQual2} = 0;

# setup other shared variables and caches
my %cache :shared;
my $cacheHits :shared = 0;
my %cacheAlign :shared;
$cacheAlign{Conc} = &share( {} );
my $cacheAlignHits :shared = 0;
my %refLength :shared;

# extract primers and fragments on forward and reverse strand and initialize counts
foreach my $key (@OrderedContigs) {
	push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length});
	push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length});
	push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length}));
	push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length}));
	push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length});
	push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length});
	push @{ $refFrags{reg} }, ($key, $key);
	push @{ $refFrags{strand} }, ('F', 'R');

	# calculate new contig name with primers removed
	my ($chr, $start, $stop) = split(":|-", $key);
	$start += $refFrags{primers}{$key}{U}{F}{length};
	$stop -= $refFrags{primers}{$key}{D}{F}{length};
	$refLength{$key} = &share( {} );
	$refLength{$key}{newReg} = "$chr:$start-$stop";
	$refLength{$key}{length} = $stop - $start + 1;

	$cacheAlign{$key} = &share( {} );
	$cacheAlign{$key}{R1} = &share( {} );
	$cacheAlign{$key}{R2} = &share( {} );

	$stats{$key} = &share( {} );
	$stats{$key}{Reads} = 0;
	$stats{$key}{Bases} = 0;
	$stats{$key}{Quality} = 0;
	$stats{$key}{Mapping} = 0;
	$stats{$key}{Read1} = 0;
	$stats{$key}{Read2} = 0;
	$stats{$key}{ConcMapping} = 0;
	$stats{$key}{ConcReads} = 0;
	$stats{$key}{NumberForward} = 0;
	$stats{$key}{NumberReverse} = 0;
	$stats{$key}{ConcErrorsBases} = 0;
	$stats{$key}{ConcCorrectBases} = 0;
	$stats{$key}{MinLength} = 100000;
	$stats{$key}{MaxLength} = 0;
	$stats{$key}{TotalA} = 0;
	$stats{$key}{TotalC} = 0;
	$stats{$key}{TotalG} = 0;
	$stats{$key}{TotalT} = 0;
	$stats{$key}{TotalN} = 0;
	$stats{$key}{PoorMapQual} = 0;
}

# start up worker threads
my $FindContig = Thread::Queue->new();
my $ToAlign = Thread::Queue->new();
my $Aligned = Thread::Queue->new();
my $Samfile = Thread::Queue->new();
my $WriteBam = Thread::Queue->new();
my @threadArray;
for(my $index = 1; $index <= $OPTIONS{p}; $index++) {
	push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam);
	push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]);
	push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]);
}
push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions);
push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam);

# read in each sequence and its mate from the fastq file
my $read1;
my $read2;
my $seq1;
my $seq2;
my $qual1;
my $qual2;
my $desc;
my @strand;
my $cont = 1;

# open fastq files
my $gz1;
my $gz2;
my $f1gz = 0;
my $f2gz = 0;
if($filetype eq "application/x-gzip") {
	open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n";
} else {
	open(FQ1, $fastq1) or die "$fastq1: $!";
}

$filetype = $mm->checktype_filename(abs_path($fastq2));;
if($filetype eq "application/x-gzip") {
	open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n";
} elsif($filetype eq "text/plain") {
	open(FQ2, $fastq2) or die "$fastq2: $!";
} else {
	die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n");
}

# add reads to the queue
my $readNo = 0;
while ($cont) {
	if($read1 = <FQ1>) { # read 1 and 2 read name
		$seq1 = <FQ1>; # read 1 sequence
		$desc = <FQ1>; # read 1 desc
		$qual1 = <FQ1>; # read 1 quality
	} else {
		last;
	}

	if($read2 = <FQ2>) { # read 2 read name
		$seq2 = <FQ2>; # read 2 sequence
		$desc = <FQ2>; # read 2 sdesc
		$qual2 = <FQ2>; # read 2 quality
	} else {
		last;
	}

	# read 1 the forward read
	chomp $read1;
	chomp $seq1;
	chomp $qual1;
	$read1 =~ s/^\@(.*)\s?.*/$1/;

	# read 2 the reverse read
	chomp $read2;
	chomp $seq2;
	$seq2 = revcompl($seq2);
	chomp $qual2;
	$qual2 = reverse($qual2);

	$read2 =~ s/^\@(.*)\s?.*/$1/;
	$readNo++;

	# keep at most 20 reads in buffer
	while(1) {
		if(defined $OPTIONS{v}) {
			my $percAmb = $stats{Amb} / $readNo * 100;
			my $percPoor = $stats{PoorQualMates} / $readNo * 100;
			my $percPoorMap = $stats{PoorMapQual} / $readNo * 100;
			my $percPoor1 = $stats{PoorQual1} / $readNo * 100;
			my $percPoor2 = $stats{PoorQual2} / $readNo * 100;
			$percAmb = sprintf("%.2f", $percAmb);
			$percPoor = sprintf("%.2f", $percPoor);
			$percPoorMap = sprintf("%.2f", $percPoorMap);
			$percPoor1 = sprintf("%.2f", $percPoor1);
			$percPoor2 = sprintf("%.2f", $percPoor2);
			print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()."        ";
		}
		last if $FindContig->pending() <= 100;
	}
	my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2);
	{
		lock($FindContig); # Keep other threads from changing the queue's contents
		$FindContig->enqueue(\@readArray);
	}
}
close(FQ1);
close(FQ2);

# wait for all records to be processed and close down threads
$FindContig->enqueue( (undef) x $OPTIONS{p} );
while($FindContig->pending() > 0 ) {
	next;
}
$ToAlign->enqueue( (undef) );
while($ToAlign->pending() > 0 ) {
	next;
}
$Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
while($Aligned->pending() > 0 ) {
	next;
}
$Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
while($Samfile->pending() > 0 ) {
	next;
}
$WriteBam->enqueue( undef );

$_->join() for @threadArray;

# sort the bam file
if(defined $OPTIONS{s}) {
	system("java -Xmx8g -jar $JAR  MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true");
	unlink $bam_to_sort;
}

if(!defined $OPTIONS{r}) {
	# print out statistics
	# setup variables
	my $totalRegionLengths = 0;
	my $totalReads = 0;
	my $totalFwd = 0;
	my $totalRev = 0;
	my $totalMatch = 0;
	my $totalMiss = 0;
	my $totalBases = 0;
	my $totalRead1 = 0;
	my $totalRead2 = 0;
	my $totalA = 0;
	my $totalC = 0;
	my $totalG = 0;
	my $totalT = 0;
	my $totalN = 0;
	my $totalConc = 0;
	my $totalQuality = 0;
	my $AllmaxRead = 0;
	my $AllminRead = 100000;
	my $avequal = 'NA';
	my $aveRead = 'NA';
	my $perA = 'NA';
	my $perC = 'NA';
	my $perG = 'NA';
	my $perT = 'NA';
	my $perN = 'NA';
	my $readsUsed = 0;
	my @coverage;
	my @off_coverage;

	# print column headers
	print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,';
	print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n";

	# for each region
	foreach my $key (@OrderedContigs){
		# if we have some reads calculate averages
		if($stats{$key}{Reads} > 0) {
			$avequal = $stats{$key}{Quality} / $stats{$key}{Bases};
			$avequal = sprintf("%.4f", $avequal);
			$aveRead = $stats{$key}{Bases} / $stats{$key}{Reads};
			$aveRead = sprintf("%.4f", $aveRead);
			$perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100;
			$perA = sprintf("%.2f", $perA);
			$perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100;
			$perC  = sprintf("%.2f", $perC);
			$perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100;
			$perG  = sprintf("%.2f", $perG);
			$perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100;
			$perT = sprintf("%.2f", $perT);
			$perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100;
			$perN = sprintf("%.2f", $perN);
			$AllminRead = min([$AllminRead, $stats{$key}{MinLength}]);
			$AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]);
			$readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads};
		} else {
			$stats{$key}{MaxLength} = 'NA';
			$stats{$key}{MinLength} = 'NA';
		}

		# if we have reads which overlapped calculate an error rate
		my $errRate = 'NA';
		if($stats{$key}{ConcReads} > 0) {
			$errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases});
		}

		# print the stats for this region
		print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key};
		print STATS "Unknown," unless defined $refFrags{name}{$key};
		print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},";
		print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},";
		print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n";

		# add stats to grand totals
		$totalRegionLengths += $refLength{$key}{length};
		$totalQuality += $stats{$key}{Quality};
		$totalReads += $stats{$key}{Reads};
		$totalFwd += $stats{$key}{NumberForward};
		$totalRev += $stats{$key}{NumberReverse};
		$totalBases += $stats{$key}{Bases};
		$totalRead1 += $stats{$key}{Read1};
		$totalRead2 += $stats{$key}{Read2};
		$totalA += $stats{$key}{TotalA};
		$totalC += $stats{$key}{TotalC};
		$totalT += $stats{$key}{TotalT};
		$totalG += $stats{$key}{TotalG};
		$totalN += $stats{$key}{TotalN};
		$totalMatch += $stats{$key}{ConcCorrectBases};
		$totalMiss += $stats{$key}{ConcErrorsBases};
		$totalConc += $stats{$key}{ConcReads};

		# reset ave scores
		$avequal = 'NA';
		$aveRead = 'NA';
		$perA = 'NA';
		$perC = 'NA';
		$perG = 'NA';
		$perT = 'NA';
		$perN = 'NA';
		$readsUsed = 0;

		if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) {
			push @off_coverage, $stats{$key}{Reads};
		} else {
			push @coverage, $stats{$key}{Reads};
		}
	}

	# if we have any reads calculate the averages
	if($totalReads > 0) {
		$avequal = $totalQuality / $totalBases;
		$avequal = sprintf("%.4f", $avequal);
		$aveRead = $totalBases / $totalReads;
		$aveRead = sprintf("%.4f", $aveRead);
		$perA = $totalA / $totalBases * 100;
		$perA = sprintf("%.2f", $perA);
		$perC = $totalC / $totalBases * 100;
		$perC  = sprintf("%.2f", $perC);
		$perG = $totalG / $totalBases * 100;
		$perG  = sprintf("%.2f", $perG);
		$perT = $totalT / $totalBases * 100;
		$perT = sprintf("%.2f", $perT);
		$perN = $totalN / $totalBases * 100;
		$perN = sprintf("%.2f", $perN);
		$readsUsed = $totalReads + $totalConc;
	} else {
		$AllmaxRead = 'NA';
		$AllminRead = 'NA';
	}

	# if we had any reads which overlapped calculate error rate
	my $errRate = 'NA';
	if($totalConc > 0) {
		$errRate = $totalMiss / ($totalMiss + $totalMatch);
	}

	# print out totals
	my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100);
	my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100);
	my $TotalReads = $readNo * 2;
	my $Unmapped = $TotalReads - $readsUsed;
	my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100);
	my $avCov = sprintf("%.0f", sum(@coverage)/@coverage);
	my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage;
	my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage);
	my $pcgreater = sprintf("%.2f", $greater/@coverage * 100);

	print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,";
	print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n";
	print STATS "\nNo mate pairs:,$readNo\n";
	print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n";
	print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n";
	print STATS "Total Number Of Reads:,$TotalReads\n";
	print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n";
	print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n";
	print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n";
	print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage;
	print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n";
	$greater = scalar (grep { $_ < 200 } @coverage);
	print STATS "$greater,";
	$greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage);
	print STATS "$greater,";
	$greater = scalar (grep { $_ >= 1000 } @coverage);
	print STATS "$greater\n";

	print "Cache hits align contig: $cacheHits\n";
	print "Alignment cache hits: $cacheAlignHits\n";
	close(STATS);
}

# close the log file
if(defined $OPTIONS{l} || defined $OPTIONS{L}) {
	my $now = localtime time;
	print "\nLog File Closed $now\n";
	close(STDERR);
	close(STDOUT);
}

# thread to find matching amplicon
sub thread_find_contig {
	my $FindContig = shift;
	my $refFrags = shift;
	my $refSeq = shift;
	my $ToAlign = shift;
	my $Aligned = shift;
	my $WriteBam = shift;
	my $readArrayRef;
	{
		lock($FindContig); # Keep other threads from changing the queue's contents
		$readArrayRef = $FindContig->dequeue();
	}

	while(defined $readArrayRef) {
		my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef };
		my $contig = '';
		my $readRef = '';
		my $strand = '';

		# trim the reads
		my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1);
		my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2);

		# if we have already mapped the reads use the results in the cache
		if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
			my @array;
			{ lock(%cacheAlign);  @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
			$array[0] = $read1;
			$array[3] = $qtrim1;
			$array[6] = $qtrim2;
			{ lock($cacheAlignHits); $cacheAlignHits++ };

			# add aligned reads to queue
			{
				lock($Aligned); # Keep other threads from changing the queue's contents
				$Aligned->enqueue(\@array);
			}
		} else {
			# Use approx match to find most similar amplicon primer to reads
			my @index1;
			my @index2;
			my $index_read1 = -2;
			my $index_read2 = -2;
			my $index_found = 0;

			if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) {
				# we have at least 1 good read
				# find closest matching amplicon for read 1
				my $string = substr($seq1, 0, $$refFrags{min});
				if(defined $cache{"$string.1"}) {
					$cacheHits++;
					{ lock(%cache); @index1 = @{ $cache{"$string.1"} } };
				} else {
					@index1 = amatch($string, \@{ $$refFrags{seqU} });
					{
						lock(%cache);
						$cache{"$string.1"} = &share( [] );
						@{ $cache{"$string.1"} } = @index1;
					}
				}

				# find closest matching amplicon for read 2
				$string = substr($seq2, -$$refFrags{min});
				if(defined $cache{"$string.2"}) {
					$cacheHits++;
					{ lock(%cache); @index2 = @{ $cache{"$string.2"} } };
				} else {
					@index2 = amatch($string, \@{ $$refFrags{seqD} });
					{
						lock(%cache);
						$cache{"$string.2"} = &share( [] );
						@{ $cache{"$string.2"} } = @index2;
					}
				}

				if($#index1 > 0 || $#index2 > 0) {
					# if one read maps to multiple amplicons and the other read matches to just one of them use that
					my %in_array2 = map { $_ => 1 } @index2;
					my @common = grep { $in_array2{$_} } @index1;
					if($#common == 0) {
						@index1 = @common;
						@index2 = @common;
					} elsif($#common > 0) {
						if($#index1 > 0) {
							# matched to multiple primers so try and use whole read
							my @refs;
							foreach(@index1) {
								if($$refFrags{strand}[$_] eq "F") {
									push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
								} else {
									push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
								}
							}
							my @index1b = amatch($seq1, \@refs);
							if($#index1b >= 0) {
								@index1 = @index1[@index1b];
							}
						}
						if($#index2 > 0) {
							# matched to multiple primers so try and use whole read
							my @refs;
							foreach(@index2) {
								if($$refFrags{strand}[$_] eq "F") {
									push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
								} else {
									push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
								}
							}
							my @index2b = amatch($seq2, \@refs);
							if($#index2b >= 0)  {
								@index2 = @index2[@index2b];
							}
						}

						# if one read still maps to multiple amplicons and the other read matches to just one of them use that
						%in_array2 = map { $_ => 1 } @index2;
						@common = grep { $in_array2{$_} } @index1;
						if($#common == 0) {
							@index1 = @common;
							@index2 = @common;
						}
					}
				}

				# if read 1 and read 2 match to the same amplicon assign it
				if($#index1 == 0 && @index1 ~~ @index2 ) {
					$index_read1 = $index1[0];
					$index_read2 = $index2[0];
				} else {
					$index_read1 = -1;
					$index_read2 = -1;
				}

				# if any read after trimming is less than half the original length discard
				if(length($trimmed1) < length($seq1)/2) {
					$index_read1 = -2;
				}

				if(length($trimmed2) < length($seq2)/2) {
					$index_read2 = -2;
				}
			}

			if($index_read1 > -1 || $index_read2 > -1) {
				my $conc = 0;
				my %align;

				# at least 1 read matched to an amplicon
				if($index_read1 > -1) {
					$contig = $$refFrags{reg}[$index_read1];
					$readRef = $$refSeq{$contig};
					$strand = $$refFrags{strand}[$index_read1];
				} elsif($index_read2 > -1) {
					$contig = $$refFrags{reg}[$index_read2];
					$readRef = $$refSeq{$contig};
					$strand = $$refFrags{strand}[$index_read2];
				}

				if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) {
					# reads must overlap so can use concensus
					$conc = 1;
				} else {
					my $substr1;
					my $substr2;
					for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) {
						$substr1 = substr($trimmed1, -$length);
						$substr2 = substr($trimmed2, 0, $length);
						if($substr1 eq $substr2) {
							$conc = 1;
							last;
						} else {
							# calculate max diff allowed
							if(!defined $maxdiff{$length}) {
								{ lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); }
							}

							my $diff = levenshtein($substr1, $substr2 );
							if($diff < $maxdiff{$length}) {
								$conc = 1;
								last;
							}
						}
					}
				}

				if($conc) {
					# remove any trailing Ns
					$trimmed1 = $seq1;
					$trimmed2 = $seq2;
					$trimmed1 =~ s/N*$//;
					$trimmed2 =~ s/^N*//;
					$qtrim1 = $qual1;
					$qtrim2 = $qual2;
					$qtrim1 = substr($qtrim1, 0, length($trimmed1));
					$qtrim2 = substr($qtrim2 , -length($trimmed2));
				}

				if($strand eq "R") {
					my $trimmed1_tmp = revcompl($trimmed1);
					my $qtrim1_tmp = reverse($qtrim1);
					$trimmed1 = revcompl($trimmed2);
					$qtrim1 = reverse($qtrim2);
					$trimmed2 = $trimmed1_tmp;
					$qtrim2 = $qtrim1_tmp;
					my $index_read_tmp = $index_read1;
					$index_read1 = $index_read2;
					$index_read2 = $index_read_tmp;
				}

				if($conc) {
					my $align;
					my $search1 = $trimmed1;
					$search1 =~ s/N/./g;
					my $search2 = $trimmed2;
					$search2 =~ s/N/./g;
					if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) {
						# the read matches exactly to the reference
						$align{Ref1} = $$refSeq{$contig};
						$align{Read1} = $$refSeq{$contig};
						$align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
						$align{Read1} =~ s/[^X]/-/g;
						$align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ });
						$align{Read2} = $$refSeq{$contig};
						$align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/;
						$align{Read2} =~ s/[^X]/-/g;
						$align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ });

						# add aligned reads to queue
						my @array;
						{
							lock($Aligned); # Keep other threads from changing the queue's contents
							@array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
							$Aligned->enqueue(\@array);
						}

						# add alignment to cache
						{
							lock(%cacheAlign);
							$cacheAlign{Conc}{$name} = &share( [] );
							@{ $cacheAlign{Conc}{$name} } = @array;
						}
					} elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
						my @array;
						{ lock(%cacheAlign);  @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
						$array[0] = $read1;
						$array[3] = $qtrim1;
						$array[6] = $qtrim2;
						{ lock($cacheAlignHits); $cacheAlignHits++ };

						# add aligned reads to queue
						{
							lock($Aligned); # Keep other threads from changing the queue's contents
							$Aligned->enqueue(\@array);
						}
					} else {

						# align reads using NW2
						{
							lock($ToAlign); # Keep other threads from changing the queue's contents
							my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc);
							#my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2");
							$ToAlign->enqueue(\@array);
						}
					}
				}

				if(!$conc && $index_read1 > -1) {
					my $search1 = $trimmed1;
					$search1 =~ s/N/./g;
					if($$refSeq{$contig} =~ /$search1/) {
						# the read matches exactly to the reference
						$align{Ref1} = $$refSeq{$contig};
						$align{Read1} = $$refSeq{$contig};
						$align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
						$align{Read1} =~ s/[^X]/-/g;
						$align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ });

						# add aligned reads to queue
						{
							lock($Aligned); # Keep other threads from changing the queue's contents
							my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
							$Aligned->enqueue(\@array);
						}
					} elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) {
						my @array;
						{ lock(%cacheAlign);  @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } };
						$array[0] = $read1;
						$array[3] = $qtrim1;
						{ lock($cacheAlignHits); $cacheAlignHits++ };

						# add aligned reads to queue
						{
							lock($Aligned); # Keep other threads from changing the queue's contents
							$Aligned->enqueue(\@array);
						}
					} else {
						# align read using NW2
						{
							lock($ToAlign); # Keep other threads from changing the queue's contents
							my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc);
							#my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1);
							$ToAlign->enqueue(\@array);
						}
					}
				}

				if(!$conc && $index_read2 > -1) {
					my $search2 = $trimmed2;
					$search2 =~ s/N/./g;
					if($$refSeq{$contig} =~ /$search2/) {
						# the read matches exactly to the reference
						$align{Ref2} = $$refSeq{$contig};
						$align{Read2} = $$refSeq{$contig};
						$align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/;
						$align{Read2} =~ s/[^X]/-/g;
						$align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ });

						# add aligned reads to queue
						{
							lock($Aligned); # Keep other threads from changing the queue's contents
							my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
							$Aligned->enqueue(\@array);
						}
					} elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) {
						my @array;
						{ lock(%cacheAlign);  @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } };
						$array[0] = $read2;
						$array[6] = $qtrim2;
						{ lock($cacheAlignHits); $cacheAlignHits++ };

						# add aligned reads to queue
						{
							lock($Aligned); # Keep other threads from changing the queue's contents
							$Aligned->enqueue(\@array);
						}
					} else {
						# align read using NW2
						{
							lock($ToAlign); # Keep other threads from changing the queue's contents
							my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc);
							#my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2);
							$ToAlign->enqueue(\@array);
						}
					}
				}
			}

			if($index_read1 < 0) {
				# discard read 1
				# send to writebamfile thread
				{
					lock($WriteBam); # Keep other threads from changing the queue's contents
					$WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n");
				}
				{ lock(%stats); $stats{PoorQual1}++ };
			}

			if($index_read2 < 0) {
				# discard read 2
				# send to writebamfile thread
				{
					lock($WriteBam); # Keep other threads from changing the queue's contents
					$WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n");
				}
				{ lock(%stats); $stats{PoorQual2}++ };
			}

			if($index_read1 == -1 && $index_read2 == -1) {
				{ lock(%stats); $stats{Amb}++ };
			} elsif($index_read1 == -2 && $index_read2 == -2) {
				{ lock(%stats); $stats{PoorQualMates}++ };
			}
		}

		# grab the next reads in queue
		{
			lock($FindContig); # Keep other threads from changing the queue's contents
			$readArrayRef = $FindContig->dequeue();
		}
	}
	print "Closing Find Amplicon Thread\n";
	return;
}

sub writebamfile {
	my $bamfile = shift;
	my $WriteBam = shift;

	# write samfile header
	open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n";
	foreach my $key (@OrderedContigs) {
		print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n";
	}
	print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n";

	my $read;
	{
		lock($WriteBam); # Keep other threads from changing the queue's contents
		$read = $WriteBam->dequeue();
	}
	while(defined $read) {

		# write read to bamfile
		print PIPE_TO_SAMTOOLS $read;

		# grab next item in queue
		{
			lock($WriteBam); # Keep other threads from changing the queue's contents
			$read  = $WriteBam->dequeue();
		}
	}
	print "Closing Bam Writer\n";
	close PIPE_TO_SAMTOOLS or die "pipe could not be closed";
	return;
}

sub thread_process_align {
	my $Aligned = shift;
	my $refFrags = shift;
	my $refSeq = shift;
	my $Samfile = shift;
	my $WriteBam = shift;

	my $readArrayRef;
	{
		lock($Aligned); # Keep other threads from changing the queue's contents
		$readArrayRef = $Aligned->dequeue();
	}
	while(defined $readArrayRef) {
		# grab the alignment
		my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
		# grab primer lengths
		my $PU = $$refFrags{primers}{$contig}{U}{F}{length};
		my $PD = $$refFrags{primers}{$contig}{D}{F}{length};
		my %readStats;
		my $orig_read1 = $read1;
		my $orig_read2 = $read2;
		my $orig_qtrim1 = $qtrim1;
		my $orig_qtrim2 = $qtrim2;
		$orig_read1 =~ s/-//g if defined $orig_read1;
		$orig_read2 =~ s/-//g if defined $orig_read2;

		if(defined $OPTIONS{d}) {
			# debug info
			print "Alignment With Primers\n";
			print "$ref1\n$read1\n" if defined $read1;
			print "$ref2\n$read2\n" if defined $read2;
			print "\nFinal Alignment:\n";
		}

		# calculate where primers begin and end
		if(defined $read1) {
			if($strand eq 'F') {
				$readStats{$contig}{Read1} = 1;
			} else {
				$readStats{$contig}{Read2} = 1;
			}

			# determine the primer locations
			my $offset = 0;
			for(my $occurance = 0; $occurance < $PU; $occurance++) {
				substr($ref1, $offset) =~ /^.*?([^-])/;
				$offset += $-[1] + 1;
			}
			my $length = 0;
			for(my $occurance = 0; $occurance < $PD; $occurance++) {
				substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/;
				$length = length($ref1) - $-[1];
			}
			$length = length($ref1) - $length - $offset;

			# remove primers and adjust quality scores
			$ref1 = substr($ref1, $offset, $length);
			my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// );
			$read1 = substr($read1, $offset, $length);
			$length = ( $read1 =~ tr/ACTGN// );
			$qtrim1 = substr($qtrim1, $qualOffset, $length);
		}

		if(defined $read2) {
			if($strand eq 'F') {
				$readStats{$contig}{Read2} = 1;
			} else {
				$readStats{$contig}{Read1} = 1;
			}

			# determine the primer locations
			my $offset = 0;
			for(my $occurance = 0; $occurance < $PU; $occurance++) {
				substr($ref2, $offset) =~ /^.*?([^-])/;
				$offset += $-[1] + 1;
			}
			my $length = 0;
			for(my $occurance = 0; $occurance < $PD; $occurance++) {
				substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/;
				$length = length($ref2) - $-[1];
			}
			$length = length($ref2) - $length - $offset;

			# remove primers and adjust quality scores
			$ref2 = substr($ref2, $offset, $length);
			my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// );
			$read2 = substr($read2, $offset, $length);
			$length = ( $read2 =~ tr/ACTGN// );
			$qtrim2 = substr($qtrim2, $qualOffset, $length);
		}

		# generate a concensus sequence if we have both read1 and read2 and they overlap
		if($conc) {
			# the following are 0 based indexes
			my $R1qual = 0; # index for sanger qual read 1
			my $R2qual = 0; # index for sanger qual read 2
			my $aindex = 0; # index for alignment
			my $q1;
			my $q2;
			my $concensus;
			my $concqual;

			$readStats{$contig}{ConcReads} = 1;
			$readStats{$contig}{ConcCorrectBases} = 0;
			$readStats{$contig}{ConcErrorsBases} = 0;
			while($aindex < length($ref1)) {
				# base1 is current base from read 1
				# base2 is current base from read 2
				my $base1 = substr($read1, $aindex, 1);
				my $base2 = substr($read2, $aindex, 1);

				# grab the next quality score
				if($R1qual < length($qtrim1)) {
					$q1 = substr($qtrim1, $R1qual, 1);
				}
				if($R2qual < length($qtrim2)) {
					$q2 = substr($qtrim2, $R2qual, 1);
				}

				# if we have a base in either
				if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) {
					if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') {
						# we are only in read 1 or read 2 is a N
						$concensus .= $base1;
						$concqual .= $q1;
						$R1qual++;
						if($base2 eq 'N') {
							$R2qual++;
						}
					} elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') {
						# we are only in read 2 or read 1 is a N
						$concensus .= $base2;
						$concqual .= $q2;
						$R2qual++;
						if($base1 eq 'N') {
							$R1qual++;
						}
					} else {
						# we are in the overlap region
						my $maxqual = sangerMax($q1, $q2);
						if($base1 eq $base2) {
							# both reads agree
							$readStats{$contig}{ConcCorrectBases}++;
							$concensus .= $base1;
							$concqual .= $maxqual;
						} elsif($q1 eq $q2) {
							# reads have different bases but the same score
							$readStats{$contig}{ConcErrorsBases}++;
							$concensus .= 'N';
							$concqual .= '#';
						} elsif($q1 eq $maxqual) {
							# reads have different bases but read 1 has the best score
							$concensus .= $base1;
							$concqual .= $maxqual;
							$readStats{$contig}{ConcErrorsBases}++;
						} else {
							# reads have different bases but read 2 has the best score
							$concensus .= $base2;
							$concqual .= $maxqual;
							$readStats{$contig}{ConcErrorsBases}++;
						}
						$R1qual++;
						$R2qual++;
					}
				} elsif($base1 eq 'N' || $base2 eq 'N') {
					if($base2 eq '-') {
						# one is a deletion the other is an N
						$concensus .= $base1;
						$concqual .= $q1;
						$R1qual++;
					} elsif($base1 eq '-') {
						# one is a deletion the other is an N
						$concensus .= $base2;
						$concqual .= $q2;
						$R2qual++;
					} elsif($base1 eq 'N' && $base2 eq 'N') {
						# both reads are an N
						$concensus .= 'N';
						$concqual .= sangerMax($q1, $q2);
						$R1qual++;
						$R2qual++;
					}
				} else {
					$concensus .= '-';
				}
				$aindex++;
			}
			$read1 = $concensus;
			$qtrim1 = $concqual;
			$read2 = undef;
			$qtrim2 = undef;
		}

		if(defined $read1) {
			# remove any trailing - and send to samfile_line
			$read1 =~ s/^-*?([^-].*)/$1/;
			$ref1 = substr($ref1, -length($read1));
			$read1 =~ s/(.*[^-])-*?$/$1/;
			$ref1 = substr($ref1, 0, length($read1));

			my $tmp = $ref1;
			$tmp =~ s/-//g;
			my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1;

			# generate sam file line
			{
				lock($Samfile); # Keep other threads from changing the queue's contents
				my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
				$Samfile->enqueue(\@array);
			}
		}

		if(defined $read2) {
			# remove any trailing - and send to samfile_line
			$read2 =~ s/^-*?([^-].*)/$1/;
			$ref2 = substr($ref2, -length($read2));
			$read2 =~ s/(.*[^-])-*?$/$1/;
			$ref2 = substr($ref2, 0, length($read2));

			my $tmp = $ref2;
			$tmp =~ s/-//g;
			my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1;

			# generate sam file line
			{
				lock($Samfile); # Keep other threads from changing the queue's contents
				my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
				$Samfile->enqueue(\@array);
			}
		}

		if(defined $OPTIONS{d}) {
			# debug info
			print "$ref1\n$read1\n" if defined $read1;
			print "$ref2\n$read2\n" if defined $read2;
			print "$qtrim1\n" if defined $read1;
			print "$qtrim2\n" if defined $read2;
			print "\n\n";
		}

		# grab next item in queue
		{
			lock($Aligned); # Keep other threads from changing the queue's contents
			$readArrayRef = $Aligned->dequeue();
		}
	}
	print "Closing Alignment Processor\n";
	return;
}

# function to take reverse compliment
sub revcompl {
  my $string = shift @_;
  my $revcomp = reverse($string);
  $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/;
  return $revcomp;
}

# function to convert sanger fastq quality scores to phred scores
sub sanger2phred {
  my $sang = shift @_;
  my @sanger = split("", $sang);
  my @phred;
  foreach my $q (@sanger) {
  	my $Q = ord($q) - 33;
  	push @phred, $Q;
  }
  return @phred;
}

# function to return the largest sanger quality score
sub sangerMax {
	my ($s1, $s2) = @_;
	my $Q1 = ord($s1) - 33;
	my $Q2 = ord($s2) - 33;
	if($Q1 > $Q2) {
		return $s1;
	} else {
		return $s2;
	}
}

# function to read a fasta file as a hash
sub read_fasta_file {
   my ($fasta_file, $seqs) = @_;
   my $header;
   my $seq = "";
   open (IN, $fasta_file) or die "can't open $fasta_file: $!\n";
   while (my $line = <IN>) {
      chomp $line;
      if (grep />/, $line) {
         if ($seq ne "") {
            $seqs->{$header} = $seq;
         }
	 $header = $line;

         $header =~ s/^>//; # remove ">"
         $header =~ s/\s+$//; # remove trailing whitespace
         $seq = ""; # clear out old sequence
      } else {
         $line =~ s/\s+//g; # remove whitespace
         $seq .= $line; # add sequence
      }
   }
   close IN; # finished with file

   if ($seq ne "") { # handle last sequence
      $seqs->{$header} = $seq;
   }

   return;
}

# minimal element of a list
#
sub min
{
    my @list = @{$_[0]};
    my $min = $list[0];

    foreach my $i (@list)
    {
        $min = $i if ($i < $min);
    }

    return $min;
}

# maximal element of a list
#
sub max
{
    my @list = @{$_[0]};
    my $max = $list[0];

    foreach my $i (@list)
    {
        $max = $i if ($i > $max);
    }

    return $max;
}

# trim sequences using the BWA algorithm
sub trim {

	my ($s, $q, $threshold, $read) = @_;  # read sequence and quality scores
	if($read == 2) {
		$q = reverse($q);
		$s = reverse($s);
	}

	my @array  = sanger2phred($q);
	my $length = scalar @array;

	# only calculate if quality fails near end
	if( $array[$#array] >= $threshold ){
		if($read == 2) {
			$q = reverse($q);
			$s = reverse($s);
		}
		return ($s, $q);
	}

	# run bwa equation
	my @arg;
	for( my $i = 0; $i < $length - 1; $i++ ){
		my $x = $i + 1;
		for( my $j = $x; $j < $length; $j++ ){
			$arg[$x] += $threshold - $array[$j];
		}
	}

	# find number of 5' bases to retain
	my $index = 0;
	my $maxval = 0;
	for ( 1 .. $#arg ){
		if ( $maxval < $arg[$_] ){
			$index = $_;
			$maxval = $arg[$_];
    		}
	}

	# trim reads
	$s = substr($s,0,$index);
	$q = substr($q,0,$index);
	if($read == 2) {
		$s = reverse($s);
		$q = reverse($q);
	}

	# return seq and quality
	return ($s, $q);
}


sub samfile_line {
	# grab the variables
	my ($Samfile, $WriteBam, $refSeq) = @_;
	my %SamStats;
	my %minLength;
	my %maxLength;
	my $readArrayRef;
	{
		lock($Samfile); # Keep other threads from changing the queue's contents
		$readArrayRef = $Samfile->dequeue();
	}

	while(defined $readArrayRef) {
		my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef };
		my $seq = $align;
		$seq =~ s/-//g;

		# calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1
		my $sam;
		my $s1;
		my $s2;
		my @s1 = split("", $align);
		my @s2 = split("", $ref);

		my $indel = 0;
		for(my $i = 0; $i <= $#s1; $i++) {
			if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) {
				$s1 .= $s1[$i];
				$s2 .= $s2[$i];
				$indel = 0;
				if($s1[$i] eq "-" || $s2[$i] eq "-") {
					$indel = 1;
				}
			}
		}
		my $mq = levenshtein($s1, $s2);

		my $cl = length($seq);
		# calculate max diff allowed
		if(!defined $maxdiff{$cl}) {
			{ lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); }
		}

		if($mq > $maxdiff{$cl} || $cl == 0) {
			{ lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; }
			if(defined $orig_read1) {
				$sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n";
				# send to writebamfile thread
				{
					lock($WriteBam); # Keep other threads from changing the queue's contents
					$WriteBam->enqueue($sam);
				}
			}
			if(defined $orig_read2) {
				$read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/;
				$sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n";
				# send to writebamfile thread
				{
					lock($WriteBam); # Keep other threads from changing the queue's contents
					$WriteBam->enqueue($sam);
				}
			}
		} else {
			$mq = 60;

			# add any stats from earlier
			foreach my $key1 (keys %$readStats) {
				foreach my $key2 (keys % { $$readStats{$key1} }) {
					$SamStats{$key1}{$key2} += $$readStats{$key1}{$key2};
				}
			}

			# keep track of quality scores
			$SamStats{$reg}{Bases} += $cl;
			$SamStats{$reg}{Quality} += $_ for sanger2phred($qual);

			# keep track of read lengths
			if(defined $minLength{$reg}) {
				$minLength{$reg} = min([$cl, $minLength{$reg}]);
				$maxLength{$reg} = max([$cl, $maxLength{$reg}]);
			} else {
				$minLength{$reg} = $cl;
				$maxLength{$reg} = $cl;
			}

			# keep track of strand and bases seen
			if($strand eq 'F') {
				$SamStats{$reg}{NumberForward}++;
			} else {
				$SamStats{$reg}{NumberReverse}++;
			}
			$SamStats{$reg}{TotalA} += ($seq =~ tr/A//);
			$SamStats{$reg}{TotalC} += ($seq =~ tr/C//);
			$SamStats{$reg}{TotalG} += ($seq =~ tr/G//);
			$SamStats{$reg}{TotalT} += ($seq =~ tr/T//);
			$SamStats{$reg}{TotalN} += ($seq =~ tr/N//);

			# keep count the number of reads for each contig
			$SamStats{$reg}{Reads}++;

			# calculate strand info
			if($strand eq 'F') {
				$strand = 0;
			} else {
				$strand = 16;
			}
			# calculate cigar
			my $cigar = "";
			my $M = 0;
			my $D = 0;
			my $I = 0;
			my $Subs = 0;
			for(my $i = 0; $i < length($align); $i++) {
				if(substr($align, $i, 1) eq "-") {
					$D++;
					if($M) {
						$cigar .= "${M}M";
						$M = 0;
					} elsif($I) {
						$cigar .= "${I}I";
						$I = 0;
					}
				} elsif(substr($ref, $i, 1) eq "-") {
					$I++;
					if($M) {
						$cigar .= "${M}M";
						$M = 0;
					} elsif($D) {
						$cigar .= "${D}D";
						$D = 0;
					}
				} else {
					$M++;
					if($D) {
						$cigar .= "${D}D";
						$D = 0;
					} elsif($I) {
						$cigar .= "${I}I";
						$I = 0;
					}
					$Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1);
				}
			}
			if($M) {
				$cigar .= "${M}M";
			} elsif($I) {
				$cigar .= "${I}I";
			} elsif($D) {
				$cigar .= "${D}D";
			}

			# construct sam file line
			$sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n";

			# send to writebamfile thread
			{
				lock($WriteBam); # Keep other threads from changing the queue's contents
				$WriteBam->enqueue($sam);
			}
		}

		#grab the next read
		{
			lock($Samfile); # Keep other threads from changing the queue's contents
			$readArrayRef = $Samfile->dequeue();
		}
	}

	{
		# add stats from earlier
		lock(%stats);
		foreach my $key1 (keys %SamStats) {
			$stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]);
			$stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]);
			foreach my $key2 (keys % { $SamStats{$key1} }) {
				$stats{$key1}{$key2} += $SamStats{$key1}{$key2};
			}
		}

	}
	print "Closing Sam File Line Generator\n";

	return;
}

# perform the alignment
sub nwAlign {
	my ($ToAlign, $Aligned, $regions) = @_;
	my $readArrayRef;
	{
		lock($ToAlign); # Keep other threads from changing the queue's contents
		$readArrayRef = $ToAlign->dequeue();
	}

	while(defined $readArrayRef) {
		# we have picked a contig
		# check strand and take rev comp of ref if necessary
		my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
		my %align;
		my $name;
		#! Parameters
		#! ClustalW Parameters
		my $matchScore = 1.9;
		my $missmatch_penalty = -0.4;
		my $gapPenOpen = -10.0;
		my $gapPenExt = -0.4;
		my $leftAndRightGapPen = true;
		my $debug = false;
		my $cons = "";
		my $align1= "";
		my $align2= "";
		my $align3= "";
		my $score = 0.0;
		my $singleAlign = false; #aligning 2 or 3 sequences
		if($conc)
		{
			#got to concatenate seqs and align
			$score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug);
			$name = "$left.$right";
			print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig};
		}
		elsif(!defined $right)
		{
			#align with left
			$score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug);
			$singleAlign = true;
			$name = $left;
			print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
		}
		elsif(!defined $left)
		{
			#align with right
			$score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug);
			$singleAlign = true;
			$name = $right;
			print "$contig\n$align1\n$align2\n\n"  if defined $$regions{$contig};
		}

		# add aligned reads to queue
		# cache results
		my $type;
		{
			lock($Aligned); # Keep other threads from changing the queue's contents

			my @array;
			if($singleAlign && !defined $right)
			{
				@array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
				{
					lock(%cacheAlign);
					$cacheAlign{$contig}{R1}{$name} = &share( [] );
					@{ $cacheAlign{$contig}{R1}{$name} } = @array;
				}
			}
			elsif($singleAlign && !defined $left )
			{
				@array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc);
				{
					lock(%cacheAlign);
					$cacheAlign{$contig}{R2}{$name} = &share( [] );
					@{ $cacheAlign{$contig}{R2}{$name} } = @array;
				}
			}
			else
			{
				@array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc);
				{
					lock(%cacheAlign);
					$cacheAlign{Conc}{$name} = &share( [] );
					@{ $cacheAlign{Conc}{$name} } = @array;
				}
			}
			$Aligned->enqueue(\@array);
		}

		#grab the next read
		{
			lock($ToAlign); # Keep other threads from changing the queue's contents
			$readArrayRef = $ToAlign->dequeue();
		}
	}
	# clean up
	print "Closing NW2 Aligner\n";
	return;
}

# function to calculat the minimum edit distance threshold (taken from bwa)
sub maxdiff {
	my ($l, $err, $thres) = @_;
	my $lambda = exp(-$l*$err);
	my $y = 1.0;
	my $x = 1;
	my $sum = $lambda;
	for(my $k = 1; $k < 1000; $k++) {
		$y *= $l * $err;
		$x *= $k;
		$sum += $lambda * $y / $x;
		if(1.0 - $sum < $thres) {
			return $k;
		}
	}
	return 2;
}

# using approx matching return the positions in $list of those most closely matching $string
sub amatch {
	my ($string, $list) = @_;
	my @d = adistr($string, @$list);
        my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list;

        return if $d[$d_index[0]] > 0.25;

	my @pos = ($d_index[0]);
	my $i = 1;
        while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) {
        	push @pos, $d_index[$i];
         	$i++;
        }
 	return @pos;
}

# a sorting algorithm to sort the chromosomes
sub sortChrom {
	my ($achr, $astart, $aend) = split(":|-", $a);
	my ($bchr, $bstart, $bend) = split(":|-", $b);
	$achr =~ s/X/23/;
	$achr =~ s/Y/24/;
	$achr =~ s/MT?/25/;
	$bchr =~ s/X/23/;
	$bchr =~ s/Y/24/;
	$bchr =~ s/MT?/25/;
	if($achr < $bchr) {
		return -1;
	} elsif($bchr < $achr) {
		return 1;
	} elsif($astart < $bstart) {
		return -1;
	} elsif($bstart < $astart) {
		return 1;
	} elsif($aend < $bend) {
		return -1;
	} elsif($bend < $aend) {
		return 1;
	} else {
		return 0;
	}
}

# pass a code reference
sub modify {
  my($text, $code) = @_;
  $code->($text);
  return $text;
}

sub split_region {
	my ($chr, $start, $end) = split(":|-", shift);
	$end = $start unless defined $end;
	return($chr, $start, $end);
}

sub error_rate {
	my ($x, $y) = @_;
	my @x = split("", $x);
	my @y = split("", $y);
	my $s = 0;
	my $l = 0;
	for(my $i = 0; $i <= $#x; $i++) {
		if($x[$i] ne "-" && $y[$i] ne "-") {
			$s++ if $x[$i] ne $y[$i];
			$l++;
		}
	}
	return($s / $l);
}
author	fcaramia
date	Wed, 09 Jan 2013 00:45:53 -0500
parents	d32bddcff685
children