Mercurial > repos > fcaramia > custom_amplicon_alignment
view alignCustomAmplicon/alignCustomAmplicon.pl @ 5:0aaf65fbb48a draft default tip
Uploaded
author | fcaramia |
---|---|
date | Wed, 20 Mar 2013 00:22:08 -0400 |
parents | 22f35f830f48 |
children |
line wrap: on
line source
#!/usr/bin/perl # script to convert paired end reads to a concensus sequence # Created by Jason Ellul, Feb 2012 # Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012 # 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly) # Added missalignment penalty # Read 1 and 2 are now aligned first then they are aligned to the reference # 0.5 Added ability to name amplicons by includinfo the info in the primers file # Added the ability to output the alignments in a particular region using the new flag -a # Added some extra stats to the output use strict; use warnings; use Getopt::Std; use File::Basename; use File::Find; use File::Copy; use File::MMagic; use File::Temp qw/ tempfile tempdir /; use Cwd 'abs_path'; use threads; use threads::shared; use List::Util qw(sum); use Thread::Queue; use String::Approx 'adistr'; use POSIX qw(floor ceil); String::Approx::cache_max(25000); String::Approx::cache_purge(0.25); use constant false => 0; use constant true => 1; use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"'; use Inline CPP => <<'END_CPP'; int levenshtein(char* seq1, char* seq2) { // Returns edit distance of 2 sequences string sam1(seq1); string sam2(seq2); return levenshteinDistance(sam1, sam2); } double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug ) { // Returns Sequence alignment of 2 sequences // res1, res2 contain the alignment //seq1: reference sequence //seq2: amplicon sequence //matchScore: reward for matching a nucleotide //gapPenOpen: penalty for opening a gap //gapPenext: penalty for extending a gap //noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment //noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment //debug: prints scores, alignments, etc... string sam1(seq1); string sam2(seq2); double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug); //Setting the return alignments sv_setpvn(res1, (char*)sam1.c_str(), sam1.length()); sv_setpvn(res2, (char*)sam2.c_str(), sam2.length()); return ret; } void adjustSequence(int refSize, string& seq, char dir) { int adjust = refSize - seq.length(); if(dir == 'r') for(int i=0; i<adjust; ++i) seq = '-' + seq; else for(int i=0; i<adjust; ++i) seq += '-'; } double align3(char* ref, char* right,char* left , SV* res1, SV* res2, SV* res3, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool leftAndRightGapPen, bool debug ) { // Returns Sequence alignment of 3 sequences // res1, res2, res3 contain the alignment //ref: reference sequence //left: left amplicon sequence //right: right amplicon sequence //matchScore: reward for matching a nucleotide //gapPenOpen: penalty for opening a gap //gapPenext: penalty for extending a gap //leftAndRightGapPen: Smart way to penalise according to the amplicon //debug: prints scores, alignments, etc... string leftSeq(left); string rightSeq(right); bool noFrontGapPenalty = false; bool noEndGapPenalty = false; //Align right with left reads if(leftAndRightGapPen) { noFrontGapPenalty = true; noEndGapPenalty = true; } //first alignment the 2 reads double ret = nw_align2(rightSeq, leftSeq, matchScore, missmatch_penalty*8, gapPenOpen*5, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug); //Setting the return alignments vector <string> msa; vector <string> partial; vector <string> seq; partial.push_back(rightSeq); partial.push_back(leftSeq); seq.push_back(ref); msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug); sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length()); sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length()); sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length()); return ret; } END_CPP $| = 1; # Grab and set all options my $version = "0.5"; my %OPTIONS = (p => 1); getopts('a:dl:L:o:p:rsv', \%OPTIONS); die qq(version $version Usage: alignCustomAmplicon.pl [OPTIONS] <ref> <fastq[.gz] 1> <fastq[.gz] 2> <primers> OPTIONS: -a STR print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop] -d print debug info -l STR filename of the log file [null] -L STR append to an existing log file [null] -o STR filename for bam file -p INT number of threads [$OPTIONS{p}] -r do not create stats report -s sort the bam file -v verbose progress -j STR picard jar ) if(@ARGV != 4); my $Script = 'alignCustomAmplicon'; my $ref = shift @ARGV; my $fastq1 = shift @ARGV; my $fastq2 = shift @ARGV; my $primers = shift @ARGV; my $now; #create log file if(defined $OPTIONS{l}) { my $logfile = $OPTIONS{l}; open (FH, ">$logfile") or die 'Couldn\'t create log file!\n'; close (FH); # Open the log file and redirect output to it open (STDERR, ">>$logfile"); open (STDOUT, ">>$logfile"); my $now = localtime time; print "Log File Created $now\n\n"; } elsif(defined $OPTIONS{L}) { #append to a log file my $logfile = $OPTIONS{L}; # Open the log file and redirect output to it open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n'; open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n'; my $now = localtime time; print "Appending To Log File $now\n\n"; } if (defined $OPTIONS{j}) { my $JAR = $OPTIONS{j}; } # print version of this script print "Using $Script version $version \n\n"; if(defined $OPTIONS{d}) { print "Setting threads to 1 as using debug\n"; $OPTIONS{p} = 1; } # grab sample name, set output file name and open some files my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz')); my $sampleName :shared = $name; $sampleName =~ s/^(.*?_.*?)_.*/$1/; open(PRIM, $primers) or die "can't open $primers: $!\n"; # set samfile name my $bamfile_out = "${sampleName}_aligned.bam"; if(defined $OPTIONS{o}) { $bamfile_out = $OPTIONS{o}; } my $bam_to_sort = $bamfile_out; if(defined $OPTIONS{s}) { $bam_to_sort .= '.tmp'; } my $stats = $bamfile_out; if(!defined $OPTIONS{r}) { $stats =~ s/\..*?$//; open(STATS, ">${stats}_stats.csv"); } # read in primer lengths my %refFrags; $refFrags{min} = 5000000; while(my $line = <PRIM>) { chomp $line; my @line_sp = split("\t", $line); $refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1]; $refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2]; $refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2]; $refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1]; $refFrags{name}{$line_sp[0]} = $line_sp[3]; $refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]); } close(PRIM); my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} }; my %regions; if(defined $OPTIONS{a}) { my ($chr, $start, $end) = split_region($OPTIONS{a}); foreach my $reg (@OrderedContigs) { my ($chr_reg, $start_reg, $end_reg) = split_region($reg); $regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start; } } # grab fasta file my %refSeq; read_fasta_file($ref, \%refSeq); # calculate read length and read group my $readGroup :shared; my $readLength; my $mm = new File::MMagic; my $filetype = $mm->checktype_filename(abs_path($fastq1)); if($filetype eq "application/x-gzip") { my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`; chomp $sampleRead; $readLength = length($sampleRead); $readGroup = `gunzip -c $fastq1 | head -n 1`; chomp $readGroup; $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/; } elsif($filetype eq "text/plain") { my $sampleRead = `head -n 2 $fastq1 | tail -n 1`; chomp $sampleRead; $readLength = length($sampleRead); $readGroup = `head $fastq1 -n 1`; chomp $readGroup; $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/; } else { die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n"); } # create hash to keep track of maxdiff and readlengths my %maxdiff :shared; # create hash to keep track of stats my %stats :shared; $stats{Amb} = 0; $stats{PoorQualMates} = 0; $stats{PoorMapQual} = 0; $stats{PoorQual1} = 0; $stats{PoorQual2} = 0; # setup other shared variables and caches my %cache :shared; my $cacheHits :shared = 0; my %cacheAlign :shared; $cacheAlign{Conc} = &share( {} ); my $cacheAlignHits :shared = 0; my %refLength :shared; # extract primers and fragments on forward and reverse strand and initialize counts foreach my $key (@OrderedContigs) { push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length}); push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length}); push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length})); push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length})); push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length}); push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length}); push @{ $refFrags{reg} }, ($key, $key); push @{ $refFrags{strand} }, ('F', 'R'); # calculate new contig name with primers removed my ($chr, $start, $stop) = split(":|-", $key); $start += $refFrags{primers}{$key}{U}{F}{length}; $stop -= $refFrags{primers}{$key}{D}{F}{length}; $refLength{$key} = &share( {} ); $refLength{$key}{newReg} = "$chr:$start-$stop"; $refLength{$key}{length} = $stop - $start + 1; $cacheAlign{$key} = &share( {} ); $cacheAlign{$key}{R1} = &share( {} ); $cacheAlign{$key}{R2} = &share( {} ); $stats{$key} = &share( {} ); $stats{$key}{Reads} = 0; $stats{$key}{Bases} = 0; $stats{$key}{Quality} = 0; $stats{$key}{Mapping} = 0; $stats{$key}{Read1} = 0; $stats{$key}{Read2} = 0; $stats{$key}{ConcMapping} = 0; $stats{$key}{ConcReads} = 0; $stats{$key}{NumberForward} = 0; $stats{$key}{NumberReverse} = 0; $stats{$key}{ConcErrorsBases} = 0; $stats{$key}{ConcCorrectBases} = 0; $stats{$key}{MinLength} = 100000; $stats{$key}{MaxLength} = 0; $stats{$key}{TotalA} = 0; $stats{$key}{TotalC} = 0; $stats{$key}{TotalG} = 0; $stats{$key}{TotalT} = 0; $stats{$key}{TotalN} = 0; $stats{$key}{PoorMapQual} = 0; } # start up worker threads my $FindContig = Thread::Queue->new(); my $ToAlign = Thread::Queue->new(); my $Aligned = Thread::Queue->new(); my $Samfile = Thread::Queue->new(); my $WriteBam = Thread::Queue->new(); my @threadArray; for(my $index = 1; $index <= $OPTIONS{p}; $index++) { push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam); push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]); push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]); } push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions); push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam); # read in each sequence and its mate from the fastq file my $read1; my $read2; my $seq1; my $seq2; my $qual1; my $qual2; my $desc; my @strand; my $cont = 1; # open fastq files my $gz1; my $gz2; my $f1gz = 0; my $f2gz = 0; if($filetype eq "application/x-gzip") { open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n"; } else { open(FQ1, $fastq1) or die "$fastq1: $!"; } $filetype = $mm->checktype_filename(abs_path($fastq2));; if($filetype eq "application/x-gzip") { open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n"; } elsif($filetype eq "text/plain") { open(FQ2, $fastq2) or die "$fastq2: $!"; } else { die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n"); } # add reads to the queue my $readNo = 0; while ($cont) { if($read1 = <FQ1>) { # read 1 and 2 read name $seq1 = <FQ1>; # read 1 sequence $desc = <FQ1>; # read 1 desc $qual1 = <FQ1>; # read 1 quality } else { last; } if($read2 = <FQ2>) { # read 2 read name $seq2 = <FQ2>; # read 2 sequence $desc = <FQ2>; # read 2 sdesc $qual2 = <FQ2>; # read 2 quality } else { last; } # read 1 the forward read chomp $read1; chomp $seq1; chomp $qual1; $read1 =~ s/^\@(.*)\s?.*/$1/; # read 2 the reverse read chomp $read2; chomp $seq2; $seq2 = revcompl($seq2); chomp $qual2; $qual2 = reverse($qual2); $read2 =~ s/^\@(.*)\s?.*/$1/; $readNo++; # keep at most 20 reads in buffer while(1) { if(defined $OPTIONS{v}) { my $percAmb = $stats{Amb} / $readNo * 100; my $percPoor = $stats{PoorQualMates} / $readNo * 100; my $percPoorMap = $stats{PoorMapQual} / $readNo * 100; my $percPoor1 = $stats{PoorQual1} / $readNo * 100; my $percPoor2 = $stats{PoorQual2} / $readNo * 100; $percAmb = sprintf("%.2f", $percAmb); $percPoor = sprintf("%.2f", $percPoor); $percPoorMap = sprintf("%.2f", $percPoorMap); $percPoor1 = sprintf("%.2f", $percPoor1); $percPoor2 = sprintf("%.2f", $percPoor2); print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()." "; } last if $FindContig->pending() <= 100; } my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2); { lock($FindContig); # Keep other threads from changing the queue's contents $FindContig->enqueue(\@readArray); } } close(FQ1); close(FQ2); # wait for all records to be processed and close down threads $FindContig->enqueue( (undef) x $OPTIONS{p} ); while($FindContig->pending() > 0 ) { next; } $ToAlign->enqueue( (undef) ); while($ToAlign->pending() > 0 ) { next; } $Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) ); while($Aligned->pending() > 0 ) { next; } $Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) ); while($Samfile->pending() > 0 ) { next; } $WriteBam->enqueue( undef ); $_->join() for @threadArray; # sort the bam file if(defined $OPTIONS{s}) { system("java -Xmx8g -jar $JAR MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true"); unlink $bam_to_sort; } if(!defined $OPTIONS{r}) { # print out statistics # setup variables my $totalRegionLengths = 0; my $totalReads = 0; my $totalFwd = 0; my $totalRev = 0; my $totalMatch = 0; my $totalMiss = 0; my $totalBases = 0; my $totalRead1 = 0; my $totalRead2 = 0; my $totalA = 0; my $totalC = 0; my $totalG = 0; my $totalT = 0; my $totalN = 0; my $totalConc = 0; my $totalQuality = 0; my $AllmaxRead = 0; my $AllminRead = 100000; my $avequal = 'NA'; my $aveRead = 'NA'; my $perA = 'NA'; my $perC = 'NA'; my $perG = 'NA'; my $perT = 'NA'; my $perN = 'NA'; my $readsUsed = 0; my @coverage; my @off_coverage; # print column headers print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,'; print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n"; # for each region foreach my $key (@OrderedContigs){ # if we have some reads calculate averages if($stats{$key}{Reads} > 0) { $avequal = $stats{$key}{Quality} / $stats{$key}{Bases}; $avequal = sprintf("%.4f", $avequal); $aveRead = $stats{$key}{Bases} / $stats{$key}{Reads}; $aveRead = sprintf("%.4f", $aveRead); $perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100; $perA = sprintf("%.2f", $perA); $perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100; $perC = sprintf("%.2f", $perC); $perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100; $perG = sprintf("%.2f", $perG); $perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100; $perT = sprintf("%.2f", $perT); $perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100; $perN = sprintf("%.2f", $perN); $AllminRead = min([$AllminRead, $stats{$key}{MinLength}]); $AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]); $readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads}; } else { $stats{$key}{MaxLength} = 'NA'; $stats{$key}{MinLength} = 'NA'; } # if we have reads which overlapped calculate an error rate my $errRate = 'NA'; if($stats{$key}{ConcReads} > 0) { $errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases}); } # print the stats for this region print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key}; print STATS "Unknown," unless defined $refFrags{name}{$key}; print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},"; print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},"; print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n"; # add stats to grand totals $totalRegionLengths += $refLength{$key}{length}; $totalQuality += $stats{$key}{Quality}; $totalReads += $stats{$key}{Reads}; $totalFwd += $stats{$key}{NumberForward}; $totalRev += $stats{$key}{NumberReverse}; $totalBases += $stats{$key}{Bases}; $totalRead1 += $stats{$key}{Read1}; $totalRead2 += $stats{$key}{Read2}; $totalA += $stats{$key}{TotalA}; $totalC += $stats{$key}{TotalC}; $totalT += $stats{$key}{TotalT}; $totalG += $stats{$key}{TotalG}; $totalN += $stats{$key}{TotalN}; $totalMatch += $stats{$key}{ConcCorrectBases}; $totalMiss += $stats{$key}{ConcErrorsBases}; $totalConc += $stats{$key}{ConcReads}; # reset ave scores $avequal = 'NA'; $aveRead = 'NA'; $perA = 'NA'; $perC = 'NA'; $perG = 'NA'; $perT = 'NA'; $perN = 'NA'; $readsUsed = 0; if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) { push @off_coverage, $stats{$key}{Reads}; } else { push @coverage, $stats{$key}{Reads}; } } # if we have any reads calculate the averages if($totalReads > 0) { $avequal = $totalQuality / $totalBases; $avequal = sprintf("%.4f", $avequal); $aveRead = $totalBases / $totalReads; $aveRead = sprintf("%.4f", $aveRead); $perA = $totalA / $totalBases * 100; $perA = sprintf("%.2f", $perA); $perC = $totalC / $totalBases * 100; $perC = sprintf("%.2f", $perC); $perG = $totalG / $totalBases * 100; $perG = sprintf("%.2f", $perG); $perT = $totalT / $totalBases * 100; $perT = sprintf("%.2f", $perT); $perN = $totalN / $totalBases * 100; $perN = sprintf("%.2f", $perN); $readsUsed = $totalReads + $totalConc; } else { $AllmaxRead = 'NA'; $AllminRead = 'NA'; } # if we had any reads which overlapped calculate error rate my $errRate = 'NA'; if($totalConc > 0) { $errRate = $totalMiss / ($totalMiss + $totalMatch); } # print out totals my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100); my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100); my $TotalReads = $readNo * 2; my $Unmapped = $TotalReads - $readsUsed; my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100); my $avCov = sprintf("%.0f", sum(@coverage)/@coverage); my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage; my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage); my $pcgreater = sprintf("%.2f", $greater/@coverage * 100); print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,"; print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n"; print STATS "\nNo mate pairs:,$readNo\n"; print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n"; print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n"; print STATS "Total Number Of Reads:,$TotalReads\n"; print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n"; print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n"; print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n"; print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage; print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n"; $greater = scalar (grep { $_ < 200 } @coverage); print STATS "$greater,"; $greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage); print STATS "$greater,"; $greater = scalar (grep { $_ >= 1000 } @coverage); print STATS "$greater\n"; print "Cache hits align contig: $cacheHits\n"; print "Alignment cache hits: $cacheAlignHits\n"; close(STATS); } # close the log file if(defined $OPTIONS{l} || defined $OPTIONS{L}) { my $now = localtime time; print "\nLog File Closed $now\n"; close(STDERR); close(STDOUT); } # thread to find matching amplicon sub thread_find_contig { my $FindContig = shift; my $refFrags = shift; my $refSeq = shift; my $ToAlign = shift; my $Aligned = shift; my $WriteBam = shift; my $readArrayRef; { lock($FindContig); # Keep other threads from changing the queue's contents $readArrayRef = $FindContig->dequeue(); } while(defined $readArrayRef) { my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef }; my $contig = ''; my $readRef = ''; my $strand = ''; # trim the reads my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1); my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2); # if we have already mapped the reads use the results in the cache if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) { my @array; { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } }; $array[0] = $read1; $array[3] = $qtrim1; $array[6] = $qtrim2; { lock($cacheAlignHits); $cacheAlignHits++ }; # add aligned reads to queue { lock($Aligned); # Keep other threads from changing the queue's contents $Aligned->enqueue(\@array); } } else { # Use approx match to find most similar amplicon primer to reads my @index1; my @index2; my $index_read1 = -2; my $index_read2 = -2; my $index_found = 0; if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) { # we have at least 1 good read # find closest matching amplicon for read 1 my $string = substr($seq1, 0, $$refFrags{min}); if(defined $cache{"$string.1"}) { $cacheHits++; { lock(%cache); @index1 = @{ $cache{"$string.1"} } }; } else { @index1 = amatch($string, \@{ $$refFrags{seqU} }); { lock(%cache); $cache{"$string.1"} = &share( [] ); @{ $cache{"$string.1"} } = @index1; } } # find closest matching amplicon for read 2 $string = substr($seq2, -$$refFrags{min}); if(defined $cache{"$string.2"}) { $cacheHits++; { lock(%cache); @index2 = @{ $cache{"$string.2"} } }; } else { @index2 = amatch($string, \@{ $$refFrags{seqD} }); { lock(%cache); $cache{"$string.2"} = &share( [] ); @{ $cache{"$string.2"} } = @index2; } } if($#index1 > 0 || $#index2 > 0) { # if one read maps to multiple amplicons and the other read matches to just one of them use that my %in_array2 = map { $_ => 1 } @index2; my @common = grep { $in_array2{$_} } @index1; if($#common == 0) { @index1 = @common; @index2 = @common; } elsif($#common > 0) { if($#index1 > 0) { # matched to multiple primers so try and use whole read my @refs; foreach(@index1) { if($$refFrags{strand}[$_] eq "F") { push @refs, $$refSeq{${ $$refFrags{reg} }[$_]}; } else { push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]}); } } my @index1b = amatch($seq1, \@refs); if($#index1b >= 0) { @index1 = @index1[@index1b]; } } if($#index2 > 0) { # matched to multiple primers so try and use whole read my @refs; foreach(@index2) { if($$refFrags{strand}[$_] eq "F") { push @refs, $$refSeq{${ $$refFrags{reg} }[$_]}; } else { push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]}); } } my @index2b = amatch($seq2, \@refs); if($#index2b >= 0) { @index2 = @index2[@index2b]; } } # if one read still maps to multiple amplicons and the other read matches to just one of them use that %in_array2 = map { $_ => 1 } @index2; @common = grep { $in_array2{$_} } @index1; if($#common == 0) { @index1 = @common; @index2 = @common; } } } # if read 1 and read 2 match to the same amplicon assign it if($#index1 == 0 && @index1 ~~ @index2 ) { $index_read1 = $index1[0]; $index_read2 = $index2[0]; } else { $index_read1 = -1; $index_read2 = -1; } # if any read after trimming is less than half the original length discard if(length($trimmed1) < length($seq1)/2) { $index_read1 = -2; } if(length($trimmed2) < length($seq2)/2) { $index_read2 = -2; } } if($index_read1 > -1 || $index_read2 > -1) { my $conc = 0; my %align; # at least 1 read matched to an amplicon if($index_read1 > -1) { $contig = $$refFrags{reg}[$index_read1]; $readRef = $$refSeq{$contig}; $strand = $$refFrags{strand}[$index_read1]; } elsif($index_read2 > -1) { $contig = $$refFrags{reg}[$index_read2]; $readRef = $$refSeq{$contig}; $strand = $$refFrags{strand}[$index_read2]; } if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) { # reads must overlap so can use concensus $conc = 1; } else { my $substr1; my $substr2; for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) { $substr1 = substr($trimmed1, -$length); $substr2 = substr($trimmed2, 0, $length); if($substr1 eq $substr2) { $conc = 1; last; } else { # calculate max diff allowed if(!defined $maxdiff{$length}) { { lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); } } my $diff = levenshtein($substr1, $substr2 ); if($diff < $maxdiff{$length}) { $conc = 1; last; } } } } if($conc) { # remove any trailing Ns $trimmed1 = $seq1; $trimmed2 = $seq2; $trimmed1 =~ s/N*$//; $trimmed2 =~ s/^N*//; $qtrim1 = $qual1; $qtrim2 = $qual2; $qtrim1 = substr($qtrim1, 0, length($trimmed1)); $qtrim2 = substr($qtrim2 , -length($trimmed2)); } if($strand eq "R") { my $trimmed1_tmp = revcompl($trimmed1); my $qtrim1_tmp = reverse($qtrim1); $trimmed1 = revcompl($trimmed2); $qtrim1 = reverse($qtrim2); $trimmed2 = $trimmed1_tmp; $qtrim2 = $qtrim1_tmp; my $index_read_tmp = $index_read1; $index_read1 = $index_read2; $index_read2 = $index_read_tmp; } if($conc) { my $align; my $search1 = $trimmed1; $search1 =~ s/N/./g; my $search2 = $trimmed2; $search2 =~ s/N/./g; if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) { # the read matches exactly to the reference $align{Ref1} = $$refSeq{$contig}; $align{Read1} = $$refSeq{$contig}; $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/; $align{Read1} =~ s/[^X]/-/g; $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ }); $align{Read2} = $$refSeq{$contig}; $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/; $align{Read2} =~ s/[^X]/-/g; $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ }); # add aligned reads to queue my @array; { lock($Aligned); # Keep other threads from changing the queue's contents @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc); $Aligned->enqueue(\@array); } # add alignment to cache { lock(%cacheAlign); $cacheAlign{Conc}{$name} = &share( [] ); @{ $cacheAlign{Conc}{$name} } = @array; } } elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) { my @array; { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } }; $array[0] = $read1; $array[3] = $qtrim1; $array[6] = $qtrim2; { lock($cacheAlignHits); $cacheAlignHits++ }; # add aligned reads to queue { lock($Aligned); # Keep other threads from changing the queue's contents $Aligned->enqueue(\@array); } } else { # align reads using NW2 { lock($ToAlign); # Keep other threads from changing the queue's contents my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc); #my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2"); $ToAlign->enqueue(\@array); } } } if(!$conc && $index_read1 > -1) { my $search1 = $trimmed1; $search1 =~ s/N/./g; if($$refSeq{$contig} =~ /$search1/) { # the read matches exactly to the reference $align{Ref1} = $$refSeq{$contig}; $align{Read1} = $$refSeq{$contig}; $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/; $align{Read1} =~ s/[^X]/-/g; $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ }); # add aligned reads to queue { lock($Aligned); # Keep other threads from changing the queue's contents my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc); $Aligned->enqueue(\@array); } } elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) { my @array; { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } }; $array[0] = $read1; $array[3] = $qtrim1; { lock($cacheAlignHits); $cacheAlignHits++ }; # add aligned reads to queue { lock($Aligned); # Keep other threads from changing the queue's contents $Aligned->enqueue(\@array); } } else { # align read using NW2 { lock($ToAlign); # Keep other threads from changing the queue's contents my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc); #my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1); $ToAlign->enqueue(\@array); } } } if(!$conc && $index_read2 > -1) { my $search2 = $trimmed2; $search2 =~ s/N/./g; if($$refSeq{$contig} =~ /$search2/) { # the read matches exactly to the reference $align{Ref2} = $$refSeq{$contig}; $align{Read2} = $$refSeq{$contig}; $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/; $align{Read2} =~ s/[^X]/-/g; $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ }); # add aligned reads to queue { lock($Aligned); # Keep other threads from changing the queue's contents my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc); $Aligned->enqueue(\@array); } } elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) { my @array; { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } }; $array[0] = $read2; $array[6] = $qtrim2; { lock($cacheAlignHits); $cacheAlignHits++ }; # add aligned reads to queue { lock($Aligned); # Keep other threads from changing the queue's contents $Aligned->enqueue(\@array); } } else { # align read using NW2 { lock($ToAlign); # Keep other threads from changing the queue's contents my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc); #my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2); $ToAlign->enqueue(\@array); } } } } if($index_read1 < 0) { # discard read 1 # send to writebamfile thread { lock($WriteBam); # Keep other threads from changing the queue's contents $WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n"); } { lock(%stats); $stats{PoorQual1}++ }; } if($index_read2 < 0) { # discard read 2 # send to writebamfile thread { lock($WriteBam); # Keep other threads from changing the queue's contents $WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n"); } { lock(%stats); $stats{PoorQual2}++ }; } if($index_read1 == -1 && $index_read2 == -1) { { lock(%stats); $stats{Amb}++ }; } elsif($index_read1 == -2 && $index_read2 == -2) { { lock(%stats); $stats{PoorQualMates}++ }; } } # grab the next reads in queue { lock($FindContig); # Keep other threads from changing the queue's contents $readArrayRef = $FindContig->dequeue(); } } print "Closing Find Amplicon Thread\n"; return; } sub writebamfile { my $bamfile = shift; my $WriteBam = shift; # write samfile header open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n"; foreach my $key (@OrderedContigs) { print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n"; } print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n"; my $read; { lock($WriteBam); # Keep other threads from changing the queue's contents $read = $WriteBam->dequeue(); } while(defined $read) { # write read to bamfile print PIPE_TO_SAMTOOLS $read; # grab next item in queue { lock($WriteBam); # Keep other threads from changing the queue's contents $read = $WriteBam->dequeue(); } } print "Closing Bam Writer\n"; close PIPE_TO_SAMTOOLS or die "pipe could not be closed"; return; } sub thread_process_align { my $Aligned = shift; my $refFrags = shift; my $refSeq = shift; my $Samfile = shift; my $WriteBam = shift; my $readArrayRef; { lock($Aligned); # Keep other threads from changing the queue's contents $readArrayRef = $Aligned->dequeue(); } while(defined $readArrayRef) { # grab the alignment my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef }; # grab primer lengths my $PU = $$refFrags{primers}{$contig}{U}{F}{length}; my $PD = $$refFrags{primers}{$contig}{D}{F}{length}; my %readStats; my $orig_read1 = $read1; my $orig_read2 = $read2; my $orig_qtrim1 = $qtrim1; my $orig_qtrim2 = $qtrim2; $orig_read1 =~ s/-//g if defined $orig_read1; $orig_read2 =~ s/-//g if defined $orig_read2; if(defined $OPTIONS{d}) { # debug info print "Alignment With Primers\n"; print "$ref1\n$read1\n" if defined $read1; print "$ref2\n$read2\n" if defined $read2; print "\nFinal Alignment:\n"; } # calculate where primers begin and end if(defined $read1) { if($strand eq 'F') { $readStats{$contig}{Read1} = 1; } else { $readStats{$contig}{Read2} = 1; } # determine the primer locations my $offset = 0; for(my $occurance = 0; $occurance < $PU; $occurance++) { substr($ref1, $offset) =~ /^.*?([^-])/; $offset += $-[1] + 1; } my $length = 0; for(my $occurance = 0; $occurance < $PD; $occurance++) { substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/; $length = length($ref1) - $-[1]; } $length = length($ref1) - $length - $offset; # remove primers and adjust quality scores $ref1 = substr($ref1, $offset, $length); my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// ); $read1 = substr($read1, $offset, $length); $length = ( $read1 =~ tr/ACTGN// ); $qtrim1 = substr($qtrim1, $qualOffset, $length); } if(defined $read2) { if($strand eq 'F') { $readStats{$contig}{Read2} = 1; } else { $readStats{$contig}{Read1} = 1; } # determine the primer locations my $offset = 0; for(my $occurance = 0; $occurance < $PU; $occurance++) { substr($ref2, $offset) =~ /^.*?([^-])/; $offset += $-[1] + 1; } my $length = 0; for(my $occurance = 0; $occurance < $PD; $occurance++) { substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/; $length = length($ref2) - $-[1]; } $length = length($ref2) - $length - $offset; # remove primers and adjust quality scores $ref2 = substr($ref2, $offset, $length); my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// ); $read2 = substr($read2, $offset, $length); $length = ( $read2 =~ tr/ACTGN// ); $qtrim2 = substr($qtrim2, $qualOffset, $length); } # generate a concensus sequence if we have both read1 and read2 and they overlap if($conc) { # the following are 0 based indexes my $R1qual = 0; # index for sanger qual read 1 my $R2qual = 0; # index for sanger qual read 2 my $aindex = 0; # index for alignment my $q1; my $q2; my $concensus; my $concqual; $readStats{$contig}{ConcReads} = 1; $readStats{$contig}{ConcCorrectBases} = 0; $readStats{$contig}{ConcErrorsBases} = 0; while($aindex < length($ref1)) { # base1 is current base from read 1 # base2 is current base from read 2 my $base1 = substr($read1, $aindex, 1); my $base2 = substr($read2, $aindex, 1); # grab the next quality score if($R1qual < length($qtrim1)) { $q1 = substr($qtrim1, $R1qual, 1); } if($R2qual < length($qtrim2)) { $q2 = substr($qtrim2, $R2qual, 1); } # if we have a base in either if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) { if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') { # we are only in read 1 or read 2 is a N $concensus .= $base1; $concqual .= $q1; $R1qual++; if($base2 eq 'N') { $R2qual++; } } elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') { # we are only in read 2 or read 1 is a N $concensus .= $base2; $concqual .= $q2; $R2qual++; if($base1 eq 'N') { $R1qual++; } } else { # we are in the overlap region my $maxqual = sangerMax($q1, $q2); if($base1 eq $base2) { # both reads agree $readStats{$contig}{ConcCorrectBases}++; $concensus .= $base1; $concqual .= $maxqual; } elsif($q1 eq $q2) { # reads have different bases but the same score $readStats{$contig}{ConcErrorsBases}++; $concensus .= 'N'; $concqual .= '#'; } elsif($q1 eq $maxqual) { # reads have different bases but read 1 has the best score $concensus .= $base1; $concqual .= $maxqual; $readStats{$contig}{ConcErrorsBases}++; } else { # reads have different bases but read 2 has the best score $concensus .= $base2; $concqual .= $maxqual; $readStats{$contig}{ConcErrorsBases}++; } $R1qual++; $R2qual++; } } elsif($base1 eq 'N' || $base2 eq 'N') { if($base2 eq '-') { # one is a deletion the other is an N $concensus .= $base1; $concqual .= $q1; $R1qual++; } elsif($base1 eq '-') { # one is a deletion the other is an N $concensus .= $base2; $concqual .= $q2; $R2qual++; } elsif($base1 eq 'N' && $base2 eq 'N') { # both reads are an N $concensus .= 'N'; $concqual .= sangerMax($q1, $q2); $R1qual++; $R2qual++; } } else { $concensus .= '-'; } $aindex++; } $read1 = $concensus; $qtrim1 = $concqual; $read2 = undef; $qtrim2 = undef; } if(defined $read1) { # remove any trailing - and send to samfile_line $read1 =~ s/^-*?([^-].*)/$1/; $ref1 = substr($ref1, -length($read1)); $read1 =~ s/(.*[^-])-*?$/$1/; $ref1 = substr($ref1, 0, length($read1)); my $tmp = $ref1; $tmp =~ s/-//g; my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1; # generate sam file line { lock($Samfile); # Keep other threads from changing the queue's contents my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats); $Samfile->enqueue(\@array); } } if(defined $read2) { # remove any trailing - and send to samfile_line $read2 =~ s/^-*?([^-].*)/$1/; $ref2 = substr($ref2, -length($read2)); $read2 =~ s/(.*[^-])-*?$/$1/; $ref2 = substr($ref2, 0, length($read2)); my $tmp = $ref2; $tmp =~ s/-//g; my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1; # generate sam file line { lock($Samfile); # Keep other threads from changing the queue's contents my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats); $Samfile->enqueue(\@array); } } if(defined $OPTIONS{d}) { # debug info print "$ref1\n$read1\n" if defined $read1; print "$ref2\n$read2\n" if defined $read2; print "$qtrim1\n" if defined $read1; print "$qtrim2\n" if defined $read2; print "\n\n"; } # grab next item in queue { lock($Aligned); # Keep other threads from changing the queue's contents $readArrayRef = $Aligned->dequeue(); } } print "Closing Alignment Processor\n"; return; } # function to take reverse compliment sub revcompl { my $string = shift @_; my $revcomp = reverse($string); $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/; return $revcomp; } # function to convert sanger fastq quality scores to phred scores sub sanger2phred { my $sang = shift @_; my @sanger = split("", $sang); my @phred; foreach my $q (@sanger) { my $Q = ord($q) - 33; push @phred, $Q; } return @phred; } # function to return the largest sanger quality score sub sangerMax { my ($s1, $s2) = @_; my $Q1 = ord($s1) - 33; my $Q2 = ord($s2) - 33; if($Q1 > $Q2) { return $s1; } else { return $s2; } } # function to read a fasta file as a hash sub read_fasta_file { my ($fasta_file, $seqs) = @_; my $header; my $seq = ""; open (IN, $fasta_file) or die "can't open $fasta_file: $!\n"; while (my $line = <IN>) { chomp $line; if (grep />/, $line) { if ($seq ne "") { $seqs->{$header} = $seq; } $header = $line; $header =~ s/^>//; # remove ">" $header =~ s/\s+$//; # remove trailing whitespace $seq = ""; # clear out old sequence } else { $line =~ s/\s+//g; # remove whitespace $seq .= $line; # add sequence } } close IN; # finished with file if ($seq ne "") { # handle last sequence $seqs->{$header} = $seq; } return; } # minimal element of a list # sub min { my @list = @{$_[0]}; my $min = $list[0]; foreach my $i (@list) { $min = $i if ($i < $min); } return $min; } # maximal element of a list # sub max { my @list = @{$_[0]}; my $max = $list[0]; foreach my $i (@list) { $max = $i if ($i > $max); } return $max; } # trim sequences using the BWA algorithm sub trim { my ($s, $q, $threshold, $read) = @_; # read sequence and quality scores if($read == 2) { $q = reverse($q); $s = reverse($s); } my @array = sanger2phred($q); my $length = scalar @array; # only calculate if quality fails near end if( $array[$#array] >= $threshold ){ if($read == 2) { $q = reverse($q); $s = reverse($s); } return ($s, $q); } # run bwa equation my @arg; for( my $i = 0; $i < $length - 1; $i++ ){ my $x = $i + 1; for( my $j = $x; $j < $length; $j++ ){ $arg[$x] += $threshold - $array[$j]; } } # find number of 5' bases to retain my $index = 0; my $maxval = 0; for ( 1 .. $#arg ){ if ( $maxval < $arg[$_] ){ $index = $_; $maxval = $arg[$_]; } } # trim reads $s = substr($s,0,$index); $q = substr($q,0,$index); if($read == 2) { $s = reverse($s); $q = reverse($q); } # return seq and quality return ($s, $q); } sub samfile_line { # grab the variables my ($Samfile, $WriteBam, $refSeq) = @_; my %SamStats; my %minLength; my %maxLength; my $readArrayRef; { lock($Samfile); # Keep other threads from changing the queue's contents $readArrayRef = $Samfile->dequeue(); } while(defined $readArrayRef) { my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef }; my $seq = $align; $seq =~ s/-//g; # calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1 my $sam; my $s1; my $s2; my @s1 = split("", $align); my @s2 = split("", $ref); my $indel = 0; for(my $i = 0; $i <= $#s1; $i++) { if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) { $s1 .= $s1[$i]; $s2 .= $s2[$i]; $indel = 0; if($s1[$i] eq "-" || $s2[$i] eq "-") { $indel = 1; } } } my $mq = levenshtein($s1, $s2); my $cl = length($seq); # calculate max diff allowed if(!defined $maxdiff{$cl}) { { lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); } } if($mq > $maxdiff{$cl} || $cl == 0) { { lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; } if(defined $orig_read1) { $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n"; # send to writebamfile thread { lock($WriteBam); # Keep other threads from changing the queue's contents $WriteBam->enqueue($sam); } } if(defined $orig_read2) { $read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/; $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n"; # send to writebamfile thread { lock($WriteBam); # Keep other threads from changing the queue's contents $WriteBam->enqueue($sam); } } } else { $mq = 60; # add any stats from earlier foreach my $key1 (keys %$readStats) { foreach my $key2 (keys % { $$readStats{$key1} }) { $SamStats{$key1}{$key2} += $$readStats{$key1}{$key2}; } } # keep track of quality scores $SamStats{$reg}{Bases} += $cl; $SamStats{$reg}{Quality} += $_ for sanger2phred($qual); # keep track of read lengths if(defined $minLength{$reg}) { $minLength{$reg} = min([$cl, $minLength{$reg}]); $maxLength{$reg} = max([$cl, $maxLength{$reg}]); } else { $minLength{$reg} = $cl; $maxLength{$reg} = $cl; } # keep track of strand and bases seen if($strand eq 'F') { $SamStats{$reg}{NumberForward}++; } else { $SamStats{$reg}{NumberReverse}++; } $SamStats{$reg}{TotalA} += ($seq =~ tr/A//); $SamStats{$reg}{TotalC} += ($seq =~ tr/C//); $SamStats{$reg}{TotalG} += ($seq =~ tr/G//); $SamStats{$reg}{TotalT} += ($seq =~ tr/T//); $SamStats{$reg}{TotalN} += ($seq =~ tr/N//); # keep count the number of reads for each contig $SamStats{$reg}{Reads}++; # calculate strand info if($strand eq 'F') { $strand = 0; } else { $strand = 16; } # calculate cigar my $cigar = ""; my $M = 0; my $D = 0; my $I = 0; my $Subs = 0; for(my $i = 0; $i < length($align); $i++) { if(substr($align, $i, 1) eq "-") { $D++; if($M) { $cigar .= "${M}M"; $M = 0; } elsif($I) { $cigar .= "${I}I"; $I = 0; } } elsif(substr($ref, $i, 1) eq "-") { $I++; if($M) { $cigar .= "${M}M"; $M = 0; } elsif($D) { $cigar .= "${D}D"; $D = 0; } } else { $M++; if($D) { $cigar .= "${D}D"; $D = 0; } elsif($I) { $cigar .= "${I}I"; $I = 0; } $Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1); } } if($M) { $cigar .= "${M}M"; } elsif($I) { $cigar .= "${I}I"; } elsif($D) { $cigar .= "${D}D"; } # construct sam file line $sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n"; # send to writebamfile thread { lock($WriteBam); # Keep other threads from changing the queue's contents $WriteBam->enqueue($sam); } } #grab the next read { lock($Samfile); # Keep other threads from changing the queue's contents $readArrayRef = $Samfile->dequeue(); } } { # add stats from earlier lock(%stats); foreach my $key1 (keys %SamStats) { $stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]); $stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]); foreach my $key2 (keys % { $SamStats{$key1} }) { $stats{$key1}{$key2} += $SamStats{$key1}{$key2}; } } } print "Closing Sam File Line Generator\n"; return; } # perform the alignment sub nwAlign { my ($ToAlign, $Aligned, $regions) = @_; my $readArrayRef; { lock($ToAlign); # Keep other threads from changing the queue's contents $readArrayRef = $ToAlign->dequeue(); } while(defined $readArrayRef) { # we have picked a contig # check strand and take rev comp of ref if necessary my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef }; my %align; my $name; #! Parameters #! ClustalW Parameters my $matchScore = 1.9; my $missmatch_penalty = -0.4; my $gapPenOpen = -10.0; my $gapPenExt = -0.4; my $leftAndRightGapPen = true; my $debug = false; my $cons = ""; my $align1= ""; my $align2= ""; my $align3= ""; my $score = 0.0; my $singleAlign = false; #aligning 2 or 3 sequences if($conc) { #got to concatenate seqs and align $score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug); $name = "$left.$right"; print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig}; } elsif(!defined $right) { #align with left $score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug); $singleAlign = true; $name = $left; print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig}; } elsif(!defined $left) { #align with right $score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug); $singleAlign = true; $name = $right; print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig}; } # add aligned reads to queue # cache results my $type; { lock($Aligned); # Keep other threads from changing the queue's contents my @array; if($singleAlign && !defined $right) { @array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc); { lock(%cacheAlign); $cacheAlign{$contig}{R1}{$name} = &share( [] ); @{ $cacheAlign{$contig}{R1}{$name} } = @array; } } elsif($singleAlign && !defined $left ) { @array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc); { lock(%cacheAlign); $cacheAlign{$contig}{R2}{$name} = &share( [] ); @{ $cacheAlign{$contig}{R2}{$name} } = @array; } } else { @array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc); { lock(%cacheAlign); $cacheAlign{Conc}{$name} = &share( [] ); @{ $cacheAlign{Conc}{$name} } = @array; } } $Aligned->enqueue(\@array); } #grab the next read { lock($ToAlign); # Keep other threads from changing the queue's contents $readArrayRef = $ToAlign->dequeue(); } } # clean up print "Closing NW2 Aligner\n"; return; } # function to calculat the minimum edit distance threshold (taken from bwa) sub maxdiff { my ($l, $err, $thres) = @_; my $lambda = exp(-$l*$err); my $y = 1.0; my $x = 1; my $sum = $lambda; for(my $k = 1; $k < 1000; $k++) { $y *= $l * $err; $x *= $k; $sum += $lambda * $y / $x; if(1.0 - $sum < $thres) { return $k; } } return 2; } # using approx matching return the positions in $list of those most closely matching $string sub amatch { my ($string, $list) = @_; my @d = adistr($string, @$list); my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list; return if $d[$d_index[0]] > 0.25; my @pos = ($d_index[0]); my $i = 1; while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) { push @pos, $d_index[$i]; $i++; } return @pos; } # a sorting algorithm to sort the chromosomes sub sortChrom { my ($achr, $astart, $aend) = split(":|-", $a); my ($bchr, $bstart, $bend) = split(":|-", $b); $achr =~ s/X/23/; $achr =~ s/Y/24/; $achr =~ s/MT?/25/; $bchr =~ s/X/23/; $bchr =~ s/Y/24/; $bchr =~ s/MT?/25/; if($achr < $bchr) { return -1; } elsif($bchr < $achr) { return 1; } elsif($astart < $bstart) { return -1; } elsif($bstart < $astart) { return 1; } elsif($aend < $bend) { return -1; } elsif($bend < $aend) { return 1; } else { return 0; } } # pass a code reference sub modify { my($text, $code) = @_; $code->($text); return $text; } sub split_region { my ($chr, $start, $end) = split(":|-", shift); $end = $start unless defined $end; return($chr, $start, $end); } sub error_rate { my ($x, $y) = @_; my @x = split("", $x); my @y = split("", $y); my $s = 0; my $l = 0; for(my $i = 0; $i <= $#x; $i++) { if($x[$i] ne "-" && $y[$i] ne "-") { $s++ if $x[$i] ne $y[$i]; $l++; } } return($s / $l); }