Mercurial > repos > lijing > bubio
diff MITObim_1.8.pl @ 6:a03d23c6ab95 draft
MitoBim and interleave
author | lijing |
---|---|
date | Thu, 02 Nov 2017 12:44:55 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MITObim_1.8.pl Thu Nov 02 12:44:55 2017 -0400 @@ -0,0 +1,1009 @@ +#! /usr/bin/perl +# +# MITObim - mitochondrial baiting and iterative mapping +# wrapper script version 1.8 +# Author: Christoph Hahn, 2012-2016 +# christoph.hahn@nhm.uio.no + +use strict; +use warnings; +use Getopt::Long; +use Cwd qw(abs_path); +use File::Copy; +use List::Util qw< min max >; +use POSIX qw(strftime); +use POSIX qw(ceil); +use File::Path 'rmtree'; + +my $startiteration = 1; +my $enditeration = 1; + +my ($quick, $noshow, $help, $strainname, $paired, $mode, $refname, $readpool, $maf, $proofreading, $readlength, $insertsize, $MM, $trim, $trimoverhang, $k_bait, $clean, $clean_interval, $min_contig_cov) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 31, 0, 2, 0); +my ($miramode, $key, $val, $exit, $current_contiglength, $current_number_of_reads, $current_number_of_contigs); +my $splitting = 0; +my $platform = "solexa"; +my $platform_settings;# = "SOLEXA"; +my $shme = ""; +my $trim_off = ""; +my $redirect_temp = ""; +my $NFS_warn_only = ""; +my ($mirapath, $mira, $miraconvert, $mirabait) = ("", "mira", "miraconvert", "mirabait"); +my (@reads, @output, @path, @current_contig_stats, @contiglengths, @number_of_reads, @number_of_contigs); +my %hash; +my $PROGRAM = "\nMITObim - mitochondrial baiting and iterative mapping\n"; +my $VERSION = "version 1.8\n"; +my $AUTHOR = "author: Christoph Hahn, (c) 2012-2016\n"; +my $cite = "\nif you found MITObim useful, please cite: +Hahn C, Bachmann L and Chevreux B. (2013) Reconstructing mitochondrial genomes directly from genomic next-generation sequencing reads - +a baiting and iterative mapping approach. Nucl. Acids Res. 41(13):e129. doi: 10.1093/nar/gkt371\n\n"; +my $USAGE = "\nusage: ./MITObim.pl <parameters> + \nparameters: + -start <int> iteration to start with, default=1 + -end <int> iteration to end with, default=1 + -sample <string> sampleID as used in initial MIRA assembly + -ref <string> referenceID as used in initial MIRA assembly + -readpool <FILE> readpool in fastq format (*.gz is also allowed) + -maf <FILE> maf file from previous MIRA assembly + \noptional: + --quick <FILE> starts process with initial baiting using provided fasta reference + --kbait <int> set kmer for baiting stringency (default: 31) + --platform specify sequencing platform (default: 'solexa'; other options: 'iontor', '454', 'pacbio') + --denovo runs MIRA in denovo mode (default: mapping) + --pair finds pairs after baiting (relies on /1 and /2 header convention for read pairs) (default: no) + --verbose show detailed output of MIRA modules (default: no) + --split split reference at positions with more than 5N (default: no) + --help shows this helpful information + --clean retain only the last 2 iteration directories (default: no) + --trimreads trim data (default: no; we recommend to trim beforehand and feed MITObim with pre trimmed data) + --trimoverhang trim overhang up- and downstream of reference (default: no) + --missmatch <int> number of allowed missmatches in mapping - only for illumina data (default: 15% of avg. read length) + --min_cov <int> minimum average coverage of contigs to be retained (default: off) + --mirapath <string> full path to MIRA binaries (only needed if MIRA is not in PATH) + --redirect_tmp redirect temporary output to this location (useful in case you are running MITObim on an NFS mount) + --NFS_warn_only allow MIRA to run on NFS mount without aborting - warn only (expert option - see MIRA documentation 'check_nfs') + \nexamples: + ./MITObim.pl -start 1 -end 5 -sample StrainX -ref reference-mt -readpool illumina_readpool.fastq -maf initial_assembly.maf + ./MITObim.pl -end 10 --quick reference.fasta -sample StrainY -ref reference-mt -readpool illumina_readpool.fastq\n\n"; +# --proofread applies proofreading (atm only to be used if starting the process from a single short seed reference) +# --readlength <int> read length of illumina library, default=150, relevant only for proofreading +# --insert <int> insert size of illumina library, default=300, relevant only for proofreading + +my $command = $0; +for (@ARGV){ + $command .= " $_"; +} + +GetOptions ( "start=i" => \$startiteration, + "end=i" => \$enditeration, + "kbait=i" => \$k_bait, + "quick=s" => \$quick, + "verbose!" => \$noshow, + "sample=s" => \$strainname, + "paired" => \$paired, + "denovo" => \$mode, + "ref=s" => \$refname, + "readpool=s" => \$readpool, + "help!" => \$help, + "clean!" => \$clean, + "mirapath=s" => \$mirapath, + "maf=s" => \$maf, +# "proofreading!" => \$proofreading, + "trimreads!" => \$trim, + "trimoverhang!" => \$trimoverhang, + "missmatch=i" => \$MM, + "platform=s" => \$platform, +# "readlength=i" => \$readlength, +# "insertsize=i" => \$insertsize, + "split!" => \$splitting, + "min_cov=i" => \$min_contig_cov, + "redirect_tmp=s" => \$redirect_temp, + "NFS_warn_only!" => \$NFS_warn_only) or die "Incorrect usage!\n$USAGE"; + + +print $PROGRAM; +print $VERSION; +print $AUTHOR; + +print $USAGE and exit if $help; +print $USAGE and exit if !$readpool; +unless ($quick){ + print $USAGE and exit if !$maf; +} +print $USAGE and exit if !$refname; + +$readpool=abs_path($readpool); +unless (-e $readpool){ + print "Cant find the readpool. Is the path correct?\n"; +} +if ($maf){ + $maf=abs_path($maf); + unless (-e $maf){ + print "Cant find *.maf file. Is the path correct?\n"; + } +} + +($platform, $platform_settings) = &set_platform($platform); + +if ($quick){ + $quick=abs_path($quick); + unless (-e $quick){ + print "quick option selected but is the path to the file correct?\n"; + exit 1; + } + print "\nquick option selected! -maf option will be ignored (if given)\n"; + $maf = 0; + $startiteration = 0; +} +print $USAGE and exit if ($startiteration > $enditeration); +unless (((-e $maf)||($quick)) && (-e $readpool)){ + print "\nAre readpool AND maf/reference files there?\n"; + exit 1; +} +if ($mirapath){ + if (-e "$mirapath/mira"){ + print "found executables in the path specified by the user - good!\n"; + $mira = "$mirapath/mira"; + $miraconvert = "$mirapath/miraconvert"; + $mirabait = "$mirapath/mirabait"; + }else{ + print "somethings wrong with the path to mira.\n"; + exit 1; + } +} + + +##if not given otherwise, readlength and insertsize are set to default. automatic readlength and insertsize detection will be implemented in time. +#if (!$readlength){ +# $readlength = 150; +#} +#if (!$insertsize){ +# $insertsize = 300; +#} +if (!$mode){ + $miramode = "mapping"; +}else { + $miramode = "denovo"; +} + +if (!$trim){ + $trim_off = "\"--noclipping -CL:pec=no\""; +} + +my $out_command = $command; $out_command =~ s/\S+galaxy\///g; +print "\nFull command run:\n$out_command\n"; +print "\nAll paramters seem to make sense:\n"; +print "startiteration: $startiteration\n"; +print "enditeration: $enditeration\n"; +print "sample: $strainname\n"; +print "refname: $refname\n"; +my $out_readpool = $readpool; $out_readpool =~ s/.*galaxy\///; +print "readpool: $out_readpool\n"; +print "maf: $maf\n"; +my $out_quick = $quick; $out_quick =~ s/.*galaxy\///; +print "quick: $out_quick\n"; +print "paired: $paired (off=0, on=1)\n"; +print "assembly mode: $mode (mapping=0, denovo=1)\n"; +print "verbose: $noshow (off=0, on=1)\n"; +print "split: $splitting (off=0, on=1)\n"; +print "minimum avg. coverage: $min_contig_cov (off=0)\n"; +print "clean: $clean (off=0, on=1)\n"; +print "trim reads: $trim (off=0, on=1)\n"; +print "trim overhang: $trimoverhang (no=0, yes=1)\n"; +print "platform: $platform\n"; +print "kmer baiting: $k_bait\n"; +#print "proofread: $proofreading (off=0, on=1)\n"; + +if ($proofreading){ + print "\nproofreading is not yet enabled in this beta version of MITObim 1.8 - it is currently being optimized for MIRA4. \nplease refer to MITObim 1.6 if you wish to use this option. Sorry for the inconvenience!\n\n"; + exit; + + print "proofreading: on\n"; + print "readlength: $readlength\n"; + print "insertsize: $insertsize\n"; + $MM = 0; + print "number of allowed missmatches in proofreading assembly: $MM\n"; + $shme = "-AL:shme=$MM"; +}elsif ((!$proofreading) && (!$mode) && ($platform eq "solexa")){ + if ($MM == -1){ + print "number of missmatches in mapping assembly: default (15% of average read length loaded)\n"; + $shme = ""; + }else { + print "number of missmatches in mapping assembly: $MM\n"; + $shme = "-AL:shme=$MM"; + } + print "proofreading: off\n"; +} + +if (!$trimoverhang){ + $trimoverhang = "-SB:tor=no"; +}else { + $trimoverhang = "-SB:tor=yes"; +} + +if ($redirect_temp) { + $redirect_temp="-DI:trt=$redirect_temp" +} + +print "\nStarting MITObim \n"; + +my @iteration = ($startiteration .. $enditeration); +foreach (@iteration){ + chomp; + my $currentiteration = $_; + mkdir "iteration$currentiteration" or die "MITObim will not overwrite an existing directory: iteration$currentiteration\n"; + chdir "iteration$currentiteration" or die $!; + print "\n==============\n"; + print " ITERATION $currentiteration\n"; + print "==============\n"; + print strftime("%b %e %H:%M:%S", localtime) . "\n\n"; +# if (($proofreading) && ($currentiteration != 0)){ + if ($proofreading){ + $shme = "-AL:shme=0"; + } + + if ($maf){ + print "\nrecover backbone by running miraconvert on maf file\n\n"; + if ($currentiteration<2){ + @output= qx($miraconvert -f maf -t fasta -A "$platform_settings -CO:fnicpst=yes" $maf tmp); + }else{ + @output= qx($miraconvert -f maf -t fasta -y $min_contig_cov -A "$platform_settings -CO:fnicpst=yes" $maf tmp); + } + $exit = $? >> 8; + unless (!$noshow){ + print "@output\n"; + } + unless ($exit == 0){ + if (!$noshow){ + print "@output\n"; + } + print "\nmiraconvert seems to have failed - see detailed output above\n"; + exit; + } +# if ( ((($mode) && ($currentiteration > 1)) && (!$quick)) || ((($mode) && ($currentiteration >= 1)) && ($quick)) ){ +# open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!"; +# }else{ + open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!\nIs the sampleID identical as in the initial MIRA assembly?\n"; +# } + open(FH2,">backbone_it$currentiteration\_initial_$refname.fna") or die $!; + while (<FH1>) { + $_ =~ s/x/N/g; + print FH2 $_; + } + close(FH1); + close(FH2); + unlink glob ("tmp*"); + } + MIRABAIT: + unless ($maf){ + print "\nquick option baits reads from provided reference in iteration 0\n"; + copy("$quick", "backbone_it$currentiteration\_initial_$refname.fna") or die "copy failed: $!"; + } + &check_ref_length("backbone_it$currentiteration\_initial_$refname.fna","temp_baitfile.fasta",29800,$k_bait); + + if ($splitting){ + &remove_unmapped_contigs("temp_baitfile_backup.fasta","temp_baitfile.fasta"); + if (-e "temp_baitfile_backup.fasta"){ #this file has only been created if splitting contigs was necessary + copy "temp_baitfile.fasta","backbone_it$currentiteration\_initial_$refname.fna"; + } + } + print "\nfishing readpool using mirabait (k = $k_bait)\n\n"; + +# @output = (`mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-$refname\_in.$platform 2>&1`); + @output = qx($mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-readpool-it$currentiteration); + $exit = $? >> 8; + unless (!$noshow){ + print "@output\n"; + } + if (!-s "$strainname-readpool-it$currentiteration.fastq"){ + print "\nyour readpool does not contain any reads with reasonable match (k = $k_bait) to your reference - Maybe you ll want to different settings or even a different reference?\n\n"; + exit; + } + unless ($exit == 0){ + if (!$noshow){ + print "@output\n"; + } + print "\nmirabait seems to have failed - see detailed output above\n"; + exit; + } + + FINDPAIRS: + + unless (!$paired){ + print "\nfinding pairs for baited reads\n\n"; + copy ("$strainname-readpool-it$currentiteration.fastq", "$strainname-readpool-it$currentiteration-se.fastq") or die "copy failed: $!"; + open(FH1,"<$strainname-readpool-it$currentiteration.fastq") or die $!; + open(FH2,">list"); + my $index=1; + while (<FH1>) { + if ($index % 8 ==1 || $index % 8 ==5) { + chomp; + $_ =~ s/@//g; + ($key, $val) = split /\//; + $hash{$key} .= exists $hash{$key} ? ",$val" : $val; + } + $index++; + } + + for (keys %hash){ + $_ =~ s/$/\/1/g; + print FH2 "$_\n"; + $_ =~ s/1$/2/g; + print FH2 "$_\n"; + } + close(FH1); + close(FH2); +# @output = (`miraconvert -f fastq -t fastq -n list $readpool $strainname-$refname\_in.$platform 2>&1`); + @output = qx($miraconvert -f fastq -t fastq -n list $readpool $strainname-readpool-it$currentiteration ); + $exit = $? >> 8; + unless (!$noshow){ + print "@output\n"; + } + unless ($exit == 0){ + if (!$noshow){ + print "@output\n"; + } + print "\nmiraconvert seems to have failed - see detailed output above\n"; + exit; + } + } + unlink("list"); + + MIRA: + print "\nrunning $miramode assembly using MIRA\n\n"; + &create_manifest($currentiteration,$strainname,$refname,$miramode,$trim_off,$platform_settings,$shme,$paired,$trimoverhang,"$strainname-readpool-it$currentiteration.fastq","backbone_it$currentiteration\_initial_$refname.fna", $redirect_temp, $NFS_warn_only); + @output = qx($mira manifest.conf ); + + $exit = $? >> 8; + unless (!$noshow){ + print "@output\n"; + } + unless ($exit == 0){ + if (!$noshow){ + print "@output\n"; + } + print "\nMIRA seems to have failed - see detailed output above\n"; + exit; + } + + if ($redirect_temp) { + my $tmp_link = readlink("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp"); + rmtree ("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp") or die $!; + mkdir "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/" or die $1; + my @files = glob("$tmp_link/*"); + for (@files) { + move ("$_", "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/") or die $!; + } + rmtree ("$tmp_link") or die $!; + } + + @path = abs_path; + push (@path, "/$strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out.maf"); + $maf = join("",@path); + unless (-e $maf){ + print "maf file is not there \n"; + exit; + } + +# $current_contiglength = &get_contig_length("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt"); +# $current_number_of_reads = (&get_number_of_reads("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt") - 1); + + @current_contig_stats = &get_contig_stats("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt"); + if (((scalar @current_contig_stats > 3) || ($current_contig_stats[0] > 1)) && ($proofreading)) { + print "assembly consists of more than one contigs - this is atm not permitted in proofreading mode. Sorry!\n\n"; + exit 1; + } + + PROOFREAD: +# if (($proofreading) && ($currentiteration >= 1)){ + if ($proofreading){ + print "proofreading option is currently disabled in this beta version of MITObim 1.7 - sorry for the inconvenience!\n\n"; + exit; + } + + $current_number_of_contigs = shift @current_contig_stats; + $current_number_of_reads = shift @current_contig_stats; + if (!$mode){ #in mapping assemblies the reference is counted as one read + $current_number_of_reads -= $current_number_of_contigs; + } + push (@number_of_contigs, $current_number_of_contigs); + push (@number_of_reads, $current_number_of_reads); + print "readpool contains $current_number_of_reads reads\n"; + print "assembly contains $current_number_of_contigs contig(s)\n"; + if (scalar @current_contig_stats == 1){ + print "contig length: $current_contig_stats[0]\n"; + }elsif (scalar @current_contig_stats == 3){ + print "min contig length: ".$current_contig_stats[0]." bp\nmax contig length: ".$current_contig_stats[1]." bp\navg contig length: ".sprintf("%.0f", $current_contig_stats[2])." bp\n"; + print "find details on individual contigs in: ". abs_path . "/$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt\n"; +# printf("%1f",$current_contig_stats[2])."\n"; + }else { + print "somethings wrong with your contig stats. Sorry!\n"; + exit 1; + } + + if ($clean){ + &clean($clean_interval, $currentiteration); + } +# my $path=abs_path; print "Now at $path\n"; + system("cp $strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out_AllStrains.unpadded.fasta ../finaloutput.fasta"); + system("sed -i 's/_bb.*/_mitobim_out_$currentiteration\_iterations/' ../finaloutput.fasta"); + if ($number_of_reads[-2]){ + if ($number_of_reads[-2] >= $number_of_reads[-1]){ + print "\nMITObim has reached a stationary read number after $currentiteration iterations!!\n$cite"; + print strftime("%b %e %H:%M:%S", localtime) . "\n\n"; + exit; + } + } + chdir ".." or die "Failed to go to parent directory: $!"; +} +print "\nsuccessfully completed $enditeration iterations with MITObim! " . strftime("%b %e %H:%M:%S", localtime) . "\n$cite"; + +# +# +###SUBROUTINES +# +# +# + +sub set_platform{ + my @platforms = ('solexa', '454', 'iontorrent', 'pacbio'); + my %pf_settings = ("solexa" => "SOLEXA_SETTINGS", "454" => "454_SETTINGS", "iontorrent" => "IONTOR_SETTINGS", "pacbio" => "PCBIOHQ_SETTINGS -CO:mrpg=5"); + my $user_setting = shift; + my @user_choice; + for (@platforms){ + if ( $_ =~ /^$user_setting/ ){ + push(@user_choice, $_); + } + } + + if (scalar(@user_choice) > 1){ + print "\nYour platform choice was ambiguous: $user_setting could be: @user_choice - Please try again\n"; + exit; + + } elsif (scalar(@user_choice) == 0){ + print "\nPlease specify a valid sequencing platform - not specifying anything will default into: $platforms[0]\n\navailable options are:\n@platforms\n"; + exit; + } + + return ($user_choice[0], $pf_settings{$user_choice[0]}) +} + +sub get_contig_stats{ + my $contig = $_[0]; + my @array; + my @contiglength; + my (@readnumber, @stats); + my $readssum = 0; + open (CONTIGSTATS,"<$contig") or die $!; + while (<CONTIGSTATS>){ + unless ($_ =~ /#/){ + @array = split /\t/; + push (@contiglength, $array[1]); + push (@readnumber, $array[3]); + } + } + close (CONTIGSTATS); + if (scalar @readnumber == 1){ + push (@stats, (scalar @readnumber, $readnumber[0], $contiglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, length of contig + } + elsif (scalar @readnumber > 1){ + $readssum += $_ for @readnumber; + my $minlength = min @contiglength; + my $maxlength = max @contiglength; + my @avglength = &standard_deviation(@contiglength); + push (@stats, (scalar @readnumber, $readssum, $minlength, $maxlength, $avglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, minimal, maximal, avg length of contigs + } + return @stats; +} + + +#sub get_contig_length{ +# my $contig = $_[0]; +# my @contiglength; +# open (CONTIGSTATS,"<$contig") or die $!; +# while (<CONTIGSTATS>){ +# unless ($_ =~ /#/){ +# @contigslength = split /\t/; +# } +# } +# close (CONTIGSTATS); +# return $contiglength[1]; +#} + +#sub get_number_of_reads{ +# my $contig = $_[0]; +# my @contigstats; +# open (CONTIGSTATS,"<$contig") or die $!; +# while (<CONTIGSTATS>){ +# unless ($_ =~ /#/){ +# @contigstats = split /\t/; +# } +# } +# close (CONTIGSTATS); +# return $contigstats[3]; +#} + +sub proofread { + my $zero_MM = $_[0]; + my $readtaglist_FH = $_[1]; + my $contiglength = $_[2]; + my $elevated_cov_lower_start = $_[3]; + my $elevated_cov_lower_end = $_[4]; + my $elevated_cov_upper_start = $_[5]; + my $elevated_cov_upper_end = $_[6]; + my $lower_limit = $_[7]; +# my $lower_limit = 200; + my $lower_main_limit = $_[8]; +# my $lower_main_limit = 500; + my $upper_limit = $contiglength - $lower_limit; + my $upper_main_limit = $contiglength - $lower_main_limit; + my @readtaglist; + my $ref; + my $junk; + my $current_id; + my %count; + my @readid_good; + my @readid_bad; + my @reads; + my @readlist; + my @readlist_good; + my @readlist_proofread; + my @total_readlist; + my @singleton; + my @singletons; + my @taglist; + my @taglist_line; + my @readtaglist_lower; + my @readtaglist_upper; + my @read_ids_lower; + my @read_ids_all; + my @read_ids_upper; + my %ids =(); + my @unsorted; + my $min; + my $max; + my $tag; + + + print "\nlower limit: $lower_limit\n"; + print "upper limit: $upper_limit\n"; + print "lower main limit: $lower_main_limit\n"; + print "upper main limit: $upper_main_limit\n\n"; + open (TAGLIST,"<$readtaglist_FH") or die $!; + while (<TAGLIST>){ + push (@readtaglist, "$_"); + } + close (TAGLIST); + + for (@readtaglist){ + @taglist_line = split /\t/; + unless ($taglist_line[0] =~ /#/){ + $ref = join ("\t", $taglist_line[0], $taglist_line[6]); + push (@read_ids_all, $ref); + + if (($taglist_line[1] <= $lower_limit) || ($taglist_line[2] <= $lower_limit)){ +# if ((($taglist_line[1] <= $lower_limit) || (($taglist_line[1] >= $coverage_limits_lower[0])&&($taglist_line[1] <= $coverage_limits_lower[1]))) || ($taglist_line[2] <= $lower_limit)){ + $ref = join ("\t", $taglist_line[0], $taglist_line[6]); + push (@read_ids_lower, $ref); + }elsif (($taglist_line[1] >= $upper_limit) || ($taglist_line[2] >= $upper_limit)){ + $ref = join ("\t", $taglist_line[0], $taglist_line[6]); + push (@read_ids_upper, $ref); + } + } + } + %ids = map { $_ => 1 } @read_ids_lower; + my @unique_lower = keys %ids; + %ids = map { $_ => 1 } @read_ids_upper; + my @unique_upper = keys %ids; + %ids = map { $_ => 1 } @read_ids_all; + my @unique_all = keys %ids; + for (@unique_all) { + my @junk = split /\//; + push (@reads, $junk[0]); + @junk = split /\t/; + push (@total_readlist, $junk[1]); + } + + map { $count{$_}++ } @reads; + map {if ($count{$_} == 2){ @readid_good = split /\t/; push(@readlist, "$readid_good[1]");} elsif ($count{$_} == 1) { push(@readid_bad, "$_");}} keys (%count); + @reads = {}; + undef %count; + + for (@readlist){ + chomp; + $current_id = $_; + my @pairs_lower = grep { $_ =~ /$current_id/} @unique_lower; + my @pairs_upper = grep{ $_ =~ /$current_id/} @unique_upper; +# print "good id: $current_id\n"; + my $count_lower = scalar @pairs_lower; + my $count_upper = scalar @pairs_upper; +# print "count lower: $count_lower\n"; +# print "count upper: $count_upper\n"; + unless ((scalar @pairs_lower == 2) || (scalar @pairs_upper == 2)){ + push (@readlist_good, "$current_id"); + } + } + for (@readid_bad){ + chomp; + @unsorted = (); + ($junk, $current_id) = split (/\t/); + print "current ID: $current_id\n"; + @singleton = grep { $_ =~ /$current_id/} @total_readlist; + for (@singleton){ + chomp; + $tag = $_; + @taglist = grep { $_ =~ /$tag/} @readtaglist; +# print "taglist: @taglist\n"; + } + for (@taglist) { + @taglist_line = split /\t/; + push(@unsorted, $taglist_line[1], $taglist_line[2]); + $max = max @unsorted; + $min = min @unsorted; + } +# print "unsorted: @unsorted\n"; + print "read mapping from $min to $max\n"; +# print "min: $min\n"; +# print "max: $max\n"; + + if ($min <= $lower_limit){ + print "orphan discarded! min<lowerlimit\n----------------------------------------------\n"; + }elsif ($max >= $upper_limit){ + print "orphan discarded! max>upperlimit\n----------------------------------------------\n"; + }elsif (($min >= $lower_main_limit) && ($max <= $upper_main_limit)){ + print "orphan discarded! lower_main_limit<min-max<upper_main_limit\n----------------------------------------------\n"; + }elsif (($min >= $elevated_cov_lower_start) && ($min <= $elevated_cov_lower_end - ($lower_limit / 2))){ + print "orphan discarded! increased_coverage_lower_start<min<increased_coverage_lower_end\n----------------------------------------------\n"; + }elsif (($max >= ($elevated_cov_upper_start + ($lower_limit / 2))) && ($max <= $elevated_cov_upper_end)){ + print "orphan discarded! increased_coverage_upper_start<max<increased_coverage_upper_end\n----------------------------------------------\n"; + }else { + push(@singletons, "@singleton\n"); + print "orphan resurrected! \n----------------------------------------------\n"; + } +# print "contiglength: $contiglength\n"; + } + + for (@singletons){ + my @resurrection = split /\//; + push (@readlist_good, $resurrection[0]); + } + for (@readlist_good){ + $_ =~ s/$/\/1\n/g; + push(@readlist_proofread, $_); + $_ =~ s/1$/2/g; + push(@readlist_proofread, $_); + } + return @readlist_proofread; +} + +sub standard_deviation { + my(@numbers) = @_; + #Prevent division by 0 error in case you get junk data + return undef unless(scalar(@numbers)); + + # Step 1, find the mean of the numbers + my $total1 = 0; + foreach my $num (@numbers) { + if (!$num){ + $num = 0; + } + $total1 += $num; + } + my $mean1 = $total1 / (scalar @numbers); + push (my @stdev, "$mean1"); + # Step 2, find the mean of the squares of the differences between each number and the mean + my $total2 = 0; + foreach my $num (@numbers) { + if (!$num){ + $num = 0; + } + $total2 += ($mean1-$num)**2; + } + my $mean2 = $total2 / (scalar @numbers); + + # Step 3, standard deviation is the square root of the above mean + my $std_dev = sqrt($mean2); + push (@stdev, "$std_dev"); + return @stdev; +} + +sub assess_coverage{ + my $readtaglist_FH = $_[0]; + my @readtaglist; + my $from =$_[1]; + my $to = $_[2]; + my $where = $_[3]; + my @taglist_line; + my @coverage_array_lower; + my @coverage_array_upper; + my @read_ids_lower; + my @read_ids_upper; + my %ids; + my @taglist; + my @unsorted; + my $min; + my $max; + my %coverage; + my @allnums; + my @coverage_change_position; + my @coverage_limits; + +# print "assessing coverage from position $from to position $to\n"; + open (TAGLIST,"<$readtaglist_FH") or die $!; + while (<TAGLIST>){ + push (@readtaglist, "$_"); + } + close (TAGLIST); + + for (@readtaglist){ + @taglist_line = split /\t/; + unless ($taglist_line[0] =~ /#/){ + if ((($taglist_line[1] >= $from) && ($taglist_line[1] <= $to)) || (($taglist_line[2] >= $from) && ($taglist_line[2] <= $to))){ + push (@coverage_array_lower, "$_"); + push (@read_ids_lower, $taglist_line[6]); + } + } + } + %ids = map { $_ => 1 } @read_ids_lower; + my @unique_lower = keys %ids; + for (@unique_lower){ + my @current_id = $_; + chomp; + @unsorted = (); + for (@current_id){ + my $current_id = $_; + @taglist = grep { $_ =~ /$current_id/} @coverage_array_lower; + } + for (@taglist) { + @taglist_line = split /\t/; + push(@unsorted, $taglist_line[1], $taglist_line[2]); + $max = max @unsorted; + $min = min @unsorted; + } + my @nums = ($min .. $max); + for (@nums){ + push (@allnums, "$_"); + } + } + %coverage = map { $_ => 0 } @allnums; + map { $coverage{$_}++ } @allnums; + open (OUT,">out-$where.csv"); + +########## detecting coverage peak + my $max_cov = 0; + my $max_cov_position; + my @cumulative_coverage; + map { unless (!$coverage{$_}){print OUT "$_,$coverage{$_}\n"; push (@cumulative_coverage, "$coverage{$_}"); if ($coverage{$_} > $max_cov){$max_cov = $coverage{$_}; $max_cov_position = $_; }}} ($from..$to); + my @average_coverage = &standard_deviation(@cumulative_coverage); + my $coverage_factor = $max_cov / $average_coverage[0]; + open (OUT,">>out-$where.csv"); + print OUT "\nmaximum coverage is $max_cov at position $max_cov_position\naverge coverage is: $average_coverage[0], sd: $average_coverage[1]\nfactor $coverage_factor\n"; + close (OUT); + +######### detecting rapid changes in coverage + for ($from..($to - 10)){ + my $position = $_; + my $cov = $coverage{$position}; + unless (!$cov){ + my @positions = (); + push (@positions, "$cov"); + for (1 .. 10){ + my $pos_plus = $position + $_; + if ($coverage{$pos_plus}){ + push (@positions, "$coverage{$pos_plus}"); + } + } + my @stdev = &standard_deviation(@positions); + if ($stdev[1] > 6.0){ + print "positions ($position): @positions -> stdev: $stdev[1]\n"; + push (@coverage_change_position, $position); + }elsif ($stdev[1] >= 4.5){ + print "positions ($position): @positions -> stdev: $stdev[1]\n"; + } + } + } + if (@coverage_change_position){ + print "positions with rapidly changing coverage detected: @coverage_change_position\n"; + my $start = min @coverage_change_position; + my $end = max @coverage_change_position; + push (@coverage_limits, "$start", "$end"); + print "set limits from $coverage_limits[0] to $coverage_limits[1]\n"; + }else{ + print "no irregularities in coverage detected\n"; + push (@coverage_limits, "0", "0"); + return @coverage_limits; + } + + +###### assessing whether coverage peak lies within putative conserved region, if yes accept prediction; if no, reject conserved region + if (($coverage_factor >= 1.6) && (($coverage_limits[0] < $max_cov_position) && ( $max_cov_position < $coverage_limits[1]))){ + + print "suspicious coverage peak detected within the predicted limits\n"; + }else { + print "no coverage peak detected within predicted limits - rejecting limits\n"; + @coverage_limits = (); + push (@coverage_limits, "0", "0"); + } + return @coverage_limits; + +} + +sub check_ref_length{ + my $ref=$_[0]; + my $output_filename=$_[1]; + my $critical=$_[2]; + my $kmer = $_[3]; + my @header; + my $header_count=0; + my (@sequence,@temp_output,@final_output); + my $full_sequence; + open(REF,"<$ref") or die $!; + while(<REF>){ + chomp; + if ($_ =~ /^>/){ + push(@header,$_); +# print "found header:\n$header[$header_count]\n"; + $header_count++; +# print "header count: $header_count\n"; + if (@sequence){ + @temp_output=&finalize_sequence($critical,$header[-2],$kmer,@sequence); + for (@temp_output){ + push(@final_output,$_); + } + } + undef @sequence; + }elsif ($_ =~ /[a-zA-Z]/){ +# print "found sequence:\n$_\n"; + push(@sequence,$_); + } + } + @temp_output=&finalize_sequence($critical,$header[-1],$kmer,@sequence); + for (@temp_output){ + push(@final_output,$_); + } +# print "result:\n"; + open (OUT,">$output_filename") or die $!; + for(@final_output){ +# print "$_\n"; + print OUT "$_\n"; + } + close REF; + close OUT; + +} + +sub finalize_sequence{ + my $critical = shift; + my $header = shift; + my $kmer = shift; + my $full_sequence=join("",@_); + my $factor; + my @output; + if (!$critical){ + $factor=0; + }else{ + $factor=ceil(length($full_sequence)/$critical); + } + if ($factor == 1){ + push(@output,$header); + push(@output,$full_sequence); + }else{ #too long + print "\nreference is too long for mirabait to be handled in one go -> will be split into sub-sequences\n"; + $header=substr $header, 1; + for (my $i=0; $i<$factor; $i++){ + unless ((length(substr $full_sequence, $i*$critical, $critical+$kmer)-$kmer)<0){ + push(@output,">sub$i\_" .$header); + push(@output,substr $full_sequence, $i*$critical, $critical+$kmer); + } + } + } + return @output; +} + +sub create_manifest { + my ($iter, $sampleID, $refID, $mmode, $trim, $platform, $solexa_missmatches, $pair, $overhang, $reads, $ref, $redirect, $NFS_warn); + $iter = $_[0]; + $sampleID = $_[1]; + $refID = $_[2]; + $mmode = $_[3]; + $trim = $_[4]; + $platform = $_[5]; + $solexa_missmatches = $_[6]; + $pair = $_[7]; + $overhang = $_[8]; + $reads = $_[9]; + $ref = $_[10]; + $redirect = $_[11]; + $NFS_warn = $_[12]; + + if ($NFS_warn){ + $NFS_warn = ":cnfs=warn" + } + + open (MANIFEST,">manifest.conf") or die $!; + print MANIFEST "#manifest file for iteration $iter created by MITObim\n\nproject = $sampleID-$refID + \njob = genome,$mmode,accurate + \nparameters = -NW:mrnl=0$NFS_warn -AS:nop=1 $redirect $overhang $platform $trim -CO:msr=no $solexa_missmatches\n"; + my @technology = split("_",$platform); + #-notraceinfo - + if ($mmode eq "mapping"){ + print MANIFEST "\nreadgroup\nis_reference\ndata = $ref\nstrain = $refID\n"; + } + print MANIFEST "\nreadgroup = reads\ndata = $reads\ntechnology = $technology[0]"; + if ($pair){ +# print MANIFEST "\nsegmentplacement = ---> <--- infoonly"; + print MANIFEST "\nsegmentplacement = ---> <--- exclusion_criterion"; + print MANIFEST "\ntemplatesize = -1 600 exclusion_criterion"; + } + print MANIFEST "\nstrain = $sampleID\n"; + close MANIFEST; +} +sub clean { + my $interval = shift; + my $cur = shift; + my $dir = $cur-$interval; + my $path=abs_path; + if (-d "$path/../iteration$dir"){ + print "\nnow removing directory iteration$dir\n"; + rmtree ("$path/../iteration$dir") or die $!; + } +} + +sub remove_unmapped_contigs{ + my $file = shift; + my $out = shift; + my ($head, $seq,$length); + my @split; + my ($dropped,$split,$trim)=(0,0,0); + open (FASTA,"<$out") or die $!."\ncould not open $file for reading\n"; + while (<FASTA>){ + chomp; + if ($_ =~ /^>/){ + $head=$_; + }else { + if ($_ =~ /X/){ + $dropped++; + print "drop:\n$head\n$_\n"; + }else{ + $length=length($_); + $_ =~ s/^N+//g; #remove all Ns from the beginning of the sequence + $_ =~ s/N+$//g; #remove all Ns from the end of the sequence + if ($length != length($_)){ + $trim++; + } + @split=split(/N{3,}/); #split at every occurance of 3 or more Ns + if (@split > 1){ + print "contig $head has been split into ".scalar @split." sub-contigs\n"; + $split++; + for (my $i=0; $i<@split; $i++){ +# print "$head\_$i\n$split[$i]\n"; + $seq.="$head\_$i\n$split[$i]\n"; + } + }else{ +# print "$head\n$_\n"; + $seq.="$head\n$_\n"; + } + } + } + } + close FASTA; + if (($dropped) || ($split) || ($trim)){ + print "creating backup of $out before changing it\n"; + copy $out,$file; + open (OUT,">$out") or die $!."\ncould not open $out for writing\n"; + print OUT "$seq\n"; + close OUT; + } + + if ($dropped){ + print "\nremoved $dropped contigs from baitfile because they did not receive any hits\n"; + }else{ + print "\nno need to remove any sequences from the baitfile\n"; + } + + if (!$split){ + print "\nno need to split any contigs\n"; + } + if ($trim){ + print "\n$trim contigs had N's trimmed off their ends\n"; + } +}