Mercurial > repos > lijing > bubio
view MITObim_1.8.pl @ 6:a03d23c6ab95 draft
MitoBim and interleave
author | lijing |
---|---|
date | Thu, 02 Nov 2017 12:44:55 -0400 |
parents | |
children |
line wrap: on
line source
#! /usr/bin/perl # # MITObim - mitochondrial baiting and iterative mapping # wrapper script version 1.8 # Author: Christoph Hahn, 2012-2016 # christoph.hahn@nhm.uio.no use strict; use warnings; use Getopt::Long; use Cwd qw(abs_path); use File::Copy; use List::Util qw< min max >; use POSIX qw(strftime); use POSIX qw(ceil); use File::Path 'rmtree'; my $startiteration = 1; my $enditeration = 1; my ($quick, $noshow, $help, $strainname, $paired, $mode, $refname, $readpool, $maf, $proofreading, $readlength, $insertsize, $MM, $trim, $trimoverhang, $k_bait, $clean, $clean_interval, $min_contig_cov) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 31, 0, 2, 0); my ($miramode, $key, $val, $exit, $current_contiglength, $current_number_of_reads, $current_number_of_contigs); my $splitting = 0; my $platform = "solexa"; my $platform_settings;# = "SOLEXA"; my $shme = ""; my $trim_off = ""; my $redirect_temp = ""; my $NFS_warn_only = ""; my ($mirapath, $mira, $miraconvert, $mirabait) = ("", "mira", "miraconvert", "mirabait"); my (@reads, @output, @path, @current_contig_stats, @contiglengths, @number_of_reads, @number_of_contigs); my %hash; my $PROGRAM = "\nMITObim - mitochondrial baiting and iterative mapping\n"; my $VERSION = "version 1.8\n"; my $AUTHOR = "author: Christoph Hahn, (c) 2012-2016\n"; my $cite = "\nif you found MITObim useful, please cite: Hahn C, Bachmann L and Chevreux B. (2013) Reconstructing mitochondrial genomes directly from genomic next-generation sequencing reads - a baiting and iterative mapping approach. Nucl. Acids Res. 41(13):e129. doi: 10.1093/nar/gkt371\n\n"; my $USAGE = "\nusage: ./MITObim.pl <parameters> \nparameters: -start <int> iteration to start with, default=1 -end <int> iteration to end with, default=1 -sample <string> sampleID as used in initial MIRA assembly -ref <string> referenceID as used in initial MIRA assembly -readpool <FILE> readpool in fastq format (*.gz is also allowed) -maf <FILE> maf file from previous MIRA assembly \noptional: --quick <FILE> starts process with initial baiting using provided fasta reference --kbait <int> set kmer for baiting stringency (default: 31) --platform specify sequencing platform (default: 'solexa'; other options: 'iontor', '454', 'pacbio') --denovo runs MIRA in denovo mode (default: mapping) --pair finds pairs after baiting (relies on /1 and /2 header convention for read pairs) (default: no) --verbose show detailed output of MIRA modules (default: no) --split split reference at positions with more than 5N (default: no) --help shows this helpful information --clean retain only the last 2 iteration directories (default: no) --trimreads trim data (default: no; we recommend to trim beforehand and feed MITObim with pre trimmed data) --trimoverhang trim overhang up- and downstream of reference (default: no) --missmatch <int> number of allowed missmatches in mapping - only for illumina data (default: 15% of avg. read length) --min_cov <int> minimum average coverage of contigs to be retained (default: off) --mirapath <string> full path to MIRA binaries (only needed if MIRA is not in PATH) --redirect_tmp redirect temporary output to this location (useful in case you are running MITObim on an NFS mount) --NFS_warn_only allow MIRA to run on NFS mount without aborting - warn only (expert option - see MIRA documentation 'check_nfs') \nexamples: ./MITObim.pl -start 1 -end 5 -sample StrainX -ref reference-mt -readpool illumina_readpool.fastq -maf initial_assembly.maf ./MITObim.pl -end 10 --quick reference.fasta -sample StrainY -ref reference-mt -readpool illumina_readpool.fastq\n\n"; # --proofread applies proofreading (atm only to be used if starting the process from a single short seed reference) # --readlength <int> read length of illumina library, default=150, relevant only for proofreading # --insert <int> insert size of illumina library, default=300, relevant only for proofreading my $command = $0; for (@ARGV){ $command .= " $_"; } GetOptions ( "start=i" => \$startiteration, "end=i" => \$enditeration, "kbait=i" => \$k_bait, "quick=s" => \$quick, "verbose!" => \$noshow, "sample=s" => \$strainname, "paired" => \$paired, "denovo" => \$mode, "ref=s" => \$refname, "readpool=s" => \$readpool, "help!" => \$help, "clean!" => \$clean, "mirapath=s" => \$mirapath, "maf=s" => \$maf, # "proofreading!" => \$proofreading, "trimreads!" => \$trim, "trimoverhang!" => \$trimoverhang, "missmatch=i" => \$MM, "platform=s" => \$platform, # "readlength=i" => \$readlength, # "insertsize=i" => \$insertsize, "split!" => \$splitting, "min_cov=i" => \$min_contig_cov, "redirect_tmp=s" => \$redirect_temp, "NFS_warn_only!" => \$NFS_warn_only) or die "Incorrect usage!\n$USAGE"; print $PROGRAM; print $VERSION; print $AUTHOR; print $USAGE and exit if $help; print $USAGE and exit if !$readpool; unless ($quick){ print $USAGE and exit if !$maf; } print $USAGE and exit if !$refname; $readpool=abs_path($readpool); unless (-e $readpool){ print "Cant find the readpool. Is the path correct?\n"; } if ($maf){ $maf=abs_path($maf); unless (-e $maf){ print "Cant find *.maf file. Is the path correct?\n"; } } ($platform, $platform_settings) = &set_platform($platform); if ($quick){ $quick=abs_path($quick); unless (-e $quick){ print "quick option selected but is the path to the file correct?\n"; exit 1; } print "\nquick option selected! -maf option will be ignored (if given)\n"; $maf = 0; $startiteration = 0; } print $USAGE and exit if ($startiteration > $enditeration); unless (((-e $maf)||($quick)) && (-e $readpool)){ print "\nAre readpool AND maf/reference files there?\n"; exit 1; } if ($mirapath){ if (-e "$mirapath/mira"){ print "found executables in the path specified by the user - good!\n"; $mira = "$mirapath/mira"; $miraconvert = "$mirapath/miraconvert"; $mirabait = "$mirapath/mirabait"; }else{ print "somethings wrong with the path to mira.\n"; exit 1; } } ##if not given otherwise, readlength and insertsize are set to default. automatic readlength and insertsize detection will be implemented in time. #if (!$readlength){ # $readlength = 150; #} #if (!$insertsize){ # $insertsize = 300; #} if (!$mode){ $miramode = "mapping"; }else { $miramode = "denovo"; } if (!$trim){ $trim_off = "\"--noclipping -CL:pec=no\""; } my $out_command = $command; $out_command =~ s/\S+galaxy\///g; print "\nFull command run:\n$out_command\n"; print "\nAll paramters seem to make sense:\n"; print "startiteration: $startiteration\n"; print "enditeration: $enditeration\n"; print "sample: $strainname\n"; print "refname: $refname\n"; my $out_readpool = $readpool; $out_readpool =~ s/.*galaxy\///; print "readpool: $out_readpool\n"; print "maf: $maf\n"; my $out_quick = $quick; $out_quick =~ s/.*galaxy\///; print "quick: $out_quick\n"; print "paired: $paired (off=0, on=1)\n"; print "assembly mode: $mode (mapping=0, denovo=1)\n"; print "verbose: $noshow (off=0, on=1)\n"; print "split: $splitting (off=0, on=1)\n"; print "minimum avg. coverage: $min_contig_cov (off=0)\n"; print "clean: $clean (off=0, on=1)\n"; print "trim reads: $trim (off=0, on=1)\n"; print "trim overhang: $trimoverhang (no=0, yes=1)\n"; print "platform: $platform\n"; print "kmer baiting: $k_bait\n"; #print "proofread: $proofreading (off=0, on=1)\n"; if ($proofreading){ print "\nproofreading is not yet enabled in this beta version of MITObim 1.8 - it is currently being optimized for MIRA4. \nplease refer to MITObim 1.6 if you wish to use this option. Sorry for the inconvenience!\n\n"; exit; print "proofreading: on\n"; print "readlength: $readlength\n"; print "insertsize: $insertsize\n"; $MM = 0; print "number of allowed missmatches in proofreading assembly: $MM\n"; $shme = "-AL:shme=$MM"; }elsif ((!$proofreading) && (!$mode) && ($platform eq "solexa")){ if ($MM == -1){ print "number of missmatches in mapping assembly: default (15% of average read length loaded)\n"; $shme = ""; }else { print "number of missmatches in mapping assembly: $MM\n"; $shme = "-AL:shme=$MM"; } print "proofreading: off\n"; } if (!$trimoverhang){ $trimoverhang = "-SB:tor=no"; }else { $trimoverhang = "-SB:tor=yes"; } if ($redirect_temp) { $redirect_temp="-DI:trt=$redirect_temp" } print "\nStarting MITObim \n"; my @iteration = ($startiteration .. $enditeration); foreach (@iteration){ chomp; my $currentiteration = $_; mkdir "iteration$currentiteration" or die "MITObim will not overwrite an existing directory: iteration$currentiteration\n"; chdir "iteration$currentiteration" or die $!; print "\n==============\n"; print " ITERATION $currentiteration\n"; print "==============\n"; print strftime("%b %e %H:%M:%S", localtime) . "\n\n"; # if (($proofreading) && ($currentiteration != 0)){ if ($proofreading){ $shme = "-AL:shme=0"; } if ($maf){ print "\nrecover backbone by running miraconvert on maf file\n\n"; if ($currentiteration<2){ @output= qx($miraconvert -f maf -t fasta -A "$platform_settings -CO:fnicpst=yes" $maf tmp); }else{ @output= qx($miraconvert -f maf -t fasta -y $min_contig_cov -A "$platform_settings -CO:fnicpst=yes" $maf tmp); } $exit = $? >> 8; unless (!$noshow){ print "@output\n"; } unless ($exit == 0){ if (!$noshow){ print "@output\n"; } print "\nmiraconvert seems to have failed - see detailed output above\n"; exit; } # if ( ((($mode) && ($currentiteration > 1)) && (!$quick)) || ((($mode) && ($currentiteration >= 1)) && ($quick)) ){ # open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!"; # }else{ open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!\nIs the sampleID identical as in the initial MIRA assembly?\n"; # } open(FH2,">backbone_it$currentiteration\_initial_$refname.fna") or die $!; while (<FH1>) { $_ =~ s/x/N/g; print FH2 $_; } close(FH1); close(FH2); unlink glob ("tmp*"); } MIRABAIT: unless ($maf){ print "\nquick option baits reads from provided reference in iteration 0\n"; copy("$quick", "backbone_it$currentiteration\_initial_$refname.fna") or die "copy failed: $!"; } &check_ref_length("backbone_it$currentiteration\_initial_$refname.fna","temp_baitfile.fasta",29800,$k_bait); if ($splitting){ &remove_unmapped_contigs("temp_baitfile_backup.fasta","temp_baitfile.fasta"); if (-e "temp_baitfile_backup.fasta"){ #this file has only been created if splitting contigs was necessary copy "temp_baitfile.fasta","backbone_it$currentiteration\_initial_$refname.fna"; } } print "\nfishing readpool using mirabait (k = $k_bait)\n\n"; # @output = (`mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-$refname\_in.$platform 2>&1`); @output = qx($mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-readpool-it$currentiteration); $exit = $? >> 8; unless (!$noshow){ print "@output\n"; } if (!-s "$strainname-readpool-it$currentiteration.fastq"){ print "\nyour readpool does not contain any reads with reasonable match (k = $k_bait) to your reference - Maybe you ll want to different settings or even a different reference?\n\n"; exit; } unless ($exit == 0){ if (!$noshow){ print "@output\n"; } print "\nmirabait seems to have failed - see detailed output above\n"; exit; } FINDPAIRS: unless (!$paired){ print "\nfinding pairs for baited reads\n\n"; copy ("$strainname-readpool-it$currentiteration.fastq", "$strainname-readpool-it$currentiteration-se.fastq") or die "copy failed: $!"; open(FH1,"<$strainname-readpool-it$currentiteration.fastq") or die $!; open(FH2,">list"); my $index=1; while (<FH1>) { if ($index % 8 ==1 || $index % 8 ==5) { chomp; $_ =~ s/@//g; ($key, $val) = split /\//; $hash{$key} .= exists $hash{$key} ? ",$val" : $val; } $index++; } for (keys %hash){ $_ =~ s/$/\/1/g; print FH2 "$_\n"; $_ =~ s/1$/2/g; print FH2 "$_\n"; } close(FH1); close(FH2); # @output = (`miraconvert -f fastq -t fastq -n list $readpool $strainname-$refname\_in.$platform 2>&1`); @output = qx($miraconvert -f fastq -t fastq -n list $readpool $strainname-readpool-it$currentiteration ); $exit = $? >> 8; unless (!$noshow){ print "@output\n"; } unless ($exit == 0){ if (!$noshow){ print "@output\n"; } print "\nmiraconvert seems to have failed - see detailed output above\n"; exit; } } unlink("list"); MIRA: print "\nrunning $miramode assembly using MIRA\n\n"; &create_manifest($currentiteration,$strainname,$refname,$miramode,$trim_off,$platform_settings,$shme,$paired,$trimoverhang,"$strainname-readpool-it$currentiteration.fastq","backbone_it$currentiteration\_initial_$refname.fna", $redirect_temp, $NFS_warn_only); @output = qx($mira manifest.conf ); $exit = $? >> 8; unless (!$noshow){ print "@output\n"; } unless ($exit == 0){ if (!$noshow){ print "@output\n"; } print "\nMIRA seems to have failed - see detailed output above\n"; exit; } if ($redirect_temp) { my $tmp_link = readlink("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp"); rmtree ("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp") or die $!; mkdir "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/" or die $1; my @files = glob("$tmp_link/*"); for (@files) { move ("$_", "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/") or die $!; } rmtree ("$tmp_link") or die $!; } @path = abs_path; push (@path, "/$strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out.maf"); $maf = join("",@path); unless (-e $maf){ print "maf file is not there \n"; exit; } # $current_contiglength = &get_contig_length("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt"); # $current_number_of_reads = (&get_number_of_reads("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt") - 1); @current_contig_stats = &get_contig_stats("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt"); if (((scalar @current_contig_stats > 3) || ($current_contig_stats[0] > 1)) && ($proofreading)) { print "assembly consists of more than one contigs - this is atm not permitted in proofreading mode. Sorry!\n\n"; exit 1; } PROOFREAD: # if (($proofreading) && ($currentiteration >= 1)){ if ($proofreading){ print "proofreading option is currently disabled in this beta version of MITObim 1.7 - sorry for the inconvenience!\n\n"; exit; } $current_number_of_contigs = shift @current_contig_stats; $current_number_of_reads = shift @current_contig_stats; if (!$mode){ #in mapping assemblies the reference is counted as one read $current_number_of_reads -= $current_number_of_contigs; } push (@number_of_contigs, $current_number_of_contigs); push (@number_of_reads, $current_number_of_reads); print "readpool contains $current_number_of_reads reads\n"; print "assembly contains $current_number_of_contigs contig(s)\n"; if (scalar @current_contig_stats == 1){ print "contig length: $current_contig_stats[0]\n"; }elsif (scalar @current_contig_stats == 3){ print "min contig length: ".$current_contig_stats[0]." bp\nmax contig length: ".$current_contig_stats[1]." bp\navg contig length: ".sprintf("%.0f", $current_contig_stats[2])." bp\n"; print "find details on individual contigs in: ". abs_path . "/$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt\n"; # printf("%1f",$current_contig_stats[2])."\n"; }else { print "somethings wrong with your contig stats. Sorry!\n"; exit 1; } if ($clean){ &clean($clean_interval, $currentiteration); } # my $path=abs_path; print "Now at $path\n"; system("cp $strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out_AllStrains.unpadded.fasta ../finaloutput.fasta"); system("sed -i 's/_bb.*/_mitobim_out_$currentiteration\_iterations/' ../finaloutput.fasta"); if ($number_of_reads[-2]){ if ($number_of_reads[-2] >= $number_of_reads[-1]){ print "\nMITObim has reached a stationary read number after $currentiteration iterations!!\n$cite"; print strftime("%b %e %H:%M:%S", localtime) . "\n\n"; exit; } } chdir ".." or die "Failed to go to parent directory: $!"; } print "\nsuccessfully completed $enditeration iterations with MITObim! " . strftime("%b %e %H:%M:%S", localtime) . "\n$cite"; # # ###SUBROUTINES # # # sub set_platform{ my @platforms = ('solexa', '454', 'iontorrent', 'pacbio'); my %pf_settings = ("solexa" => "SOLEXA_SETTINGS", "454" => "454_SETTINGS", "iontorrent" => "IONTOR_SETTINGS", "pacbio" => "PCBIOHQ_SETTINGS -CO:mrpg=5"); my $user_setting = shift; my @user_choice; for (@platforms){ if ( $_ =~ /^$user_setting/ ){ push(@user_choice, $_); } } if (scalar(@user_choice) > 1){ print "\nYour platform choice was ambiguous: $user_setting could be: @user_choice - Please try again\n"; exit; } elsif (scalar(@user_choice) == 0){ print "\nPlease specify a valid sequencing platform - not specifying anything will default into: $platforms[0]\n\navailable options are:\n@platforms\n"; exit; } return ($user_choice[0], $pf_settings{$user_choice[0]}) } sub get_contig_stats{ my $contig = $_[0]; my @array; my @contiglength; my (@readnumber, @stats); my $readssum = 0; open (CONTIGSTATS,"<$contig") or die $!; while (<CONTIGSTATS>){ unless ($_ =~ /#/){ @array = split /\t/; push (@contiglength, $array[1]); push (@readnumber, $array[3]); } } close (CONTIGSTATS); if (scalar @readnumber == 1){ push (@stats, (scalar @readnumber, $readnumber[0], $contiglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, length of contig } elsif (scalar @readnumber > 1){ $readssum += $_ for @readnumber; my $minlength = min @contiglength; my $maxlength = max @contiglength; my @avglength = &standard_deviation(@contiglength); push (@stats, (scalar @readnumber, $readssum, $minlength, $maxlength, $avglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, minimal, maximal, avg length of contigs } return @stats; } #sub get_contig_length{ # my $contig = $_[0]; # my @contiglength; # open (CONTIGSTATS,"<$contig") or die $!; # while (<CONTIGSTATS>){ # unless ($_ =~ /#/){ # @contigslength = split /\t/; # } # } # close (CONTIGSTATS); # return $contiglength[1]; #} #sub get_number_of_reads{ # my $contig = $_[0]; # my @contigstats; # open (CONTIGSTATS,"<$contig") or die $!; # while (<CONTIGSTATS>){ # unless ($_ =~ /#/){ # @contigstats = split /\t/; # } # } # close (CONTIGSTATS); # return $contigstats[3]; #} sub proofread { my $zero_MM = $_[0]; my $readtaglist_FH = $_[1]; my $contiglength = $_[2]; my $elevated_cov_lower_start = $_[3]; my $elevated_cov_lower_end = $_[4]; my $elevated_cov_upper_start = $_[5]; my $elevated_cov_upper_end = $_[6]; my $lower_limit = $_[7]; # my $lower_limit = 200; my $lower_main_limit = $_[8]; # my $lower_main_limit = 500; my $upper_limit = $contiglength - $lower_limit; my $upper_main_limit = $contiglength - $lower_main_limit; my @readtaglist; my $ref; my $junk; my $current_id; my %count; my @readid_good; my @readid_bad; my @reads; my @readlist; my @readlist_good; my @readlist_proofread; my @total_readlist; my @singleton; my @singletons; my @taglist; my @taglist_line; my @readtaglist_lower; my @readtaglist_upper; my @read_ids_lower; my @read_ids_all; my @read_ids_upper; my %ids =(); my @unsorted; my $min; my $max; my $tag; print "\nlower limit: $lower_limit\n"; print "upper limit: $upper_limit\n"; print "lower main limit: $lower_main_limit\n"; print "upper main limit: $upper_main_limit\n\n"; open (TAGLIST,"<$readtaglist_FH") or die $!; while (<TAGLIST>){ push (@readtaglist, "$_"); } close (TAGLIST); for (@readtaglist){ @taglist_line = split /\t/; unless ($taglist_line[0] =~ /#/){ $ref = join ("\t", $taglist_line[0], $taglist_line[6]); push (@read_ids_all, $ref); if (($taglist_line[1] <= $lower_limit) || ($taglist_line[2] <= $lower_limit)){ # if ((($taglist_line[1] <= $lower_limit) || (($taglist_line[1] >= $coverage_limits_lower[0])&&($taglist_line[1] <= $coverage_limits_lower[1]))) || ($taglist_line[2] <= $lower_limit)){ $ref = join ("\t", $taglist_line[0], $taglist_line[6]); push (@read_ids_lower, $ref); }elsif (($taglist_line[1] >= $upper_limit) || ($taglist_line[2] >= $upper_limit)){ $ref = join ("\t", $taglist_line[0], $taglist_line[6]); push (@read_ids_upper, $ref); } } } %ids = map { $_ => 1 } @read_ids_lower; my @unique_lower = keys %ids; %ids = map { $_ => 1 } @read_ids_upper; my @unique_upper = keys %ids; %ids = map { $_ => 1 } @read_ids_all; my @unique_all = keys %ids; for (@unique_all) { my @junk = split /\//; push (@reads, $junk[0]); @junk = split /\t/; push (@total_readlist, $junk[1]); } map { $count{$_}++ } @reads; map {if ($count{$_} == 2){ @readid_good = split /\t/; push(@readlist, "$readid_good[1]");} elsif ($count{$_} == 1) { push(@readid_bad, "$_");}} keys (%count); @reads = {}; undef %count; for (@readlist){ chomp; $current_id = $_; my @pairs_lower = grep { $_ =~ /$current_id/} @unique_lower; my @pairs_upper = grep{ $_ =~ /$current_id/} @unique_upper; # print "good id: $current_id\n"; my $count_lower = scalar @pairs_lower; my $count_upper = scalar @pairs_upper; # print "count lower: $count_lower\n"; # print "count upper: $count_upper\n"; unless ((scalar @pairs_lower == 2) || (scalar @pairs_upper == 2)){ push (@readlist_good, "$current_id"); } } for (@readid_bad){ chomp; @unsorted = (); ($junk, $current_id) = split (/\t/); print "current ID: $current_id\n"; @singleton = grep { $_ =~ /$current_id/} @total_readlist; for (@singleton){ chomp; $tag = $_; @taglist = grep { $_ =~ /$tag/} @readtaglist; # print "taglist: @taglist\n"; } for (@taglist) { @taglist_line = split /\t/; push(@unsorted, $taglist_line[1], $taglist_line[2]); $max = max @unsorted; $min = min @unsorted; } # print "unsorted: @unsorted\n"; print "read mapping from $min to $max\n"; # print "min: $min\n"; # print "max: $max\n"; if ($min <= $lower_limit){ print "orphan discarded! min<lowerlimit\n----------------------------------------------\n"; }elsif ($max >= $upper_limit){ print "orphan discarded! max>upperlimit\n----------------------------------------------\n"; }elsif (($min >= $lower_main_limit) && ($max <= $upper_main_limit)){ print "orphan discarded! lower_main_limit<min-max<upper_main_limit\n----------------------------------------------\n"; }elsif (($min >= $elevated_cov_lower_start) && ($min <= $elevated_cov_lower_end - ($lower_limit / 2))){ print "orphan discarded! increased_coverage_lower_start<min<increased_coverage_lower_end\n----------------------------------------------\n"; }elsif (($max >= ($elevated_cov_upper_start + ($lower_limit / 2))) && ($max <= $elevated_cov_upper_end)){ print "orphan discarded! increased_coverage_upper_start<max<increased_coverage_upper_end\n----------------------------------------------\n"; }else { push(@singletons, "@singleton\n"); print "orphan resurrected! \n----------------------------------------------\n"; } # print "contiglength: $contiglength\n"; } for (@singletons){ my @resurrection = split /\//; push (@readlist_good, $resurrection[0]); } for (@readlist_good){ $_ =~ s/$/\/1\n/g; push(@readlist_proofread, $_); $_ =~ s/1$/2/g; push(@readlist_proofread, $_); } return @readlist_proofread; } sub standard_deviation { my(@numbers) = @_; #Prevent division by 0 error in case you get junk data return undef unless(scalar(@numbers)); # Step 1, find the mean of the numbers my $total1 = 0; foreach my $num (@numbers) { if (!$num){ $num = 0; } $total1 += $num; } my $mean1 = $total1 / (scalar @numbers); push (my @stdev, "$mean1"); # Step 2, find the mean of the squares of the differences between each number and the mean my $total2 = 0; foreach my $num (@numbers) { if (!$num){ $num = 0; } $total2 += ($mean1-$num)**2; } my $mean2 = $total2 / (scalar @numbers); # Step 3, standard deviation is the square root of the above mean my $std_dev = sqrt($mean2); push (@stdev, "$std_dev"); return @stdev; } sub assess_coverage{ my $readtaglist_FH = $_[0]; my @readtaglist; my $from =$_[1]; my $to = $_[2]; my $where = $_[3]; my @taglist_line; my @coverage_array_lower; my @coverage_array_upper; my @read_ids_lower; my @read_ids_upper; my %ids; my @taglist; my @unsorted; my $min; my $max; my %coverage; my @allnums; my @coverage_change_position; my @coverage_limits; # print "assessing coverage from position $from to position $to\n"; open (TAGLIST,"<$readtaglist_FH") or die $!; while (<TAGLIST>){ push (@readtaglist, "$_"); } close (TAGLIST); for (@readtaglist){ @taglist_line = split /\t/; unless ($taglist_line[0] =~ /#/){ if ((($taglist_line[1] >= $from) && ($taglist_line[1] <= $to)) || (($taglist_line[2] >= $from) && ($taglist_line[2] <= $to))){ push (@coverage_array_lower, "$_"); push (@read_ids_lower, $taglist_line[6]); } } } %ids = map { $_ => 1 } @read_ids_lower; my @unique_lower = keys %ids; for (@unique_lower){ my @current_id = $_; chomp; @unsorted = (); for (@current_id){ my $current_id = $_; @taglist = grep { $_ =~ /$current_id/} @coverage_array_lower; } for (@taglist) { @taglist_line = split /\t/; push(@unsorted, $taglist_line[1], $taglist_line[2]); $max = max @unsorted; $min = min @unsorted; } my @nums = ($min .. $max); for (@nums){ push (@allnums, "$_"); } } %coverage = map { $_ => 0 } @allnums; map { $coverage{$_}++ } @allnums; open (OUT,">out-$where.csv"); ########## detecting coverage peak my $max_cov = 0; my $max_cov_position; my @cumulative_coverage; map { unless (!$coverage{$_}){print OUT "$_,$coverage{$_}\n"; push (@cumulative_coverage, "$coverage{$_}"); if ($coverage{$_} > $max_cov){$max_cov = $coverage{$_}; $max_cov_position = $_; }}} ($from..$to); my @average_coverage = &standard_deviation(@cumulative_coverage); my $coverage_factor = $max_cov / $average_coverage[0]; open (OUT,">>out-$where.csv"); print OUT "\nmaximum coverage is $max_cov at position $max_cov_position\naverge coverage is: $average_coverage[0], sd: $average_coverage[1]\nfactor $coverage_factor\n"; close (OUT); ######### detecting rapid changes in coverage for ($from..($to - 10)){ my $position = $_; my $cov = $coverage{$position}; unless (!$cov){ my @positions = (); push (@positions, "$cov"); for (1 .. 10){ my $pos_plus = $position + $_; if ($coverage{$pos_plus}){ push (@positions, "$coverage{$pos_plus}"); } } my @stdev = &standard_deviation(@positions); if ($stdev[1] > 6.0){ print "positions ($position): @positions -> stdev: $stdev[1]\n"; push (@coverage_change_position, $position); }elsif ($stdev[1] >= 4.5){ print "positions ($position): @positions -> stdev: $stdev[1]\n"; } } } if (@coverage_change_position){ print "positions with rapidly changing coverage detected: @coverage_change_position\n"; my $start = min @coverage_change_position; my $end = max @coverage_change_position; push (@coverage_limits, "$start", "$end"); print "set limits from $coverage_limits[0] to $coverage_limits[1]\n"; }else{ print "no irregularities in coverage detected\n"; push (@coverage_limits, "0", "0"); return @coverage_limits; } ###### assessing whether coverage peak lies within putative conserved region, if yes accept prediction; if no, reject conserved region if (($coverage_factor >= 1.6) && (($coverage_limits[0] < $max_cov_position) && ( $max_cov_position < $coverage_limits[1]))){ print "suspicious coverage peak detected within the predicted limits\n"; }else { print "no coverage peak detected within predicted limits - rejecting limits\n"; @coverage_limits = (); push (@coverage_limits, "0", "0"); } return @coverage_limits; } sub check_ref_length{ my $ref=$_[0]; my $output_filename=$_[1]; my $critical=$_[2]; my $kmer = $_[3]; my @header; my $header_count=0; my (@sequence,@temp_output,@final_output); my $full_sequence; open(REF,"<$ref") or die $!; while(<REF>){ chomp; if ($_ =~ /^>/){ push(@header,$_); # print "found header:\n$header[$header_count]\n"; $header_count++; # print "header count: $header_count\n"; if (@sequence){ @temp_output=&finalize_sequence($critical,$header[-2],$kmer,@sequence); for (@temp_output){ push(@final_output,$_); } } undef @sequence; }elsif ($_ =~ /[a-zA-Z]/){ # print "found sequence:\n$_\n"; push(@sequence,$_); } } @temp_output=&finalize_sequence($critical,$header[-1],$kmer,@sequence); for (@temp_output){ push(@final_output,$_); } # print "result:\n"; open (OUT,">$output_filename") or die $!; for(@final_output){ # print "$_\n"; print OUT "$_\n"; } close REF; close OUT; } sub finalize_sequence{ my $critical = shift; my $header = shift; my $kmer = shift; my $full_sequence=join("",@_); my $factor; my @output; if (!$critical){ $factor=0; }else{ $factor=ceil(length($full_sequence)/$critical); } if ($factor == 1){ push(@output,$header); push(@output,$full_sequence); }else{ #too long print "\nreference is too long for mirabait to be handled in one go -> will be split into sub-sequences\n"; $header=substr $header, 1; for (my $i=0; $i<$factor; $i++){ unless ((length(substr $full_sequence, $i*$critical, $critical+$kmer)-$kmer)<0){ push(@output,">sub$i\_" .$header); push(@output,substr $full_sequence, $i*$critical, $critical+$kmer); } } } return @output; } sub create_manifest { my ($iter, $sampleID, $refID, $mmode, $trim, $platform, $solexa_missmatches, $pair, $overhang, $reads, $ref, $redirect, $NFS_warn); $iter = $_[0]; $sampleID = $_[1]; $refID = $_[2]; $mmode = $_[3]; $trim = $_[4]; $platform = $_[5]; $solexa_missmatches = $_[6]; $pair = $_[7]; $overhang = $_[8]; $reads = $_[9]; $ref = $_[10]; $redirect = $_[11]; $NFS_warn = $_[12]; if ($NFS_warn){ $NFS_warn = ":cnfs=warn" } open (MANIFEST,">manifest.conf") or die $!; print MANIFEST "#manifest file for iteration $iter created by MITObim\n\nproject = $sampleID-$refID \njob = genome,$mmode,accurate \nparameters = -NW:mrnl=0$NFS_warn -AS:nop=1 $redirect $overhang $platform $trim -CO:msr=no $solexa_missmatches\n"; my @technology = split("_",$platform); #-notraceinfo - if ($mmode eq "mapping"){ print MANIFEST "\nreadgroup\nis_reference\ndata = $ref\nstrain = $refID\n"; } print MANIFEST "\nreadgroup = reads\ndata = $reads\ntechnology = $technology[0]"; if ($pair){ # print MANIFEST "\nsegmentplacement = ---> <--- infoonly"; print MANIFEST "\nsegmentplacement = ---> <--- exclusion_criterion"; print MANIFEST "\ntemplatesize = -1 600 exclusion_criterion"; } print MANIFEST "\nstrain = $sampleID\n"; close MANIFEST; } sub clean { my $interval = shift; my $cur = shift; my $dir = $cur-$interval; my $path=abs_path; if (-d "$path/../iteration$dir"){ print "\nnow removing directory iteration$dir\n"; rmtree ("$path/../iteration$dir") or die $!; } } sub remove_unmapped_contigs{ my $file = shift; my $out = shift; my ($head, $seq,$length); my @split; my ($dropped,$split,$trim)=(0,0,0); open (FASTA,"<$out") or die $!."\ncould not open $file for reading\n"; while (<FASTA>){ chomp; if ($_ =~ /^>/){ $head=$_; }else { if ($_ =~ /X/){ $dropped++; print "drop:\n$head\n$_\n"; }else{ $length=length($_); $_ =~ s/^N+//g; #remove all Ns from the beginning of the sequence $_ =~ s/N+$//g; #remove all Ns from the end of the sequence if ($length != length($_)){ $trim++; } @split=split(/N{3,}/); #split at every occurance of 3 or more Ns if (@split > 1){ print "contig $head has been split into ".scalar @split." sub-contigs\n"; $split++; for (my $i=0; $i<@split; $i++){ # print "$head\_$i\n$split[$i]\n"; $seq.="$head\_$i\n$split[$i]\n"; } }else{ # print "$head\n$_\n"; $seq.="$head\n$_\n"; } } } } close FASTA; if (($dropped) || ($split) || ($trim)){ print "creating backup of $out before changing it\n"; copy $out,$file; open (OUT,">$out") or die $!."\ncould not open $out for writing\n"; print OUT "$seq\n"; close OUT; } if ($dropped){ print "\nremoved $dropped contigs from baitfile because they did not receive any hits\n"; }else{ print "\nno need to remove any sequences from the baitfile\n"; } if (!$split){ print "\nno need to split any contigs\n"; } if ($trim){ print "\n$trim contigs had N's trimmed off their ends\n"; } }