bubio: MITObim_1.8.pl comparison

comparison MITObim_1.8.pl @ 6:a03d23c6ab95 draft

MitoBim and interleave

author	lijing
date	Thu, 02 Nov 2017 12:44:55 -0400
parents
children

comparison

equal deleted inserted replaced

-:7c679db88fa3
+:a03d23c6ab95
+#! /usr/bin/perl
+#
+# MITObim - mitochondrial baiting and iterative mapping
+# wrapper script version 1.8
+# Author: Christoph Hahn, 2012-2016
+# christoph.hahn@nhm.uio.no
+use strict;
+use warnings;
+use Getopt::Long;
+use Cwd qw(abs_path);
+use File::Copy;
+use List::Util qw< min max >;
+use POSIX qw(strftime);
+use POSIX qw(ceil);
+use File::Path 'rmtree';
+my $startiteration = 1;
+my $enditeration = 1;
+my ($quick, $noshow, $help, $strainname, $paired, $mode, $refname, $readpool, $maf, $proofreading, $readlength, $insertsize, $MM, $trim, $trimoverhang, $k_bait, $clean, $clean_interval, $min_contig_cov) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 31, 0, 2, 0);
+my ($miramode, $key, $val, $exit, $current_contiglength, $current_number_of_reads, $current_number_of_contigs);
+my $splitting = 0;
+my $platform = "solexa";
+my $platform_settings;# = "SOLEXA";
+my $shme = "";
+my $trim_off = "";
+my $redirect_temp = "";
+my $NFS_warn_only = "";
+my ($mirapath, $mira, $miraconvert, $mirabait) = ("", "mira", "miraconvert", "mirabait");
+my (@reads, @output, @path, @current_contig_stats, @contiglengths, @number_of_reads, @number_of_contigs);
+my %hash;
+my $PROGRAM = "\nMITObim - mitochondrial baiting and iterative mapping\n";
+my $VERSION = "version 1.8\n";
+my $AUTHOR = "author: Christoph Hahn, (c) 2012-2016\n";
+my $cite = "\nif you found MITObim useful, please cite:
+Hahn C, Bachmann L and Chevreux B. (2013) Reconstructing mitochondrial genomes directly from genomic next-generation sequencing reads -
+a baiting and iterative mapping approach. Nucl. Acids Res. 41(13):e129. doi: 10.1093/nar/gkt371\n\n";
+my $USAGE = 	"\nusage: ./MITObim.pl <parameters>
+	 	\nparameters:
+		-start <int>		iteration to start with, default=1
+		-end <int>		iteration to end with, default=1
+		-sample <string>	sampleID as used in initial MIRA assembly
+		-ref <string>		referenceID as used in initial MIRA assembly
+		-readpool <FILE>	readpool in fastq format (*.gz is also allowed)
+		-maf <FILE>		maf file from previous MIRA assembly
+		\noptional:
+		--quick <FILE>		starts process with initial baiting using provided fasta reference
+		--kbait <int>		set kmer for baiting stringency (default: 31)
+		--platform		specify sequencing platform (default: 'solexa'; other options: 'iontor', '454', 'pacbio')
+		--denovo		runs MIRA in denovo mode (default: mapping)
+		--pair			finds pairs after baiting (relies on /1 and /2 header convention for read pairs) (default: no)
+		--verbose		show detailed output of MIRA modules (default: no)
+		--split			split reference at positions with more than 5N (default: no)
+		--help			shows this helpful information
+		--clean                 retain only the last 2 iteration directories (default: no)
+		--trimreads		trim data (default: no; we recommend to trim beforehand and feed MITObim with pre trimmed data)
+		--trimoverhang		trim overhang up- and downstream of reference (default: no)
+		--missmatch <int>	number of allowed missmatches in mapping - only for illumina data (default: 15% of avg. read length)
+		--min_cov <int>		minimum average coverage of contigs to be retained (default: off)
+		--mirapath <string>     full path to MIRA binaries (only needed if MIRA is not in PATH)
+		--redirect_tmp		redirect temporary output to this location (useful in case you are running MITObim on an NFS mount)
+		--NFS_warn_only		allow MIRA to run on NFS mount without aborting -  warn only (expert option - see MIRA documentation 'check_nfs')
+		\nexamples:
+		./MITObim.pl -start 1 -end 5 -sample StrainX -ref reference-mt -readpool illumina_readpool.fastq -maf initial_assembly.maf
+		./MITObim.pl -end 10 --quick reference.fasta -sample StrainY -ref reference-mt -readpool illumina_readpool.fastq\n\n";
+#		--proofread		applies proofreading (atm only to be used if starting the process from a single short seed reference)
+#		--readlength <int>	read length of illumina library, default=150, relevant only for proofreading
+#		--insert <int>		insert size of illumina library, default=300, relevant only for proofreading
+my $command = $0;
+for (@ARGV){
+$command .= " $_";
+}
+GetOptions (	"start=i" => \$startiteration,
+		"end=i" => \$enditeration,
+		"kbait=i" => \$k_bait,
+		"quick=s" => \$quick,
+		"verbose!" => \$noshow,
+		"sample=s" => \$strainname,
+		"paired" => \$paired,
+		"denovo" => \$mode,
+		"ref=s" => \$refname,
+		"readpool=s" => \$readpool,
+		"help!" => \$help,
+		"clean!" => \$clean,
+		"mirapath=s" => \$mirapath,
+		"maf=s" => \$maf,
+#		"proofreading!" => \$proofreading,
+		"trimreads!" => \$trim,
+		"trimoverhang!" => \$trimoverhang,
+		"missmatch=i" => \$MM,
+		"platform=s" => \$platform,
+#		"readlength=i" => \$readlength,
+#		"insertsize=i" => \$insertsize,
+		"split!"	=>	\$splitting,
+		"min_cov=i"	=>	\$min_contig_cov,
+		"redirect_tmp=s" =>	\$redirect_temp,
+		"NFS_warn_only!" => \$NFS_warn_only) or die "Incorrect usage!\n$USAGE";
+print $PROGRAM;
+print $VERSION;
+print $AUTHOR;
+print $USAGE and exit if $help;
+print $USAGE and exit if !$readpool;
+unless ($quick){
+print $USAGE and exit if !$maf;
+}
+print $USAGE and exit if !$refname;
+$readpool=abs_path($readpool);
+unless (-e $readpool){
+	print "Cant find the readpool. Is the path correct?\n";
+}
+if ($maf){
+	$maf=abs_path($maf);
+	unless (-e $maf){
+		print "Cant find *.maf file. Is the path correct?\n";
+	}
+}
+($platform, $platform_settings) = &set_platform($platform);
+if ($quick){
+	$quick=abs_path($quick);
+unless (-e $quick){
+		print "quick option selected but is the path to the file correct?\n";
+		exit 1;
+	}
+	print "\nquick option selected! -maf option will be ignored (if given)\n";
+	$maf = 0;
+	$startiteration = 0;
+}
+print $USAGE and exit if ($startiteration > $enditeration);
+unless (((-e $maf)||($quick)) && (-e $readpool)){
+print "\nAre readpool AND maf/reference files there?\n";
+exit 1;
+}
+if ($mirapath){
+	if (-e "$mirapath/mira"){
+		print "found executables in the path specified by the user - good!\n";
+		$mira = "$mirapath/mira";
+		$miraconvert = "$mirapath/miraconvert";
+		$mirabait = "$mirapath/mirabait";
+	}else{
+		print "somethings wrong with the path to mira.\n";
+		exit 1;
+	}
+}
+##if not given otherwise, readlength and insertsize are set to default. automatic readlength and insertsize detection will be implemented in time.
+#if (!$readlength){
+#	$readlength = 150;
+#}
+#if (!$insertsize){
+#	$insertsize = 300;
+#}
+if (!$mode){
+	$miramode = "mapping";
+}else {
+	$miramode = "denovo";
+}
+if (!$trim){
+	$trim_off = "\"--noclipping -CL:pec=no\"";
+}
+my $out_command = $command; $out_command =~ s/\S+galaxy\///g;
+print "\nFull command run:\n$out_command\n";
+print "\nAll paramters seem to make sense:\n";
+print "startiteration: $startiteration\n";
+print "enditeration: $enditeration\n";
+print "sample: $strainname\n";
+print "refname: $refname\n";
+my $out_readpool = $readpool; $out_readpool =~ s/.*galaxy\///;
+print "readpool: $out_readpool\n";
+print "maf: $maf\n";
+my $out_quick = $quick; $out_quick =~ s/.*galaxy\///;
+print "quick: $out_quick\n";
+print "paired: $paired (off=0, on=1)\n";
+print "assembly mode: $mode (mapping=0, denovo=1)\n";
+print "verbose: $noshow (off=0, on=1)\n";
+print "split: $splitting (off=0, on=1)\n";
+print "minimum avg. coverage: $min_contig_cov (off=0)\n";
+print "clean: $clean (off=0, on=1)\n";
+print "trim reads: $trim (off=0, on=1)\n";
+print "trim overhang: $trimoverhang (no=0, yes=1)\n";
+print "platform: $platform\n";
+print "kmer baiting: $k_bait\n";
+#print "proofread: $proofreading (off=0, on=1)\n";
+if ($proofreading){
+	print "\nproofreading is not yet enabled in this beta version of MITObim 1.8 - it is currently being optimized for MIRA4. \nplease refer to MITObim 1.6 if you wish to use this option. Sorry for the inconvenience!\n\n";
+	exit;
+	print "proofreading: on\n";
+	print "readlength: $readlength\n";
+	print "insertsize: $insertsize\n";
+	$MM = 0;
+	print "number of allowed missmatches in proofreading assembly: $MM\n";
+	$shme = "-AL:shme=$MM";
+}elsif ((!$proofreading) && (!$mode) && ($platform eq "solexa")){
+	if ($MM == -1){
+		print "number of missmatches in mapping assembly: default (15% of average read length loaded)\n";
+		$shme = "";
+	}else {
+		print "number of missmatches in mapping assembly: $MM\n";
+		$shme = "-AL:shme=$MM";
+	}
+	print "proofreading: off\n";
+}
+if (!$trimoverhang){
+	$trimoverhang = "-SB:tor=no";
+}else {
+	$trimoverhang = "-SB:tor=yes";
+}
+if ($redirect_temp) {
+	$redirect_temp="-DI:trt=$redirect_temp"
+}
+print "\nStarting MITObim \n";
+my @iteration = ($startiteration .. $enditeration);
+foreach (@iteration){
+	chomp;
+	my $currentiteration = $_;
+	mkdir "iteration$currentiteration" or die "MITObim will not overwrite an existing directory: iteration$currentiteration\n";
+	chdir "iteration$currentiteration" or die $!;
+	print "\n==============\n";
+	print " ITERATION $currentiteration\n";
+	print "==============\n";
+	print strftime("%b %e %H:%M:%S", localtime) . "\n\n";
+#	if (($proofreading) && ($currentiteration != 0)){
+	if ($proofreading){
+		$shme = "-AL:shme=0";
+	}
+	if ($maf){
+		print "\nrecover backbone by running miraconvert on maf file\n\n";
+		if ($currentiteration<2){
+			@output= qx($miraconvert -f maf -t fasta -A "$platform_settings -CO:fnicpst=yes" $maf tmp);
+		}else{
+			@output= qx($miraconvert -f maf -t fasta -y $min_contig_cov -A "$platform_settings -CO:fnicpst=yes" $maf tmp);
+		}
+		$exit = $? >> 8;
+		unless (!$noshow){
+			print "@output\n";
+		}
+		unless ($exit == 0){
+			if (!$noshow){
+				print "@output\n";
+			}
+			print "\nmiraconvert seems to have failed - see detailed output above\n";
+			exit;
+		}
+#		if ( ((($mode) && ($currentiteration > 1)) && (!$quick)) || ((($mode) && ($currentiteration >= 1)) && ($quick)) ){
+#			open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!";
+#		}else{
+		open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!\nIs the sampleID identical as in the initial MIRA assembly?\n";
+#		}
+		open(FH2,">backbone_it$currentiteration\_initial_$refname.fna") or die $!;
+		while (<FH1>) {
+			$_ =~ s/x/N/g;
+			print FH2 $_;
+		}
+		close(FH1);
+		close(FH2);
+		unlink glob ("tmp*");
+	}
+	MIRABAIT:
+	unless ($maf){
+		print "\nquick option baits reads from provided reference in iteration 0\n";
+		copy("$quick", "backbone_it$currentiteration\_initial_$refname.fna") or die "copy failed: $!";
+	}
+	&check_ref_length("backbone_it$currentiteration\_initial_$refname.fna","temp_baitfile.fasta",29800,$k_bait);
+	if ($splitting){
+		&remove_unmapped_contigs("temp_baitfile_backup.fasta","temp_baitfile.fasta");
+		if (-e "temp_baitfile_backup.fasta"){	#this file has only been created if splitting contigs was necessary
+			copy "temp_baitfile.fasta","backbone_it$currentiteration\_initial_$refname.fna";
+		}
+	}
+	print "\nfishing readpool using mirabait (k = $k_bait)\n\n";
+#	@output = (`mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-$refname\_in.$platform 2>&1`);
+	@output = qx($mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-readpool-it$currentiteration);
+	$exit = $? >> 8;
+	unless (!$noshow){
+		print "@output\n";
+	}
+	if (!-s "$strainname-readpool-it$currentiteration.fastq"){
+		print "\nyour readpool does not contain any reads with reasonable match (k = $k_bait) to your reference - Maybe you ll want to different settings or even a different reference?\n\n";
+		exit;
+	}
+	unless ($exit == 0){
+		if (!$noshow){
+			print "@output\n";
+		}
+	        print "\nmirabait seems to have failed - see detailed output above\n";
+	        exit;
+	}
+	FINDPAIRS:
+	unless (!$paired){
+		print "\nfinding pairs for baited reads\n\n";
+		copy ("$strainname-readpool-it$currentiteration.fastq", "$strainname-readpool-it$currentiteration-se.fastq") or die "copy failed: $!";
+		open(FH1,"<$strainname-readpool-it$currentiteration.fastq") or die $!;
+		open(FH2,">list");
+		my $index=1;
+		while (<FH1>) {
+			if ($index % 8 ==1 || $index % 8 ==5) {
+				chomp;
+				$_ =~ s/@//g;
+				($key, $val) = split /\//;
+				$hash{$key} .= exists $hash{$key} ? ",$val" : $val;
+			}
+			$index++;
+		}
+		for (keys %hash){
+			$_ =~ s/$/\/1/g;
+			print FH2 "$_\n";
+			$_ =~ s/1$/2/g;
+	        	print FH2 "$_\n";
+		}
+		close(FH1);
+		close(FH2);
+#		@output = (`miraconvert -f fastq -t fastq -n list $readpool $strainname-$refname\_in.$platform 2>&1`);
+		@output = qx($miraconvert -f fastq -t fastq -n list $readpool $strainname-readpool-it$currentiteration );
+		$exit = $? >> 8;
+		unless (!$noshow){
+			print "@output\n";
+		}
+		unless ($exit == 0){
+			if (!$noshow){
+				print "@output\n";
+			}
+	        	print "\nmiraconvert seems to have failed - see detailed output above\n";
+	        	exit;
+		}
+	}
+	unlink("list");
+	MIRA:
+	print "\nrunning $miramode assembly using MIRA\n\n";
+	&create_manifest($currentiteration,$strainname,$refname,$miramode,$trim_off,$platform_settings,$shme,$paired,$trimoverhang,"$strainname-readpool-it$currentiteration.fastq","backbone_it$currentiteration\_initial_$refname.fna", $redirect_temp, $NFS_warn_only);
+	@output = qx($mira manifest.conf );
+	$exit = $? >> 8;
+	unless (!$noshow){
+		print "@output\n";
+	}
+	unless ($exit == 0){
+		if (!$noshow){
+			print "@output\n";
+		}
+		print "\nMIRA seems to have failed - see detailed output above\n";
+		exit;
+	}
+	if ($redirect_temp) {
+		my $tmp_link = readlink("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp");
+		rmtree ("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp") or die $!;
+		mkdir "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/" or die $1;
+		my @files = glob("$tmp_link/*");
+		for (@files) {
+			move ("$_", "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/") or die $!;
+		}
+		rmtree ("$tmp_link") or die $!;
+	}
+	@path = abs_path;
+push (@path, "/$strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out.maf");
+$maf = join("",@path);
+unless (-e $maf){
+print "maf file is not there \n";
+exit;
+}
+#	$current_contiglength = &get_contig_length("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt");
+#	$current_number_of_reads = (&get_number_of_reads("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt") - 1);
+	@current_contig_stats = &get_contig_stats("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt");
+	if (((scalar @current_contig_stats > 3) || ($current_contig_stats[0] > 1)) && ($proofreading)) {
+		print "assembly consists of more than one contigs - this is atm not permitted in proofreading mode. Sorry!\n\n";
+		exit 1;
+	}
+	PROOFREAD:
+#	if (($proofreading) && ($currentiteration >= 1)){
+	if ($proofreading){
+		print "proofreading option is currently disabled in this beta version of MITObim 1.7 - sorry for the inconvenience!\n\n";
+		exit;
+	}
+	$current_number_of_contigs = shift @current_contig_stats;
+	$current_number_of_reads = shift @current_contig_stats;
+	if (!$mode){ #in mapping assemblies the reference is counted as one read
+		$current_number_of_reads -= $current_number_of_contigs;
+	}
+	push (@number_of_contigs, $current_number_of_contigs);
+	push (@number_of_reads, $current_number_of_reads);
+	print "readpool contains $current_number_of_reads reads\n";
+	print "assembly contains $current_number_of_contigs contig(s)\n";
+	if (scalar @current_contig_stats == 1){
+		print "contig length: $current_contig_stats[0]\n";
+	}elsif (scalar @current_contig_stats == 3){
+		print "min contig length: ".$current_contig_stats[0]." bp\nmax contig length: ".$current_contig_stats[1]." bp\navg contig length: ".sprintf("%.0f", $current_contig_stats[2])." bp\n";
+		print "find details on individual contigs in: ". abs_path . "/$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt\n";
+#		printf("%1f",$current_contig_stats[2])."\n";
+	}else {
+		print "somethings wrong with your contig stats. Sorry!\n";
+		exit 1;
+	}
+if ($clean){
+&clean($clean_interval, $currentiteration);
+}
+#	my $path=abs_path; print "Now at $path\n";
+	system("cp $strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out_AllStrains.unpadded.fasta ../finaloutput.fasta");
+	system("sed -i 's/_bb.*/_mitobim_out_$currentiteration\_iterations/' ../finaloutput.fasta");
+	if ($number_of_reads[-2]){
+		if ($number_of_reads[-2] >= $number_of_reads[-1]){
+			print "\nMITObim has reached a stationary read number after $currentiteration iterations!!\n$cite";
+			print strftime("%b %e %H:%M:%S", localtime) . "\n\n";
+			exit;
+		}
+	}
+	chdir ".." or die "Failed to go to parent directory: $!";
+}
+print "\nsuccessfully completed $enditeration iterations with MITObim! " . strftime("%b %e %H:%M:%S", localtime) . "\n$cite";
+#
+#
+###SUBROUTINES
+#
+#
+#
+sub set_platform{
+	my @platforms = ('solexa', '454', 'iontorrent', 'pacbio');
+	my %pf_settings = ("solexa" => "SOLEXA_SETTINGS", "454" => "454_SETTINGS", "iontorrent" => "IONTOR_SETTINGS", "pacbio" => "PCBIOHQ_SETTINGS -CO:mrpg=5");
+	my $user_setting = shift;
+	my @user_choice;
+	for (@platforms){
+		if ( $_ =~ /^$user_setting/ ){
+			push(@user_choice, $_);
+		}
+	}
+	if (scalar(@user_choice) > 1){
+		print "\nYour platform choice was ambiguous: $user_setting could be: @user_choice - Please try again\n";
+		exit;
+	} elsif (scalar(@user_choice) == 0){
+		print "\nPlease specify a valid sequencing platform - not specifying anything will default into: $platforms[0]\n\navailable options are:\n@platforms\n";
+		exit;
+	}
+	return ($user_choice[0], $pf_settings{$user_choice[0]})
+}
+sub get_contig_stats{
+my $contig = $_[0];
+my @array;
+my @contiglength;
+my (@readnumber, @stats);
+	my $readssum = 0;
+open (CONTIGSTATS,"<$contig") or die $!;
+while (<CONTIGSTATS>){
+unless ($_ =~ /#/){
+@array = split /\t/;
+push (@contiglength, $array[1]);
+push (@readnumber, $array[3]);
+}
+}
+close (CONTIGSTATS);
+	if (scalar @readnumber == 1){
+		push (@stats, (scalar @readnumber, $readnumber[0], $contiglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, length of contig
+	}
+	elsif (scalar @readnumber > 1){
+		$readssum += $_ for @readnumber;
+		my $minlength = min @contiglength;
+		my $maxlength =	max @contiglength;
+		my @avglength = &standard_deviation(@contiglength);
+		push (@stats, (scalar @readnumber, $readssum, $minlength, $maxlength, $avglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, minimal, maximal, avg length of contigs
+	}
+return @stats;
+}
+#sub get_contig_length{
+#	my $contig = $_[0];
+#	my @contiglength;
+#	open (CONTIGSTATS,"<$contig") or die $!;
+#	while (<CONTIGSTATS>){
+#		unless ($_ =~ /#/){
+#			@contigslength = split /\t/;
+#		}
+#	}
+#	close (CONTIGSTATS);
+#	return $contiglength[1];
+#}
+#sub get_number_of_reads{
+#	my $contig = $_[0];
+#	my @contigstats;
+#	open (CONTIGSTATS,"<$contig") or die $!;
+#	while (<CONTIGSTATS>){
+#		unless ($_ =~ /#/){
+#			@contigstats = split /\t/;
+#		}
+#	}
+#	close (CONTIGSTATS);
+#	return $contigstats[3];
+#}
+sub proofread {
+my $zero_MM = $_[0];
+my $readtaglist_FH = $_[1];
+	my $contiglength = $_[2];
+	my $elevated_cov_lower_start = $_[3];
+	my $elevated_cov_lower_end = $_[4];
+	my $elevated_cov_upper_start = $_[5];
+	my $elevated_cov_upper_end = $_[6];
+my $lower_limit = $_[7];
+#        my $lower_limit = 200;
+	my $lower_main_limit = $_[8];
+#	my $lower_main_limit = 500;
+my $upper_limit = $contiglength - $lower_limit;
+my $upper_main_limit = $contiglength - $lower_main_limit;
+	my @readtaglist;
+my $ref;
+my $junk;
+my $current_id;
+my %count;
+my @readid_good;
+my @readid_bad;
+my @reads;
+my @readlist;
+my @readlist_good;
+my @readlist_proofread;
+my @total_readlist;
+my @singleton;
+my @singletons;
+my @taglist;
+my @taglist_line;
+my @readtaglist_lower;
+my @readtaglist_upper;
+my @read_ids_lower;
+my @read_ids_all;
+my @read_ids_upper;
+my %ids =();
+my @unsorted;
+my $min;
+my $max;
+my $tag;
+	print "\nlower limit: $lower_limit\n";
+print "upper limit: $upper_limit\n";
+	print "lower main limit: $lower_main_limit\n";
+	print "upper main limit: $upper_main_limit\n\n";
+open (TAGLIST,"<$readtaglist_FH") or die $!;
+while (<TAGLIST>){
+push (@readtaglist, "$_");
+}
+close (TAGLIST);
+for (@readtaglist){
+@taglist_line = split /\t/;
+unless ($taglist_line[0] =~ /#/){
+$ref = join ("\t", $taglist_line[0], $taglist_line[6]);
+push (@read_ids_all, $ref);
+			if (($taglist_line[1] <= $lower_limit) || ($taglist_line[2] <= $lower_limit)){
+#			if ((($taglist_line[1] <= $lower_limit) || (($taglist_line[1] >= $coverage_limits_lower[0])&&($taglist_line[1] <= $coverage_limits_lower[1]))) || ($taglist_line[2] <= $lower_limit)){
+$ref = join ("\t", $taglist_line[0], $taglist_line[6]);
+push (@read_ids_lower, $ref);
+}elsif (($taglist_line[1] >= $upper_limit) || ($taglist_line[2] >= $upper_limit)){
+$ref = join ("\t", $taglist_line[0], $taglist_line[6]);
+push (@read_ids_upper, $ref);
+}
+}
+}
+%ids = map { $_ => 1 } @read_ids_lower;
+my @unique_lower = keys %ids;
+%ids = map { $_ => 1 } @read_ids_upper;
+my @unique_upper = keys %ids;
+%ids = map { $_ => 1 } @read_ids_all;
+my @unique_all = keys %ids;
+for (@unique_all) {
+my @junk = split /\//;
+push (@reads, $junk[0]);
+@junk = split /\t/;
+push (@total_readlist, $junk[1]);
+}
+map { $count{$_}++ } @reads;
+map {if ($count{$_} == 2){ @readid_good = split /\t/; push(@readlist, "$readid_good[1]");} elsif ($count{$_} == 1) { push(@readid_bad, "$_");}} keys (%count);
+@reads = {};
+undef %count;
+for (@readlist){
+chomp;
+$current_id = $_;
+my @pairs_lower = grep { $_ =~ /$current_id/} @unique_lower;
+my @pairs_upper = grep{ $_ =~ /$current_id/} @unique_upper;
+#                print "good id: $current_id\n";
+my $count_lower = scalar @pairs_lower;
+my $count_upper = scalar @pairs_upper;
+#                print "count lower: $count_lower\n";
+#                print "count upper: $count_upper\n";
+unless ((scalar @pairs_lower == 2) || (scalar @pairs_upper == 2)){
+push (@readlist_good, "$current_id");
+}
+}
+for (@readid_bad){
+chomp;
+@unsorted = ();
+($junk, $current_id) = split (/\t/);
+print "current ID: $current_id\n";
+@singleton = grep { $_ =~ /$current_id/} @total_readlist;
+for (@singleton){
+chomp;
+$tag = $_;
+@taglist = grep { $_ =~ /$tag/} @readtaglist;
+#                        print "taglist: @taglist\n";
+}
+for (@taglist) {
+@taglist_line = split /\t/;
+push(@unsorted, $taglist_line[1], $taglist_line[2]);
+$max = max @unsorted;
+$min = min @unsorted;
+}
+#                print "unsorted: @unsorted\n";
+		print "read mapping from $min to $max\n";
+#                print "min: $min\n";
+#                print "max: $max\n";
+		if ($min <= $lower_limit){
+print "orphan discarded! min<lowerlimit\n----------------------------------------------\n";
+}elsif ($max >= $upper_limit){
+			print "orphan discarded! max>upperlimit\n----------------------------------------------\n";
+		}elsif (($min >= $lower_main_limit) && ($max <= $upper_main_limit)){
+			print "orphan discarded! lower_main_limit<min-max<upper_main_limit\n----------------------------------------------\n";
+		}elsif (($min >= $elevated_cov_lower_start) && ($min <= $elevated_cov_lower_end - ($lower_limit / 2))){
+			print "orphan discarded! increased_coverage_lower_start<min<increased_coverage_lower_end\n----------------------------------------------\n";
+		}elsif (($max >= ($elevated_cov_upper_start + ($lower_limit / 2)))  && ($max <= $elevated_cov_upper_end)){
+			print "orphan discarded! increased_coverage_upper_start<max<increased_coverage_upper_end\n----------------------------------------------\n";
+		}else {
+push(@singletons, "@singleton\n");
+print "orphan resurrected! \n----------------------------------------------\n";
+}
+#                print "contiglength: $contiglength\n";
+}
+for (@singletons){
+my @resurrection = split /\//;
+push (@readlist_good, $resurrection[0]);
+}
+for (@readlist_good){
+$_ =~ s/$/\/1\n/g;
+push(@readlist_proofread, $_);
+$_ =~ s/1$/2/g;
+push(@readlist_proofread, $_);
+}
+return @readlist_proofread;
+}
+sub standard_deviation {
+my(@numbers) = @_;
+#Prevent division by 0 error in case you get junk data
+return undef unless(scalar(@numbers));
+# Step 1, find the mean of the numbers
+my $total1 = 0;
+	foreach my $num (@numbers) {
+		if (!$num){
+			$num = 0;
+		}
+		$total1 += $num;
+}
+my $mean1 = $total1 / (scalar @numbers);
+	push (my @stdev, "$mean1");
+# Step 2, find the mean of the squares of the differences between each number and the mean
+my $total2 = 0;
+foreach my $num (@numbers) {
+		if (!$num){
+			$num = 0;
+		}
+$total2 += ($mean1-$num)**2;
+}
+my $mean2 = $total2 / (scalar @numbers);
+# Step 3, standard deviation is the square root of the above mean
+my $std_dev = sqrt($mean2);
+	push (@stdev, "$std_dev");
+return @stdev;
+}
+sub assess_coverage{
+my $readtaglist_FH = $_[0];
+my @readtaglist;
+my $from =$_[1];
+my $to = $_[2];
+	my $where = $_[3];
+my @taglist_line;
+my @coverage_array_lower;
+my @coverage_array_upper;
+my @read_ids_lower;
+my @read_ids_upper;
+my %ids;
+my @taglist;
+my @unsorted;
+my $min;
+my $max;
+my %coverage;
+my @allnums;
+my @coverage_change_position;
+my @coverage_limits;
+#	print "assessing coverage from position $from to position $to\n";
+	open (TAGLIST,"<$readtaglist_FH") or die $!;
+	while (<TAGLIST>){
+		push (@readtaglist, "$_");
+	}
+	close (TAGLIST);
+for (@readtaglist){
+@taglist_line = split /\t/;
+unless ($taglist_line[0] =~ /#/){
+if ((($taglist_line[1] >= $from) && ($taglist_line[1] <= $to)) || (($taglist_line[2] >= $from) && ($taglist_line[2] <= $to))){
+				push (@coverage_array_lower, "$_");
+push (@read_ids_lower, $taglist_line[6]);
+}
+}
+}
+%ids = map { $_ => 1 } @read_ids_lower;
+my @unique_lower = keys %ids;
+for (@unique_lower){
+my @current_id = $_;
+chomp;
+@unsorted = ();
+for (@current_id){
+my $current_id = $_;
+@taglist = grep { $_ =~ /$current_id/} @coverage_array_lower;
+}
+for (@taglist) {
+@taglist_line = split /\t/;
+push(@unsorted, $taglist_line[1], $taglist_line[2]);
+$max = max @unsorted;
+$min = min @unsorted;
+}
+my @nums = ($min .. $max);
+for (@nums){
+push (@allnums, "$_");
+}
+}
+	%coverage = map { $_ => 0 } @allnums;
+map { $coverage{$_}++ } @allnums;
+open (OUT,">out-$where.csv");
+########## detecting coverage peak
+my $max_cov = 0;
+	my $max_cov_position;
+	my @cumulative_coverage;
+	map { unless (!$coverage{$_}){print OUT "$_,$coverage{$_}\n"; push (@cumulative_coverage, "$coverage{$_}"); if ($coverage{$_} > $max_cov){$max_cov = $coverage{$_}; $max_cov_position = $_; }}} ($from..$to);
+my @average_coverage = &standard_deviation(@cumulative_coverage);
+	my $coverage_factor = $max_cov / $average_coverage[0];
+	open (OUT,">>out-$where.csv");
+	print OUT "\nmaximum coverage is $max_cov at position $max_cov_position\naverge coverage is: $average_coverage[0], sd: $average_coverage[1]\nfactor $coverage_factor\n";
+	close (OUT);
+######### detecting rapid changes in coverage
+for ($from..($to - 10)){
+my $position = $_;
+my $cov = $coverage{$position};
+unless (!$cov){
+my @positions = ();
+push (@positions, "$cov");
+for (1 .. 10){
+my $pos_plus = $position + $_;
+if ($coverage{$pos_plus}){
+push (@positions, "$coverage{$pos_plus}");
+}
+}
+			my @stdev = &standard_deviation(@positions);
+	if ($stdev[1] > 6.0){
+	print "positions ($position): @positions -> stdev: $stdev[1]\n";
+	push (@coverage_change_position, $position);
+	}elsif ($stdev[1] >= 4.5){
+				print "positions ($position): @positions -> stdev: $stdev[1]\n";
+	}
+}
+}
+if (@coverage_change_position){
+print "positions with rapidly changing coverage detected: @coverage_change_position\n";
+my $start = min @coverage_change_position;
+my $end = max @coverage_change_position;
+push (@coverage_limits, "$start", "$end");
+print "set limits from $coverage_limits[0] to $coverage_limits[1]\n";
+}else{
+print "no irregularities in coverage detected\n";
+push (@coverage_limits, "0", "0");
+	return @coverage_limits;
+}
+###### assessing whether coverage peak lies within putative conserved region, if yes accept prediction; if no, reject conserved region
+	if (($coverage_factor >= 1.6) && (($coverage_limits[0] < $max_cov_position) && ( $max_cov_position < $coverage_limits[1]))){
+		print "suspicious coverage peak detected within the predicted limits\n";
+	}else {
+		print "no coverage peak detected within predicted limits - rejecting limits\n";
+		@coverage_limits = ();
+		push (@coverage_limits, "0", "0");
+	}
+	return @coverage_limits;
+}
+sub check_ref_length{
+my $ref=$_[0];
+my $output_filename=$_[1];
+my $critical=$_[2];
+	my $kmer = $_[3];
+my @header;
+my $header_count=0;
+my (@sequence,@temp_output,@final_output);
+my $full_sequence;
+open(REF,"<$ref") or die $!;
+while(<REF>){
+chomp;
+if ($_ =~ /^>/){
+push(@header,$_);
+#                       print "found header:\n$header[$header_count]\n";
+$header_count++;
+#                       print "header count: $header_count\n";
+if (@sequence){
+@temp_output=&finalize_sequence($critical,$header[-2],$kmer,@sequence);
+for (@temp_output){
+push(@final_output,$_);
+}
+}
+undef @sequence;
+}elsif ($_ =~ /[a-zA-Z]/){
+#                       print "found sequence:\n$_\n";
+push(@sequence,$_);
+}
+}
+@temp_output=&finalize_sequence($critical,$header[-1],$kmer,@sequence);
+for (@temp_output){
+push(@final_output,$_);
+}
+#       print "result:\n";
+open (OUT,">$output_filename") or die $!;
+for(@final_output){
+#               print "$_\n";
+print OUT "$_\n";
+}
+close REF;
+close OUT;
+}
+sub finalize_sequence{
+my $critical = shift;
+my $header = shift;
+	my $kmer = shift;
+my $full_sequence=join("",@_);
+my $factor;
+my @output;
+if (!$critical){
+$factor=0;
+}else{
+$factor=ceil(length($full_sequence)/$critical);
+}
+if ($factor == 1){
+push(@output,$header);
+push(@output,$full_sequence);
+}else{ #too long
+print "\nreference is too long for mirabait to be handled in one go -> will be split into sub-sequences\n";
+$header=substr $header, 1;
+for (my $i=0; $i<$factor; $i++){
+unless ((length(substr $full_sequence, $i*$critical, $critical+$kmer)-$kmer)<0){
+push(@output,">sub$i\_" .$header);
+push(@output,substr $full_sequence, $i*$critical, $critical+$kmer);
+}
+}
+}
+return @output;
+}
+sub create_manifest {
+	my ($iter, $sampleID, $refID, $mmode, $trim, $platform, $solexa_missmatches, $pair, $overhang, $reads, $ref, $redirect, $NFS_warn);
+	$iter = $_[0];
+	$sampleID = $_[1];
+	$refID = $_[2];
+	$mmode = $_[3];
+	$trim = $_[4];
+	$platform = $_[5];
+	$solexa_missmatches = $_[6];
+	$pair = $_[7];
+	$overhang = $_[8];
+	$reads = $_[9];
+	$ref = $_[10];
+	$redirect = $_[11];
+	$NFS_warn = $_[12];
+	if ($NFS_warn){
+		$NFS_warn = ":cnfs=warn"
+	}
+	open (MANIFEST,">manifest.conf") or die $!;
+	print MANIFEST "#manifest file for iteration $iter created by MITObim\n\nproject = $sampleID-$refID
+	\njob = genome,$mmode,accurate
+	\nparameters = -NW:mrnl=0$NFS_warn -AS:nop=1 $redirect $overhang $platform $trim -CO:msr=no $solexa_missmatches\n";
+	my @technology = split("_",$platform);
+	#-notraceinfo -
+	if ($mmode eq "mapping"){
+		print MANIFEST "\nreadgroup\nis_reference\ndata = $ref\nstrain = $refID\n";
+	}
+	print MANIFEST "\nreadgroup = reads\ndata = $reads\ntechnology = $technology[0]";
+	if ($pair){
+#		print MANIFEST "\nsegmentplacement = ---> <--- infoonly";
+		print MANIFEST "\nsegmentplacement = ---> <--- exclusion_criterion";
+		print MANIFEST "\ntemplatesize = -1 600 exclusion_criterion";
+	}
+	print MANIFEST "\nstrain = $sampleID\n";
+	close MANIFEST;
+}
+sub clean {
+my $interval = shift;
+my $cur = shift;
+my $dir = $cur-$interval;
+my $path=abs_path;
+if (-d "$path/../iteration$dir"){
+print "\nnow removing directory iteration$dir\n";
+rmtree ("$path/../iteration$dir") or die $!;
+}
+}
+sub remove_unmapped_contigs{
+	my $file = shift;
+	my $out = shift;
+	my ($head, $seq,$length);
+	my @split;
+	my ($dropped,$split,$trim)=(0,0,0);
+	open (FASTA,"<$out") or die $!."\ncould not open $file for reading\n";
+	while (<FASTA>){
+		chomp;
+		if ($_ =~ /^>/){
+			$head=$_;
+		}else {
+			if ($_ =~ /X/){
+				$dropped++;
+				print "drop:\n$head\n$_\n";
+			}else{
+				$length=length($_);
+				$_ =~ s/^N+//g;	#remove all Ns from the beginning of the sequence
+				$_ =~ s/N+$//g;	#remove all Ns from the end of the sequence
+				if ($length != length($_)){
+					$trim++;
+				}
+				@split=split(/N{3,}/);	#split at every occurance of 3 or more Ns
+			if (@split > 1){
+					print "contig $head has been split into ".scalar @split." sub-contigs\n";
+					$split++;
+					for (my $i=0; $i<@split; $i++){
+#						print "$head\_$i\n$split[$i]\n";
+						$seq.="$head\_$i\n$split[$i]\n";
+					}
+				}else{
+#					print "$head\n$_\n";
+					$seq.="$head\n$_\n";
+				}
+			}
+		}
+	}
+	close FASTA;
+	if (($dropped) || ($split) || ($trim)){
+		print "creating backup of $out before changing it\n";
+		copy $out,$file;
+		open (OUT,">$out") or die $!."\ncould not open $out for writing\n";
+		print OUT "$seq\n";
+		close OUT;
+	}
+	if ($dropped){
+		print "\nremoved $dropped contigs from baitfile because they did not receive any hits\n";
+	}else{
+		print "\nno need to remove any sequences from the baitfile\n";
+	}
+	if (!$split){
+		print "\nno need to split any contigs\n";
+	}
+	if ($trim){
+		print "\n$trim contigs had N's trimmed off their ends\n";
+	}
+}

Mercurial > repos > lijing > bubio

comparison MITObim_1.8.pl @ 6:a03d23c6ab95 draft