comparison MITObim_1.8.pl @ 6:a03d23c6ab95 draft

MitoBim and interleave
author lijing
date Thu, 02 Nov 2017 12:44:55 -0400
parents
children
comparison
equal deleted inserted replaced
5:7c679db88fa3 6:a03d23c6ab95
1 #! /usr/bin/perl
2 #
3 # MITObim - mitochondrial baiting and iterative mapping
4 # wrapper script version 1.8
5 # Author: Christoph Hahn, 2012-2016
6 # christoph.hahn@nhm.uio.no
7
8 use strict;
9 use warnings;
10 use Getopt::Long;
11 use Cwd qw(abs_path);
12 use File::Copy;
13 use List::Util qw< min max >;
14 use POSIX qw(strftime);
15 use POSIX qw(ceil);
16 use File::Path 'rmtree';
17
18 my $startiteration = 1;
19 my $enditeration = 1;
20
21 my ($quick, $noshow, $help, $strainname, $paired, $mode, $refname, $readpool, $maf, $proofreading, $readlength, $insertsize, $MM, $trim, $trimoverhang, $k_bait, $clean, $clean_interval, $min_contig_cov) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 31, 0, 2, 0);
22 my ($miramode, $key, $val, $exit, $current_contiglength, $current_number_of_reads, $current_number_of_contigs);
23 my $splitting = 0;
24 my $platform = "solexa";
25 my $platform_settings;# = "SOLEXA";
26 my $shme = "";
27 my $trim_off = "";
28 my $redirect_temp = "";
29 my $NFS_warn_only = "";
30 my ($mirapath, $mira, $miraconvert, $mirabait) = ("", "mira", "miraconvert", "mirabait");
31 my (@reads, @output, @path, @current_contig_stats, @contiglengths, @number_of_reads, @number_of_contigs);
32 my %hash;
33 my $PROGRAM = "\nMITObim - mitochondrial baiting and iterative mapping\n";
34 my $VERSION = "version 1.8\n";
35 my $AUTHOR = "author: Christoph Hahn, (c) 2012-2016\n";
36 my $cite = "\nif you found MITObim useful, please cite:
37 Hahn C, Bachmann L and Chevreux B. (2013) Reconstructing mitochondrial genomes directly from genomic next-generation sequencing reads -
38 a baiting and iterative mapping approach. Nucl. Acids Res. 41(13):e129. doi: 10.1093/nar/gkt371\n\n";
39 my $USAGE = "\nusage: ./MITObim.pl <parameters>
40 \nparameters:
41 -start <int> iteration to start with, default=1
42 -end <int> iteration to end with, default=1
43 -sample <string> sampleID as used in initial MIRA assembly
44 -ref <string> referenceID as used in initial MIRA assembly
45 -readpool <FILE> readpool in fastq format (*.gz is also allowed)
46 -maf <FILE> maf file from previous MIRA assembly
47 \noptional:
48 --quick <FILE> starts process with initial baiting using provided fasta reference
49 --kbait <int> set kmer for baiting stringency (default: 31)
50 --platform specify sequencing platform (default: 'solexa'; other options: 'iontor', '454', 'pacbio')
51 --denovo runs MIRA in denovo mode (default: mapping)
52 --pair finds pairs after baiting (relies on /1 and /2 header convention for read pairs) (default: no)
53 --verbose show detailed output of MIRA modules (default: no)
54 --split split reference at positions with more than 5N (default: no)
55 --help shows this helpful information
56 --clean retain only the last 2 iteration directories (default: no)
57 --trimreads trim data (default: no; we recommend to trim beforehand and feed MITObim with pre trimmed data)
58 --trimoverhang trim overhang up- and downstream of reference (default: no)
59 --missmatch <int> number of allowed missmatches in mapping - only for illumina data (default: 15% of avg. read length)
60 --min_cov <int> minimum average coverage of contigs to be retained (default: off)
61 --mirapath <string> full path to MIRA binaries (only needed if MIRA is not in PATH)
62 --redirect_tmp redirect temporary output to this location (useful in case you are running MITObim on an NFS mount)
63 --NFS_warn_only allow MIRA to run on NFS mount without aborting - warn only (expert option - see MIRA documentation 'check_nfs')
64 \nexamples:
65 ./MITObim.pl -start 1 -end 5 -sample StrainX -ref reference-mt -readpool illumina_readpool.fastq -maf initial_assembly.maf
66 ./MITObim.pl -end 10 --quick reference.fasta -sample StrainY -ref reference-mt -readpool illumina_readpool.fastq\n\n";
67 # --proofread applies proofreading (atm only to be used if starting the process from a single short seed reference)
68 # --readlength <int> read length of illumina library, default=150, relevant only for proofreading
69 # --insert <int> insert size of illumina library, default=300, relevant only for proofreading
70
71 my $command = $0;
72 for (@ARGV){
73 $command .= " $_";
74 }
75
76 GetOptions ( "start=i" => \$startiteration,
77 "end=i" => \$enditeration,
78 "kbait=i" => \$k_bait,
79 "quick=s" => \$quick,
80 "verbose!" => \$noshow,
81 "sample=s" => \$strainname,
82 "paired" => \$paired,
83 "denovo" => \$mode,
84 "ref=s" => \$refname,
85 "readpool=s" => \$readpool,
86 "help!" => \$help,
87 "clean!" => \$clean,
88 "mirapath=s" => \$mirapath,
89 "maf=s" => \$maf,
90 # "proofreading!" => \$proofreading,
91 "trimreads!" => \$trim,
92 "trimoverhang!" => \$trimoverhang,
93 "missmatch=i" => \$MM,
94 "platform=s" => \$platform,
95 # "readlength=i" => \$readlength,
96 # "insertsize=i" => \$insertsize,
97 "split!" => \$splitting,
98 "min_cov=i" => \$min_contig_cov,
99 "redirect_tmp=s" => \$redirect_temp,
100 "NFS_warn_only!" => \$NFS_warn_only) or die "Incorrect usage!\n$USAGE";
101
102
103 print $PROGRAM;
104 print $VERSION;
105 print $AUTHOR;
106
107 print $USAGE and exit if $help;
108 print $USAGE and exit if !$readpool;
109 unless ($quick){
110 print $USAGE and exit if !$maf;
111 }
112 print $USAGE and exit if !$refname;
113
114 $readpool=abs_path($readpool);
115 unless (-e $readpool){
116 print "Cant find the readpool. Is the path correct?\n";
117 }
118 if ($maf){
119 $maf=abs_path($maf);
120 unless (-e $maf){
121 print "Cant find *.maf file. Is the path correct?\n";
122 }
123 }
124
125 ($platform, $platform_settings) = &set_platform($platform);
126
127 if ($quick){
128 $quick=abs_path($quick);
129 unless (-e $quick){
130 print "quick option selected but is the path to the file correct?\n";
131 exit 1;
132 }
133 print "\nquick option selected! -maf option will be ignored (if given)\n";
134 $maf = 0;
135 $startiteration = 0;
136 }
137 print $USAGE and exit if ($startiteration > $enditeration);
138 unless (((-e $maf)||($quick)) && (-e $readpool)){
139 print "\nAre readpool AND maf/reference files there?\n";
140 exit 1;
141 }
142 if ($mirapath){
143 if (-e "$mirapath/mira"){
144 print "found executables in the path specified by the user - good!\n";
145 $mira = "$mirapath/mira";
146 $miraconvert = "$mirapath/miraconvert";
147 $mirabait = "$mirapath/mirabait";
148 }else{
149 print "somethings wrong with the path to mira.\n";
150 exit 1;
151 }
152 }
153
154
155 ##if not given otherwise, readlength and insertsize are set to default. automatic readlength and insertsize detection will be implemented in time.
156 #if (!$readlength){
157 # $readlength = 150;
158 #}
159 #if (!$insertsize){
160 # $insertsize = 300;
161 #}
162 if (!$mode){
163 $miramode = "mapping";
164 }else {
165 $miramode = "denovo";
166 }
167
168 if (!$trim){
169 $trim_off = "\"--noclipping -CL:pec=no\"";
170 }
171
172 my $out_command = $command; $out_command =~ s/\S+galaxy\///g;
173 print "\nFull command run:\n$out_command\n";
174 print "\nAll paramters seem to make sense:\n";
175 print "startiteration: $startiteration\n";
176 print "enditeration: $enditeration\n";
177 print "sample: $strainname\n";
178 print "refname: $refname\n";
179 my $out_readpool = $readpool; $out_readpool =~ s/.*galaxy\///;
180 print "readpool: $out_readpool\n";
181 print "maf: $maf\n";
182 my $out_quick = $quick; $out_quick =~ s/.*galaxy\///;
183 print "quick: $out_quick\n";
184 print "paired: $paired (off=0, on=1)\n";
185 print "assembly mode: $mode (mapping=0, denovo=1)\n";
186 print "verbose: $noshow (off=0, on=1)\n";
187 print "split: $splitting (off=0, on=1)\n";
188 print "minimum avg. coverage: $min_contig_cov (off=0)\n";
189 print "clean: $clean (off=0, on=1)\n";
190 print "trim reads: $trim (off=0, on=1)\n";
191 print "trim overhang: $trimoverhang (no=0, yes=1)\n";
192 print "platform: $platform\n";
193 print "kmer baiting: $k_bait\n";
194 #print "proofread: $proofreading (off=0, on=1)\n";
195
196 if ($proofreading){
197 print "\nproofreading is not yet enabled in this beta version of MITObim 1.8 - it is currently being optimized for MIRA4. \nplease refer to MITObim 1.6 if you wish to use this option. Sorry for the inconvenience!\n\n";
198 exit;
199
200 print "proofreading: on\n";
201 print "readlength: $readlength\n";
202 print "insertsize: $insertsize\n";
203 $MM = 0;
204 print "number of allowed missmatches in proofreading assembly: $MM\n";
205 $shme = "-AL:shme=$MM";
206 }elsif ((!$proofreading) && (!$mode) && ($platform eq "solexa")){
207 if ($MM == -1){
208 print "number of missmatches in mapping assembly: default (15% of average read length loaded)\n";
209 $shme = "";
210 }else {
211 print "number of missmatches in mapping assembly: $MM\n";
212 $shme = "-AL:shme=$MM";
213 }
214 print "proofreading: off\n";
215 }
216
217 if (!$trimoverhang){
218 $trimoverhang = "-SB:tor=no";
219 }else {
220 $trimoverhang = "-SB:tor=yes";
221 }
222
223 if ($redirect_temp) {
224 $redirect_temp="-DI:trt=$redirect_temp"
225 }
226
227 print "\nStarting MITObim \n";
228
229 my @iteration = ($startiteration .. $enditeration);
230 foreach (@iteration){
231 chomp;
232 my $currentiteration = $_;
233 mkdir "iteration$currentiteration" or die "MITObim will not overwrite an existing directory: iteration$currentiteration\n";
234 chdir "iteration$currentiteration" or die $!;
235 print "\n==============\n";
236 print " ITERATION $currentiteration\n";
237 print "==============\n";
238 print strftime("%b %e %H:%M:%S", localtime) . "\n\n";
239 # if (($proofreading) && ($currentiteration != 0)){
240 if ($proofreading){
241 $shme = "-AL:shme=0";
242 }
243
244 if ($maf){
245 print "\nrecover backbone by running miraconvert on maf file\n\n";
246 if ($currentiteration<2){
247 @output= qx($miraconvert -f maf -t fasta -A "$platform_settings -CO:fnicpst=yes" $maf tmp);
248 }else{
249 @output= qx($miraconvert -f maf -t fasta -y $min_contig_cov -A "$platform_settings -CO:fnicpst=yes" $maf tmp);
250 }
251 $exit = $? >> 8;
252 unless (!$noshow){
253 print "@output\n";
254 }
255 unless ($exit == 0){
256 if (!$noshow){
257 print "@output\n";
258 }
259 print "\nmiraconvert seems to have failed - see detailed output above\n";
260 exit;
261 }
262 # if ( ((($mode) && ($currentiteration > 1)) && (!$quick)) || ((($mode) && ($currentiteration >= 1)) && ($quick)) ){
263 # open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!";
264 # }else{
265 open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!\nIs the sampleID identical as in the initial MIRA assembly?\n";
266 # }
267 open(FH2,">backbone_it$currentiteration\_initial_$refname.fna") or die $!;
268 while (<FH1>) {
269 $_ =~ s/x/N/g;
270 print FH2 $_;
271 }
272 close(FH1);
273 close(FH2);
274 unlink glob ("tmp*");
275 }
276 MIRABAIT:
277 unless ($maf){
278 print "\nquick option baits reads from provided reference in iteration 0\n";
279 copy("$quick", "backbone_it$currentiteration\_initial_$refname.fna") or die "copy failed: $!";
280 }
281 &check_ref_length("backbone_it$currentiteration\_initial_$refname.fna","temp_baitfile.fasta",29800,$k_bait);
282
283 if ($splitting){
284 &remove_unmapped_contigs("temp_baitfile_backup.fasta","temp_baitfile.fasta");
285 if (-e "temp_baitfile_backup.fasta"){ #this file has only been created if splitting contigs was necessary
286 copy "temp_baitfile.fasta","backbone_it$currentiteration\_initial_$refname.fna";
287 }
288 }
289 print "\nfishing readpool using mirabait (k = $k_bait)\n\n";
290
291 # @output = (`mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-$refname\_in.$platform 2>&1`);
292 @output = qx($mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-readpool-it$currentiteration);
293 $exit = $? >> 8;
294 unless (!$noshow){
295 print "@output\n";
296 }
297 if (!-s "$strainname-readpool-it$currentiteration.fastq"){
298 print "\nyour readpool does not contain any reads with reasonable match (k = $k_bait) to your reference - Maybe you ll want to different settings or even a different reference?\n\n";
299 exit;
300 }
301 unless ($exit == 0){
302 if (!$noshow){
303 print "@output\n";
304 }
305 print "\nmirabait seems to have failed - see detailed output above\n";
306 exit;
307 }
308
309 FINDPAIRS:
310
311 unless (!$paired){
312 print "\nfinding pairs for baited reads\n\n";
313 copy ("$strainname-readpool-it$currentiteration.fastq", "$strainname-readpool-it$currentiteration-se.fastq") or die "copy failed: $!";
314 open(FH1,"<$strainname-readpool-it$currentiteration.fastq") or die $!;
315 open(FH2,">list");
316 my $index=1;
317 while (<FH1>) {
318 if ($index % 8 ==1 || $index % 8 ==5) {
319 chomp;
320 $_ =~ s/@//g;
321 ($key, $val) = split /\//;
322 $hash{$key} .= exists $hash{$key} ? ",$val" : $val;
323 }
324 $index++;
325 }
326
327 for (keys %hash){
328 $_ =~ s/$/\/1/g;
329 print FH2 "$_\n";
330 $_ =~ s/1$/2/g;
331 print FH2 "$_\n";
332 }
333 close(FH1);
334 close(FH2);
335 # @output = (`miraconvert -f fastq -t fastq -n list $readpool $strainname-$refname\_in.$platform 2>&1`);
336 @output = qx($miraconvert -f fastq -t fastq -n list $readpool $strainname-readpool-it$currentiteration );
337 $exit = $? >> 8;
338 unless (!$noshow){
339 print "@output\n";
340 }
341 unless ($exit == 0){
342 if (!$noshow){
343 print "@output\n";
344 }
345 print "\nmiraconvert seems to have failed - see detailed output above\n";
346 exit;
347 }
348 }
349 unlink("list");
350
351 MIRA:
352 print "\nrunning $miramode assembly using MIRA\n\n";
353 &create_manifest($currentiteration,$strainname,$refname,$miramode,$trim_off,$platform_settings,$shme,$paired,$trimoverhang,"$strainname-readpool-it$currentiteration.fastq","backbone_it$currentiteration\_initial_$refname.fna", $redirect_temp, $NFS_warn_only);
354 @output = qx($mira manifest.conf );
355
356 $exit = $? >> 8;
357 unless (!$noshow){
358 print "@output\n";
359 }
360 unless ($exit == 0){
361 if (!$noshow){
362 print "@output\n";
363 }
364 print "\nMIRA seems to have failed - see detailed output above\n";
365 exit;
366 }
367
368 if ($redirect_temp) {
369 my $tmp_link = readlink("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp");
370 rmtree ("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp") or die $!;
371 mkdir "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/" or die $1;
372 my @files = glob("$tmp_link/*");
373 for (@files) {
374 move ("$_", "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/") or die $!;
375 }
376 rmtree ("$tmp_link") or die $!;
377 }
378
379 @path = abs_path;
380 push (@path, "/$strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out.maf");
381 $maf = join("",@path);
382 unless (-e $maf){
383 print "maf file is not there \n";
384 exit;
385 }
386
387 # $current_contiglength = &get_contig_length("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt");
388 # $current_number_of_reads = (&get_number_of_reads("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt") - 1);
389
390 @current_contig_stats = &get_contig_stats("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt");
391 if (((scalar @current_contig_stats > 3) || ($current_contig_stats[0] > 1)) && ($proofreading)) {
392 print "assembly consists of more than one contigs - this is atm not permitted in proofreading mode. Sorry!\n\n";
393 exit 1;
394 }
395
396 PROOFREAD:
397 # if (($proofreading) && ($currentiteration >= 1)){
398 if ($proofreading){
399 print "proofreading option is currently disabled in this beta version of MITObim 1.7 - sorry for the inconvenience!\n\n";
400 exit;
401 }
402
403 $current_number_of_contigs = shift @current_contig_stats;
404 $current_number_of_reads = shift @current_contig_stats;
405 if (!$mode){ #in mapping assemblies the reference is counted as one read
406 $current_number_of_reads -= $current_number_of_contigs;
407 }
408 push (@number_of_contigs, $current_number_of_contigs);
409 push (@number_of_reads, $current_number_of_reads);
410 print "readpool contains $current_number_of_reads reads\n";
411 print "assembly contains $current_number_of_contigs contig(s)\n";
412 if (scalar @current_contig_stats == 1){
413 print "contig length: $current_contig_stats[0]\n";
414 }elsif (scalar @current_contig_stats == 3){
415 print "min contig length: ".$current_contig_stats[0]." bp\nmax contig length: ".$current_contig_stats[1]." bp\navg contig length: ".sprintf("%.0f", $current_contig_stats[2])." bp\n";
416 print "find details on individual contigs in: ". abs_path . "/$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt\n";
417 # printf("%1f",$current_contig_stats[2])."\n";
418 }else {
419 print "somethings wrong with your contig stats. Sorry!\n";
420 exit 1;
421 }
422
423 if ($clean){
424 &clean($clean_interval, $currentiteration);
425 }
426 # my $path=abs_path; print "Now at $path\n";
427 system("cp $strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out_AllStrains.unpadded.fasta ../finaloutput.fasta");
428 system("sed -i 's/_bb.*/_mitobim_out_$currentiteration\_iterations/' ../finaloutput.fasta");
429 if ($number_of_reads[-2]){
430 if ($number_of_reads[-2] >= $number_of_reads[-1]){
431 print "\nMITObim has reached a stationary read number after $currentiteration iterations!!\n$cite";
432 print strftime("%b %e %H:%M:%S", localtime) . "\n\n";
433 exit;
434 }
435 }
436 chdir ".." or die "Failed to go to parent directory: $!";
437 }
438 print "\nsuccessfully completed $enditeration iterations with MITObim! " . strftime("%b %e %H:%M:%S", localtime) . "\n$cite";
439
440 #
441 #
442 ###SUBROUTINES
443 #
444 #
445 #
446
447 sub set_platform{
448 my @platforms = ('solexa', '454', 'iontorrent', 'pacbio');
449 my %pf_settings = ("solexa" => "SOLEXA_SETTINGS", "454" => "454_SETTINGS", "iontorrent" => "IONTOR_SETTINGS", "pacbio" => "PCBIOHQ_SETTINGS -CO:mrpg=5");
450 my $user_setting = shift;
451 my @user_choice;
452 for (@platforms){
453 if ( $_ =~ /^$user_setting/ ){
454 push(@user_choice, $_);
455 }
456 }
457
458 if (scalar(@user_choice) > 1){
459 print "\nYour platform choice was ambiguous: $user_setting could be: @user_choice - Please try again\n";
460 exit;
461
462 } elsif (scalar(@user_choice) == 0){
463 print "\nPlease specify a valid sequencing platform - not specifying anything will default into: $platforms[0]\n\navailable options are:\n@platforms\n";
464 exit;
465 }
466
467 return ($user_choice[0], $pf_settings{$user_choice[0]})
468 }
469
470 sub get_contig_stats{
471 my $contig = $_[0];
472 my @array;
473 my @contiglength;
474 my (@readnumber, @stats);
475 my $readssum = 0;
476 open (CONTIGSTATS,"<$contig") or die $!;
477 while (<CONTIGSTATS>){
478 unless ($_ =~ /#/){
479 @array = split /\t/;
480 push (@contiglength, $array[1]);
481 push (@readnumber, $array[3]);
482 }
483 }
484 close (CONTIGSTATS);
485 if (scalar @readnumber == 1){
486 push (@stats, (scalar @readnumber, $readnumber[0], $contiglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, length of contig
487 }
488 elsif (scalar @readnumber > 1){
489 $readssum += $_ for @readnumber;
490 my $minlength = min @contiglength;
491 my $maxlength = max @contiglength;
492 my @avglength = &standard_deviation(@contiglength);
493 push (@stats, (scalar @readnumber, $readssum, $minlength, $maxlength, $avglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, minimal, maximal, avg length of contigs
494 }
495 return @stats;
496 }
497
498
499 #sub get_contig_length{
500 # my $contig = $_[0];
501 # my @contiglength;
502 # open (CONTIGSTATS,"<$contig") or die $!;
503 # while (<CONTIGSTATS>){
504 # unless ($_ =~ /#/){
505 # @contigslength = split /\t/;
506 # }
507 # }
508 # close (CONTIGSTATS);
509 # return $contiglength[1];
510 #}
511
512 #sub get_number_of_reads{
513 # my $contig = $_[0];
514 # my @contigstats;
515 # open (CONTIGSTATS,"<$contig") or die $!;
516 # while (<CONTIGSTATS>){
517 # unless ($_ =~ /#/){
518 # @contigstats = split /\t/;
519 # }
520 # }
521 # close (CONTIGSTATS);
522 # return $contigstats[3];
523 #}
524
525 sub proofread {
526 my $zero_MM = $_[0];
527 my $readtaglist_FH = $_[1];
528 my $contiglength = $_[2];
529 my $elevated_cov_lower_start = $_[3];
530 my $elevated_cov_lower_end = $_[4];
531 my $elevated_cov_upper_start = $_[5];
532 my $elevated_cov_upper_end = $_[6];
533 my $lower_limit = $_[7];
534 # my $lower_limit = 200;
535 my $lower_main_limit = $_[8];
536 # my $lower_main_limit = 500;
537 my $upper_limit = $contiglength - $lower_limit;
538 my $upper_main_limit = $contiglength - $lower_main_limit;
539 my @readtaglist;
540 my $ref;
541 my $junk;
542 my $current_id;
543 my %count;
544 my @readid_good;
545 my @readid_bad;
546 my @reads;
547 my @readlist;
548 my @readlist_good;
549 my @readlist_proofread;
550 my @total_readlist;
551 my @singleton;
552 my @singletons;
553 my @taglist;
554 my @taglist_line;
555 my @readtaglist_lower;
556 my @readtaglist_upper;
557 my @read_ids_lower;
558 my @read_ids_all;
559 my @read_ids_upper;
560 my %ids =();
561 my @unsorted;
562 my $min;
563 my $max;
564 my $tag;
565
566
567 print "\nlower limit: $lower_limit\n";
568 print "upper limit: $upper_limit\n";
569 print "lower main limit: $lower_main_limit\n";
570 print "upper main limit: $upper_main_limit\n\n";
571 open (TAGLIST,"<$readtaglist_FH") or die $!;
572 while (<TAGLIST>){
573 push (@readtaglist, "$_");
574 }
575 close (TAGLIST);
576
577 for (@readtaglist){
578 @taglist_line = split /\t/;
579 unless ($taglist_line[0] =~ /#/){
580 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
581 push (@read_ids_all, $ref);
582
583 if (($taglist_line[1] <= $lower_limit) || ($taglist_line[2] <= $lower_limit)){
584 # if ((($taglist_line[1] <= $lower_limit) || (($taglist_line[1] >= $coverage_limits_lower[0])&&($taglist_line[1] <= $coverage_limits_lower[1]))) || ($taglist_line[2] <= $lower_limit)){
585 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
586 push (@read_ids_lower, $ref);
587 }elsif (($taglist_line[1] >= $upper_limit) || ($taglist_line[2] >= $upper_limit)){
588 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
589 push (@read_ids_upper, $ref);
590 }
591 }
592 }
593 %ids = map { $_ => 1 } @read_ids_lower;
594 my @unique_lower = keys %ids;
595 %ids = map { $_ => 1 } @read_ids_upper;
596 my @unique_upper = keys %ids;
597 %ids = map { $_ => 1 } @read_ids_all;
598 my @unique_all = keys %ids;
599 for (@unique_all) {
600 my @junk = split /\//;
601 push (@reads, $junk[0]);
602 @junk = split /\t/;
603 push (@total_readlist, $junk[1]);
604 }
605
606 map { $count{$_}++ } @reads;
607 map {if ($count{$_} == 2){ @readid_good = split /\t/; push(@readlist, "$readid_good[1]");} elsif ($count{$_} == 1) { push(@readid_bad, "$_");}} keys (%count);
608 @reads = {};
609 undef %count;
610
611 for (@readlist){
612 chomp;
613 $current_id = $_;
614 my @pairs_lower = grep { $_ =~ /$current_id/} @unique_lower;
615 my @pairs_upper = grep{ $_ =~ /$current_id/} @unique_upper;
616 # print "good id: $current_id\n";
617 my $count_lower = scalar @pairs_lower;
618 my $count_upper = scalar @pairs_upper;
619 # print "count lower: $count_lower\n";
620 # print "count upper: $count_upper\n";
621 unless ((scalar @pairs_lower == 2) || (scalar @pairs_upper == 2)){
622 push (@readlist_good, "$current_id");
623 }
624 }
625 for (@readid_bad){
626 chomp;
627 @unsorted = ();
628 ($junk, $current_id) = split (/\t/);
629 print "current ID: $current_id\n";
630 @singleton = grep { $_ =~ /$current_id/} @total_readlist;
631 for (@singleton){
632 chomp;
633 $tag = $_;
634 @taglist = grep { $_ =~ /$tag/} @readtaglist;
635 # print "taglist: @taglist\n";
636 }
637 for (@taglist) {
638 @taglist_line = split /\t/;
639 push(@unsorted, $taglist_line[1], $taglist_line[2]);
640 $max = max @unsorted;
641 $min = min @unsorted;
642 }
643 # print "unsorted: @unsorted\n";
644 print "read mapping from $min to $max\n";
645 # print "min: $min\n";
646 # print "max: $max\n";
647
648 if ($min <= $lower_limit){
649 print "orphan discarded! min<lowerlimit\n----------------------------------------------\n";
650 }elsif ($max >= $upper_limit){
651 print "orphan discarded! max>upperlimit\n----------------------------------------------\n";
652 }elsif (($min >= $lower_main_limit) && ($max <= $upper_main_limit)){
653 print "orphan discarded! lower_main_limit<min-max<upper_main_limit\n----------------------------------------------\n";
654 }elsif (($min >= $elevated_cov_lower_start) && ($min <= $elevated_cov_lower_end - ($lower_limit / 2))){
655 print "orphan discarded! increased_coverage_lower_start<min<increased_coverage_lower_end\n----------------------------------------------\n";
656 }elsif (($max >= ($elevated_cov_upper_start + ($lower_limit / 2))) && ($max <= $elevated_cov_upper_end)){
657 print "orphan discarded! increased_coverage_upper_start<max<increased_coverage_upper_end\n----------------------------------------------\n";
658 }else {
659 push(@singletons, "@singleton\n");
660 print "orphan resurrected! \n----------------------------------------------\n";
661 }
662 # print "contiglength: $contiglength\n";
663 }
664
665 for (@singletons){
666 my @resurrection = split /\//;
667 push (@readlist_good, $resurrection[0]);
668 }
669 for (@readlist_good){
670 $_ =~ s/$/\/1\n/g;
671 push(@readlist_proofread, $_);
672 $_ =~ s/1$/2/g;
673 push(@readlist_proofread, $_);
674 }
675 return @readlist_proofread;
676 }
677
678 sub standard_deviation {
679 my(@numbers) = @_;
680 #Prevent division by 0 error in case you get junk data
681 return undef unless(scalar(@numbers));
682
683 # Step 1, find the mean of the numbers
684 my $total1 = 0;
685 foreach my $num (@numbers) {
686 if (!$num){
687 $num = 0;
688 }
689 $total1 += $num;
690 }
691 my $mean1 = $total1 / (scalar @numbers);
692 push (my @stdev, "$mean1");
693 # Step 2, find the mean of the squares of the differences between each number and the mean
694 my $total2 = 0;
695 foreach my $num (@numbers) {
696 if (!$num){
697 $num = 0;
698 }
699 $total2 += ($mean1-$num)**2;
700 }
701 my $mean2 = $total2 / (scalar @numbers);
702
703 # Step 3, standard deviation is the square root of the above mean
704 my $std_dev = sqrt($mean2);
705 push (@stdev, "$std_dev");
706 return @stdev;
707 }
708
709 sub assess_coverage{
710 my $readtaglist_FH = $_[0];
711 my @readtaglist;
712 my $from =$_[1];
713 my $to = $_[2];
714 my $where = $_[3];
715 my @taglist_line;
716 my @coverage_array_lower;
717 my @coverage_array_upper;
718 my @read_ids_lower;
719 my @read_ids_upper;
720 my %ids;
721 my @taglist;
722 my @unsorted;
723 my $min;
724 my $max;
725 my %coverage;
726 my @allnums;
727 my @coverage_change_position;
728 my @coverage_limits;
729
730 # print "assessing coverage from position $from to position $to\n";
731 open (TAGLIST,"<$readtaglist_FH") or die $!;
732 while (<TAGLIST>){
733 push (@readtaglist, "$_");
734 }
735 close (TAGLIST);
736
737 for (@readtaglist){
738 @taglist_line = split /\t/;
739 unless ($taglist_line[0] =~ /#/){
740 if ((($taglist_line[1] >= $from) && ($taglist_line[1] <= $to)) || (($taglist_line[2] >= $from) && ($taglist_line[2] <= $to))){
741 push (@coverage_array_lower, "$_");
742 push (@read_ids_lower, $taglist_line[6]);
743 }
744 }
745 }
746 %ids = map { $_ => 1 } @read_ids_lower;
747 my @unique_lower = keys %ids;
748 for (@unique_lower){
749 my @current_id = $_;
750 chomp;
751 @unsorted = ();
752 for (@current_id){
753 my $current_id = $_;
754 @taglist = grep { $_ =~ /$current_id/} @coverage_array_lower;
755 }
756 for (@taglist) {
757 @taglist_line = split /\t/;
758 push(@unsorted, $taglist_line[1], $taglist_line[2]);
759 $max = max @unsorted;
760 $min = min @unsorted;
761 }
762 my @nums = ($min .. $max);
763 for (@nums){
764 push (@allnums, "$_");
765 }
766 }
767 %coverage = map { $_ => 0 } @allnums;
768 map { $coverage{$_}++ } @allnums;
769 open (OUT,">out-$where.csv");
770
771 ########## detecting coverage peak
772 my $max_cov = 0;
773 my $max_cov_position;
774 my @cumulative_coverage;
775 map { unless (!$coverage{$_}){print OUT "$_,$coverage{$_}\n"; push (@cumulative_coverage, "$coverage{$_}"); if ($coverage{$_} > $max_cov){$max_cov = $coverage{$_}; $max_cov_position = $_; }}} ($from..$to);
776 my @average_coverage = &standard_deviation(@cumulative_coverage);
777 my $coverage_factor = $max_cov / $average_coverage[0];
778 open (OUT,">>out-$where.csv");
779 print OUT "\nmaximum coverage is $max_cov at position $max_cov_position\naverge coverage is: $average_coverage[0], sd: $average_coverage[1]\nfactor $coverage_factor\n";
780 close (OUT);
781
782 ######### detecting rapid changes in coverage
783 for ($from..($to - 10)){
784 my $position = $_;
785 my $cov = $coverage{$position};
786 unless (!$cov){
787 my @positions = ();
788 push (@positions, "$cov");
789 for (1 .. 10){
790 my $pos_plus = $position + $_;
791 if ($coverage{$pos_plus}){
792 push (@positions, "$coverage{$pos_plus}");
793 }
794 }
795 my @stdev = &standard_deviation(@positions);
796 if ($stdev[1] > 6.0){
797 print "positions ($position): @positions -> stdev: $stdev[1]\n";
798 push (@coverage_change_position, $position);
799 }elsif ($stdev[1] >= 4.5){
800 print "positions ($position): @positions -> stdev: $stdev[1]\n";
801 }
802 }
803 }
804 if (@coverage_change_position){
805 print "positions with rapidly changing coverage detected: @coverage_change_position\n";
806 my $start = min @coverage_change_position;
807 my $end = max @coverage_change_position;
808 push (@coverage_limits, "$start", "$end");
809 print "set limits from $coverage_limits[0] to $coverage_limits[1]\n";
810 }else{
811 print "no irregularities in coverage detected\n";
812 push (@coverage_limits, "0", "0");
813 return @coverage_limits;
814 }
815
816
817 ###### assessing whether coverage peak lies within putative conserved region, if yes accept prediction; if no, reject conserved region
818 if (($coverage_factor >= 1.6) && (($coverage_limits[0] < $max_cov_position) && ( $max_cov_position < $coverage_limits[1]))){
819
820 print "suspicious coverage peak detected within the predicted limits\n";
821 }else {
822 print "no coverage peak detected within predicted limits - rejecting limits\n";
823 @coverage_limits = ();
824 push (@coverage_limits, "0", "0");
825 }
826 return @coverage_limits;
827
828 }
829
830 sub check_ref_length{
831 my $ref=$_[0];
832 my $output_filename=$_[1];
833 my $critical=$_[2];
834 my $kmer = $_[3];
835 my @header;
836 my $header_count=0;
837 my (@sequence,@temp_output,@final_output);
838 my $full_sequence;
839 open(REF,"<$ref") or die $!;
840 while(<REF>){
841 chomp;
842 if ($_ =~ /^>/){
843 push(@header,$_);
844 # print "found header:\n$header[$header_count]\n";
845 $header_count++;
846 # print "header count: $header_count\n";
847 if (@sequence){
848 @temp_output=&finalize_sequence($critical,$header[-2],$kmer,@sequence);
849 for (@temp_output){
850 push(@final_output,$_);
851 }
852 }
853 undef @sequence;
854 }elsif ($_ =~ /[a-zA-Z]/){
855 # print "found sequence:\n$_\n";
856 push(@sequence,$_);
857 }
858 }
859 @temp_output=&finalize_sequence($critical,$header[-1],$kmer,@sequence);
860 for (@temp_output){
861 push(@final_output,$_);
862 }
863 # print "result:\n";
864 open (OUT,">$output_filename") or die $!;
865 for(@final_output){
866 # print "$_\n";
867 print OUT "$_\n";
868 }
869 close REF;
870 close OUT;
871
872 }
873
874 sub finalize_sequence{
875 my $critical = shift;
876 my $header = shift;
877 my $kmer = shift;
878 my $full_sequence=join("",@_);
879 my $factor;
880 my @output;
881 if (!$critical){
882 $factor=0;
883 }else{
884 $factor=ceil(length($full_sequence)/$critical);
885 }
886 if ($factor == 1){
887 push(@output,$header);
888 push(@output,$full_sequence);
889 }else{ #too long
890 print "\nreference is too long for mirabait to be handled in one go -> will be split into sub-sequences\n";
891 $header=substr $header, 1;
892 for (my $i=0; $i<$factor; $i++){
893 unless ((length(substr $full_sequence, $i*$critical, $critical+$kmer)-$kmer)<0){
894 push(@output,">sub$i\_" .$header);
895 push(@output,substr $full_sequence, $i*$critical, $critical+$kmer);
896 }
897 }
898 }
899 return @output;
900 }
901
902 sub create_manifest {
903 my ($iter, $sampleID, $refID, $mmode, $trim, $platform, $solexa_missmatches, $pair, $overhang, $reads, $ref, $redirect, $NFS_warn);
904 $iter = $_[0];
905 $sampleID = $_[1];
906 $refID = $_[2];
907 $mmode = $_[3];
908 $trim = $_[4];
909 $platform = $_[5];
910 $solexa_missmatches = $_[6];
911 $pair = $_[7];
912 $overhang = $_[8];
913 $reads = $_[9];
914 $ref = $_[10];
915 $redirect = $_[11];
916 $NFS_warn = $_[12];
917
918 if ($NFS_warn){
919 $NFS_warn = ":cnfs=warn"
920 }
921
922 open (MANIFEST,">manifest.conf") or die $!;
923 print MANIFEST "#manifest file for iteration $iter created by MITObim\n\nproject = $sampleID-$refID
924 \njob = genome,$mmode,accurate
925 \nparameters = -NW:mrnl=0$NFS_warn -AS:nop=1 $redirect $overhang $platform $trim -CO:msr=no $solexa_missmatches\n";
926 my @technology = split("_",$platform);
927 #-notraceinfo -
928 if ($mmode eq "mapping"){
929 print MANIFEST "\nreadgroup\nis_reference\ndata = $ref\nstrain = $refID\n";
930 }
931 print MANIFEST "\nreadgroup = reads\ndata = $reads\ntechnology = $technology[0]";
932 if ($pair){
933 # print MANIFEST "\nsegmentplacement = ---> <--- infoonly";
934 print MANIFEST "\nsegmentplacement = ---> <--- exclusion_criterion";
935 print MANIFEST "\ntemplatesize = -1 600 exclusion_criterion";
936 }
937 print MANIFEST "\nstrain = $sampleID\n";
938 close MANIFEST;
939 }
940 sub clean {
941 my $interval = shift;
942 my $cur = shift;
943 my $dir = $cur-$interval;
944 my $path=abs_path;
945 if (-d "$path/../iteration$dir"){
946 print "\nnow removing directory iteration$dir\n";
947 rmtree ("$path/../iteration$dir") or die $!;
948 }
949 }
950
951 sub remove_unmapped_contigs{
952 my $file = shift;
953 my $out = shift;
954 my ($head, $seq,$length);
955 my @split;
956 my ($dropped,$split,$trim)=(0,0,0);
957 open (FASTA,"<$out") or die $!."\ncould not open $file for reading\n";
958 while (<FASTA>){
959 chomp;
960 if ($_ =~ /^>/){
961 $head=$_;
962 }else {
963 if ($_ =~ /X/){
964 $dropped++;
965 print "drop:\n$head\n$_\n";
966 }else{
967 $length=length($_);
968 $_ =~ s/^N+//g; #remove all Ns from the beginning of the sequence
969 $_ =~ s/N+$//g; #remove all Ns from the end of the sequence
970 if ($length != length($_)){
971 $trim++;
972 }
973 @split=split(/N{3,}/); #split at every occurance of 3 or more Ns
974 if (@split > 1){
975 print "contig $head has been split into ".scalar @split." sub-contigs\n";
976 $split++;
977 for (my $i=0; $i<@split; $i++){
978 # print "$head\_$i\n$split[$i]\n";
979 $seq.="$head\_$i\n$split[$i]\n";
980 }
981 }else{
982 # print "$head\n$_\n";
983 $seq.="$head\n$_\n";
984 }
985 }
986 }
987 }
988 close FASTA;
989 if (($dropped) || ($split) || ($trim)){
990 print "creating backup of $out before changing it\n";
991 copy $out,$file;
992 open (OUT,">$out") or die $!."\ncould not open $out for writing\n";
993 print OUT "$seq\n";
994 close OUT;
995 }
996
997 if ($dropped){
998 print "\nremoved $dropped contigs from baitfile because they did not receive any hits\n";
999 }else{
1000 print "\nno need to remove any sequences from the baitfile\n";
1001 }
1002
1003 if (!$split){
1004 print "\nno need to split any contigs\n";
1005 }
1006 if ($trim){
1007 print "\n$trim contigs had N's trimmed off their ends\n";
1008 }
1009 }