Mercurial > repos > lijing > bubio
comparison MITObim_1.8.pl @ 6:a03d23c6ab95 draft
MitoBim and interleave
author | lijing |
---|---|
date | Thu, 02 Nov 2017 12:44:55 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:7c679db88fa3 | 6:a03d23c6ab95 |
---|---|
1 #! /usr/bin/perl | |
2 # | |
3 # MITObim - mitochondrial baiting and iterative mapping | |
4 # wrapper script version 1.8 | |
5 # Author: Christoph Hahn, 2012-2016 | |
6 # christoph.hahn@nhm.uio.no | |
7 | |
8 use strict; | |
9 use warnings; | |
10 use Getopt::Long; | |
11 use Cwd qw(abs_path); | |
12 use File::Copy; | |
13 use List::Util qw< min max >; | |
14 use POSIX qw(strftime); | |
15 use POSIX qw(ceil); | |
16 use File::Path 'rmtree'; | |
17 | |
18 my $startiteration = 1; | |
19 my $enditeration = 1; | |
20 | |
21 my ($quick, $noshow, $help, $strainname, $paired, $mode, $refname, $readpool, $maf, $proofreading, $readlength, $insertsize, $MM, $trim, $trimoverhang, $k_bait, $clean, $clean_interval, $min_contig_cov) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 31, 0, 2, 0); | |
22 my ($miramode, $key, $val, $exit, $current_contiglength, $current_number_of_reads, $current_number_of_contigs); | |
23 my $splitting = 0; | |
24 my $platform = "solexa"; | |
25 my $platform_settings;# = "SOLEXA"; | |
26 my $shme = ""; | |
27 my $trim_off = ""; | |
28 my $redirect_temp = ""; | |
29 my $NFS_warn_only = ""; | |
30 my ($mirapath, $mira, $miraconvert, $mirabait) = ("", "mira", "miraconvert", "mirabait"); | |
31 my (@reads, @output, @path, @current_contig_stats, @contiglengths, @number_of_reads, @number_of_contigs); | |
32 my %hash; | |
33 my $PROGRAM = "\nMITObim - mitochondrial baiting and iterative mapping\n"; | |
34 my $VERSION = "version 1.8\n"; | |
35 my $AUTHOR = "author: Christoph Hahn, (c) 2012-2016\n"; | |
36 my $cite = "\nif you found MITObim useful, please cite: | |
37 Hahn C, Bachmann L and Chevreux B. (2013) Reconstructing mitochondrial genomes directly from genomic next-generation sequencing reads - | |
38 a baiting and iterative mapping approach. Nucl. Acids Res. 41(13):e129. doi: 10.1093/nar/gkt371\n\n"; | |
39 my $USAGE = "\nusage: ./MITObim.pl <parameters> | |
40 \nparameters: | |
41 -start <int> iteration to start with, default=1 | |
42 -end <int> iteration to end with, default=1 | |
43 -sample <string> sampleID as used in initial MIRA assembly | |
44 -ref <string> referenceID as used in initial MIRA assembly | |
45 -readpool <FILE> readpool in fastq format (*.gz is also allowed) | |
46 -maf <FILE> maf file from previous MIRA assembly | |
47 \noptional: | |
48 --quick <FILE> starts process with initial baiting using provided fasta reference | |
49 --kbait <int> set kmer for baiting stringency (default: 31) | |
50 --platform specify sequencing platform (default: 'solexa'; other options: 'iontor', '454', 'pacbio') | |
51 --denovo runs MIRA in denovo mode (default: mapping) | |
52 --pair finds pairs after baiting (relies on /1 and /2 header convention for read pairs) (default: no) | |
53 --verbose show detailed output of MIRA modules (default: no) | |
54 --split split reference at positions with more than 5N (default: no) | |
55 --help shows this helpful information | |
56 --clean retain only the last 2 iteration directories (default: no) | |
57 --trimreads trim data (default: no; we recommend to trim beforehand and feed MITObim with pre trimmed data) | |
58 --trimoverhang trim overhang up- and downstream of reference (default: no) | |
59 --missmatch <int> number of allowed missmatches in mapping - only for illumina data (default: 15% of avg. read length) | |
60 --min_cov <int> minimum average coverage of contigs to be retained (default: off) | |
61 --mirapath <string> full path to MIRA binaries (only needed if MIRA is not in PATH) | |
62 --redirect_tmp redirect temporary output to this location (useful in case you are running MITObim on an NFS mount) | |
63 --NFS_warn_only allow MIRA to run on NFS mount without aborting - warn only (expert option - see MIRA documentation 'check_nfs') | |
64 \nexamples: | |
65 ./MITObim.pl -start 1 -end 5 -sample StrainX -ref reference-mt -readpool illumina_readpool.fastq -maf initial_assembly.maf | |
66 ./MITObim.pl -end 10 --quick reference.fasta -sample StrainY -ref reference-mt -readpool illumina_readpool.fastq\n\n"; | |
67 # --proofread applies proofreading (atm only to be used if starting the process from a single short seed reference) | |
68 # --readlength <int> read length of illumina library, default=150, relevant only for proofreading | |
69 # --insert <int> insert size of illumina library, default=300, relevant only for proofreading | |
70 | |
71 my $command = $0; | |
72 for (@ARGV){ | |
73 $command .= " $_"; | |
74 } | |
75 | |
76 GetOptions ( "start=i" => \$startiteration, | |
77 "end=i" => \$enditeration, | |
78 "kbait=i" => \$k_bait, | |
79 "quick=s" => \$quick, | |
80 "verbose!" => \$noshow, | |
81 "sample=s" => \$strainname, | |
82 "paired" => \$paired, | |
83 "denovo" => \$mode, | |
84 "ref=s" => \$refname, | |
85 "readpool=s" => \$readpool, | |
86 "help!" => \$help, | |
87 "clean!" => \$clean, | |
88 "mirapath=s" => \$mirapath, | |
89 "maf=s" => \$maf, | |
90 # "proofreading!" => \$proofreading, | |
91 "trimreads!" => \$trim, | |
92 "trimoverhang!" => \$trimoverhang, | |
93 "missmatch=i" => \$MM, | |
94 "platform=s" => \$platform, | |
95 # "readlength=i" => \$readlength, | |
96 # "insertsize=i" => \$insertsize, | |
97 "split!" => \$splitting, | |
98 "min_cov=i" => \$min_contig_cov, | |
99 "redirect_tmp=s" => \$redirect_temp, | |
100 "NFS_warn_only!" => \$NFS_warn_only) or die "Incorrect usage!\n$USAGE"; | |
101 | |
102 | |
103 print $PROGRAM; | |
104 print $VERSION; | |
105 print $AUTHOR; | |
106 | |
107 print $USAGE and exit if $help; | |
108 print $USAGE and exit if !$readpool; | |
109 unless ($quick){ | |
110 print $USAGE and exit if !$maf; | |
111 } | |
112 print $USAGE and exit if !$refname; | |
113 | |
114 $readpool=abs_path($readpool); | |
115 unless (-e $readpool){ | |
116 print "Cant find the readpool. Is the path correct?\n"; | |
117 } | |
118 if ($maf){ | |
119 $maf=abs_path($maf); | |
120 unless (-e $maf){ | |
121 print "Cant find *.maf file. Is the path correct?\n"; | |
122 } | |
123 } | |
124 | |
125 ($platform, $platform_settings) = &set_platform($platform); | |
126 | |
127 if ($quick){ | |
128 $quick=abs_path($quick); | |
129 unless (-e $quick){ | |
130 print "quick option selected but is the path to the file correct?\n"; | |
131 exit 1; | |
132 } | |
133 print "\nquick option selected! -maf option will be ignored (if given)\n"; | |
134 $maf = 0; | |
135 $startiteration = 0; | |
136 } | |
137 print $USAGE and exit if ($startiteration > $enditeration); | |
138 unless (((-e $maf)||($quick)) && (-e $readpool)){ | |
139 print "\nAre readpool AND maf/reference files there?\n"; | |
140 exit 1; | |
141 } | |
142 if ($mirapath){ | |
143 if (-e "$mirapath/mira"){ | |
144 print "found executables in the path specified by the user - good!\n"; | |
145 $mira = "$mirapath/mira"; | |
146 $miraconvert = "$mirapath/miraconvert"; | |
147 $mirabait = "$mirapath/mirabait"; | |
148 }else{ | |
149 print "somethings wrong with the path to mira.\n"; | |
150 exit 1; | |
151 } | |
152 } | |
153 | |
154 | |
155 ##if not given otherwise, readlength and insertsize are set to default. automatic readlength and insertsize detection will be implemented in time. | |
156 #if (!$readlength){ | |
157 # $readlength = 150; | |
158 #} | |
159 #if (!$insertsize){ | |
160 # $insertsize = 300; | |
161 #} | |
162 if (!$mode){ | |
163 $miramode = "mapping"; | |
164 }else { | |
165 $miramode = "denovo"; | |
166 } | |
167 | |
168 if (!$trim){ | |
169 $trim_off = "\"--noclipping -CL:pec=no\""; | |
170 } | |
171 | |
172 my $out_command = $command; $out_command =~ s/\S+galaxy\///g; | |
173 print "\nFull command run:\n$out_command\n"; | |
174 print "\nAll paramters seem to make sense:\n"; | |
175 print "startiteration: $startiteration\n"; | |
176 print "enditeration: $enditeration\n"; | |
177 print "sample: $strainname\n"; | |
178 print "refname: $refname\n"; | |
179 my $out_readpool = $readpool; $out_readpool =~ s/.*galaxy\///; | |
180 print "readpool: $out_readpool\n"; | |
181 print "maf: $maf\n"; | |
182 my $out_quick = $quick; $out_quick =~ s/.*galaxy\///; | |
183 print "quick: $out_quick\n"; | |
184 print "paired: $paired (off=0, on=1)\n"; | |
185 print "assembly mode: $mode (mapping=0, denovo=1)\n"; | |
186 print "verbose: $noshow (off=0, on=1)\n"; | |
187 print "split: $splitting (off=0, on=1)\n"; | |
188 print "minimum avg. coverage: $min_contig_cov (off=0)\n"; | |
189 print "clean: $clean (off=0, on=1)\n"; | |
190 print "trim reads: $trim (off=0, on=1)\n"; | |
191 print "trim overhang: $trimoverhang (no=0, yes=1)\n"; | |
192 print "platform: $platform\n"; | |
193 print "kmer baiting: $k_bait\n"; | |
194 #print "proofread: $proofreading (off=0, on=1)\n"; | |
195 | |
196 if ($proofreading){ | |
197 print "\nproofreading is not yet enabled in this beta version of MITObim 1.8 - it is currently being optimized for MIRA4. \nplease refer to MITObim 1.6 if you wish to use this option. Sorry for the inconvenience!\n\n"; | |
198 exit; | |
199 | |
200 print "proofreading: on\n"; | |
201 print "readlength: $readlength\n"; | |
202 print "insertsize: $insertsize\n"; | |
203 $MM = 0; | |
204 print "number of allowed missmatches in proofreading assembly: $MM\n"; | |
205 $shme = "-AL:shme=$MM"; | |
206 }elsif ((!$proofreading) && (!$mode) && ($platform eq "solexa")){ | |
207 if ($MM == -1){ | |
208 print "number of missmatches in mapping assembly: default (15% of average read length loaded)\n"; | |
209 $shme = ""; | |
210 }else { | |
211 print "number of missmatches in mapping assembly: $MM\n"; | |
212 $shme = "-AL:shme=$MM"; | |
213 } | |
214 print "proofreading: off\n"; | |
215 } | |
216 | |
217 if (!$trimoverhang){ | |
218 $trimoverhang = "-SB:tor=no"; | |
219 }else { | |
220 $trimoverhang = "-SB:tor=yes"; | |
221 } | |
222 | |
223 if ($redirect_temp) { | |
224 $redirect_temp="-DI:trt=$redirect_temp" | |
225 } | |
226 | |
227 print "\nStarting MITObim \n"; | |
228 | |
229 my @iteration = ($startiteration .. $enditeration); | |
230 foreach (@iteration){ | |
231 chomp; | |
232 my $currentiteration = $_; | |
233 mkdir "iteration$currentiteration" or die "MITObim will not overwrite an existing directory: iteration$currentiteration\n"; | |
234 chdir "iteration$currentiteration" or die $!; | |
235 print "\n==============\n"; | |
236 print " ITERATION $currentiteration\n"; | |
237 print "==============\n"; | |
238 print strftime("%b %e %H:%M:%S", localtime) . "\n\n"; | |
239 # if (($proofreading) && ($currentiteration != 0)){ | |
240 if ($proofreading){ | |
241 $shme = "-AL:shme=0"; | |
242 } | |
243 | |
244 if ($maf){ | |
245 print "\nrecover backbone by running miraconvert on maf file\n\n"; | |
246 if ($currentiteration<2){ | |
247 @output= qx($miraconvert -f maf -t fasta -A "$platform_settings -CO:fnicpst=yes" $maf tmp); | |
248 }else{ | |
249 @output= qx($miraconvert -f maf -t fasta -y $min_contig_cov -A "$platform_settings -CO:fnicpst=yes" $maf tmp); | |
250 } | |
251 $exit = $? >> 8; | |
252 unless (!$noshow){ | |
253 print "@output\n"; | |
254 } | |
255 unless ($exit == 0){ | |
256 if (!$noshow){ | |
257 print "@output\n"; | |
258 } | |
259 print "\nmiraconvert seems to have failed - see detailed output above\n"; | |
260 exit; | |
261 } | |
262 # if ( ((($mode) && ($currentiteration > 1)) && (!$quick)) || ((($mode) && ($currentiteration >= 1)) && ($quick)) ){ | |
263 # open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!"; | |
264 # }else{ | |
265 open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!\nIs the sampleID identical as in the initial MIRA assembly?\n"; | |
266 # } | |
267 open(FH2,">backbone_it$currentiteration\_initial_$refname.fna") or die $!; | |
268 while (<FH1>) { | |
269 $_ =~ s/x/N/g; | |
270 print FH2 $_; | |
271 } | |
272 close(FH1); | |
273 close(FH2); | |
274 unlink glob ("tmp*"); | |
275 } | |
276 MIRABAIT: | |
277 unless ($maf){ | |
278 print "\nquick option baits reads from provided reference in iteration 0\n"; | |
279 copy("$quick", "backbone_it$currentiteration\_initial_$refname.fna") or die "copy failed: $!"; | |
280 } | |
281 &check_ref_length("backbone_it$currentiteration\_initial_$refname.fna","temp_baitfile.fasta",29800,$k_bait); | |
282 | |
283 if ($splitting){ | |
284 &remove_unmapped_contigs("temp_baitfile_backup.fasta","temp_baitfile.fasta"); | |
285 if (-e "temp_baitfile_backup.fasta"){ #this file has only been created if splitting contigs was necessary | |
286 copy "temp_baitfile.fasta","backbone_it$currentiteration\_initial_$refname.fna"; | |
287 } | |
288 } | |
289 print "\nfishing readpool using mirabait (k = $k_bait)\n\n"; | |
290 | |
291 # @output = (`mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-$refname\_in.$platform 2>&1`); | |
292 @output = qx($mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-readpool-it$currentiteration); | |
293 $exit = $? >> 8; | |
294 unless (!$noshow){ | |
295 print "@output\n"; | |
296 } | |
297 if (!-s "$strainname-readpool-it$currentiteration.fastq"){ | |
298 print "\nyour readpool does not contain any reads with reasonable match (k = $k_bait) to your reference - Maybe you ll want to different settings or even a different reference?\n\n"; | |
299 exit; | |
300 } | |
301 unless ($exit == 0){ | |
302 if (!$noshow){ | |
303 print "@output\n"; | |
304 } | |
305 print "\nmirabait seems to have failed - see detailed output above\n"; | |
306 exit; | |
307 } | |
308 | |
309 FINDPAIRS: | |
310 | |
311 unless (!$paired){ | |
312 print "\nfinding pairs for baited reads\n\n"; | |
313 copy ("$strainname-readpool-it$currentiteration.fastq", "$strainname-readpool-it$currentiteration-se.fastq") or die "copy failed: $!"; | |
314 open(FH1,"<$strainname-readpool-it$currentiteration.fastq") or die $!; | |
315 open(FH2,">list"); | |
316 my $index=1; | |
317 while (<FH1>) { | |
318 if ($index % 8 ==1 || $index % 8 ==5) { | |
319 chomp; | |
320 $_ =~ s/@//g; | |
321 ($key, $val) = split /\//; | |
322 $hash{$key} .= exists $hash{$key} ? ",$val" : $val; | |
323 } | |
324 $index++; | |
325 } | |
326 | |
327 for (keys %hash){ | |
328 $_ =~ s/$/\/1/g; | |
329 print FH2 "$_\n"; | |
330 $_ =~ s/1$/2/g; | |
331 print FH2 "$_\n"; | |
332 } | |
333 close(FH1); | |
334 close(FH2); | |
335 # @output = (`miraconvert -f fastq -t fastq -n list $readpool $strainname-$refname\_in.$platform 2>&1`); | |
336 @output = qx($miraconvert -f fastq -t fastq -n list $readpool $strainname-readpool-it$currentiteration ); | |
337 $exit = $? >> 8; | |
338 unless (!$noshow){ | |
339 print "@output\n"; | |
340 } | |
341 unless ($exit == 0){ | |
342 if (!$noshow){ | |
343 print "@output\n"; | |
344 } | |
345 print "\nmiraconvert seems to have failed - see detailed output above\n"; | |
346 exit; | |
347 } | |
348 } | |
349 unlink("list"); | |
350 | |
351 MIRA: | |
352 print "\nrunning $miramode assembly using MIRA\n\n"; | |
353 &create_manifest($currentiteration,$strainname,$refname,$miramode,$trim_off,$platform_settings,$shme,$paired,$trimoverhang,"$strainname-readpool-it$currentiteration.fastq","backbone_it$currentiteration\_initial_$refname.fna", $redirect_temp, $NFS_warn_only); | |
354 @output = qx($mira manifest.conf ); | |
355 | |
356 $exit = $? >> 8; | |
357 unless (!$noshow){ | |
358 print "@output\n"; | |
359 } | |
360 unless ($exit == 0){ | |
361 if (!$noshow){ | |
362 print "@output\n"; | |
363 } | |
364 print "\nMIRA seems to have failed - see detailed output above\n"; | |
365 exit; | |
366 } | |
367 | |
368 if ($redirect_temp) { | |
369 my $tmp_link = readlink("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp"); | |
370 rmtree ("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp") or die $!; | |
371 mkdir "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/" or die $1; | |
372 my @files = glob("$tmp_link/*"); | |
373 for (@files) { | |
374 move ("$_", "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/") or die $!; | |
375 } | |
376 rmtree ("$tmp_link") or die $!; | |
377 } | |
378 | |
379 @path = abs_path; | |
380 push (@path, "/$strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out.maf"); | |
381 $maf = join("",@path); | |
382 unless (-e $maf){ | |
383 print "maf file is not there \n"; | |
384 exit; | |
385 } | |
386 | |
387 # $current_contiglength = &get_contig_length("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt"); | |
388 # $current_number_of_reads = (&get_number_of_reads("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt") - 1); | |
389 | |
390 @current_contig_stats = &get_contig_stats("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt"); | |
391 if (((scalar @current_contig_stats > 3) || ($current_contig_stats[0] > 1)) && ($proofreading)) { | |
392 print "assembly consists of more than one contigs - this is atm not permitted in proofreading mode. Sorry!\n\n"; | |
393 exit 1; | |
394 } | |
395 | |
396 PROOFREAD: | |
397 # if (($proofreading) && ($currentiteration >= 1)){ | |
398 if ($proofreading){ | |
399 print "proofreading option is currently disabled in this beta version of MITObim 1.7 - sorry for the inconvenience!\n\n"; | |
400 exit; | |
401 } | |
402 | |
403 $current_number_of_contigs = shift @current_contig_stats; | |
404 $current_number_of_reads = shift @current_contig_stats; | |
405 if (!$mode){ #in mapping assemblies the reference is counted as one read | |
406 $current_number_of_reads -= $current_number_of_contigs; | |
407 } | |
408 push (@number_of_contigs, $current_number_of_contigs); | |
409 push (@number_of_reads, $current_number_of_reads); | |
410 print "readpool contains $current_number_of_reads reads\n"; | |
411 print "assembly contains $current_number_of_contigs contig(s)\n"; | |
412 if (scalar @current_contig_stats == 1){ | |
413 print "contig length: $current_contig_stats[0]\n"; | |
414 }elsif (scalar @current_contig_stats == 3){ | |
415 print "min contig length: ".$current_contig_stats[0]." bp\nmax contig length: ".$current_contig_stats[1]." bp\navg contig length: ".sprintf("%.0f", $current_contig_stats[2])." bp\n"; | |
416 print "find details on individual contigs in: ". abs_path . "/$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt\n"; | |
417 # printf("%1f",$current_contig_stats[2])."\n"; | |
418 }else { | |
419 print "somethings wrong with your contig stats. Sorry!\n"; | |
420 exit 1; | |
421 } | |
422 | |
423 if ($clean){ | |
424 &clean($clean_interval, $currentiteration); | |
425 } | |
426 # my $path=abs_path; print "Now at $path\n"; | |
427 system("cp $strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out_AllStrains.unpadded.fasta ../finaloutput.fasta"); | |
428 system("sed -i 's/_bb.*/_mitobim_out_$currentiteration\_iterations/' ../finaloutput.fasta"); | |
429 if ($number_of_reads[-2]){ | |
430 if ($number_of_reads[-2] >= $number_of_reads[-1]){ | |
431 print "\nMITObim has reached a stationary read number after $currentiteration iterations!!\n$cite"; | |
432 print strftime("%b %e %H:%M:%S", localtime) . "\n\n"; | |
433 exit; | |
434 } | |
435 } | |
436 chdir ".." or die "Failed to go to parent directory: $!"; | |
437 } | |
438 print "\nsuccessfully completed $enditeration iterations with MITObim! " . strftime("%b %e %H:%M:%S", localtime) . "\n$cite"; | |
439 | |
440 # | |
441 # | |
442 ###SUBROUTINES | |
443 # | |
444 # | |
445 # | |
446 | |
447 sub set_platform{ | |
448 my @platforms = ('solexa', '454', 'iontorrent', 'pacbio'); | |
449 my %pf_settings = ("solexa" => "SOLEXA_SETTINGS", "454" => "454_SETTINGS", "iontorrent" => "IONTOR_SETTINGS", "pacbio" => "PCBIOHQ_SETTINGS -CO:mrpg=5"); | |
450 my $user_setting = shift; | |
451 my @user_choice; | |
452 for (@platforms){ | |
453 if ( $_ =~ /^$user_setting/ ){ | |
454 push(@user_choice, $_); | |
455 } | |
456 } | |
457 | |
458 if (scalar(@user_choice) > 1){ | |
459 print "\nYour platform choice was ambiguous: $user_setting could be: @user_choice - Please try again\n"; | |
460 exit; | |
461 | |
462 } elsif (scalar(@user_choice) == 0){ | |
463 print "\nPlease specify a valid sequencing platform - not specifying anything will default into: $platforms[0]\n\navailable options are:\n@platforms\n"; | |
464 exit; | |
465 } | |
466 | |
467 return ($user_choice[0], $pf_settings{$user_choice[0]}) | |
468 } | |
469 | |
470 sub get_contig_stats{ | |
471 my $contig = $_[0]; | |
472 my @array; | |
473 my @contiglength; | |
474 my (@readnumber, @stats); | |
475 my $readssum = 0; | |
476 open (CONTIGSTATS,"<$contig") or die $!; | |
477 while (<CONTIGSTATS>){ | |
478 unless ($_ =~ /#/){ | |
479 @array = split /\t/; | |
480 push (@contiglength, $array[1]); | |
481 push (@readnumber, $array[3]); | |
482 } | |
483 } | |
484 close (CONTIGSTATS); | |
485 if (scalar @readnumber == 1){ | |
486 push (@stats, (scalar @readnumber, $readnumber[0], $contiglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, length of contig | |
487 } | |
488 elsif (scalar @readnumber > 1){ | |
489 $readssum += $_ for @readnumber; | |
490 my $minlength = min @contiglength; | |
491 my $maxlength = max @contiglength; | |
492 my @avglength = &standard_deviation(@contiglength); | |
493 push (@stats, (scalar @readnumber, $readssum, $minlength, $maxlength, $avglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, minimal, maximal, avg length of contigs | |
494 } | |
495 return @stats; | |
496 } | |
497 | |
498 | |
499 #sub get_contig_length{ | |
500 # my $contig = $_[0]; | |
501 # my @contiglength; | |
502 # open (CONTIGSTATS,"<$contig") or die $!; | |
503 # while (<CONTIGSTATS>){ | |
504 # unless ($_ =~ /#/){ | |
505 # @contigslength = split /\t/; | |
506 # } | |
507 # } | |
508 # close (CONTIGSTATS); | |
509 # return $contiglength[1]; | |
510 #} | |
511 | |
512 #sub get_number_of_reads{ | |
513 # my $contig = $_[0]; | |
514 # my @contigstats; | |
515 # open (CONTIGSTATS,"<$contig") or die $!; | |
516 # while (<CONTIGSTATS>){ | |
517 # unless ($_ =~ /#/){ | |
518 # @contigstats = split /\t/; | |
519 # } | |
520 # } | |
521 # close (CONTIGSTATS); | |
522 # return $contigstats[3]; | |
523 #} | |
524 | |
525 sub proofread { | |
526 my $zero_MM = $_[0]; | |
527 my $readtaglist_FH = $_[1]; | |
528 my $contiglength = $_[2]; | |
529 my $elevated_cov_lower_start = $_[3]; | |
530 my $elevated_cov_lower_end = $_[4]; | |
531 my $elevated_cov_upper_start = $_[5]; | |
532 my $elevated_cov_upper_end = $_[6]; | |
533 my $lower_limit = $_[7]; | |
534 # my $lower_limit = 200; | |
535 my $lower_main_limit = $_[8]; | |
536 # my $lower_main_limit = 500; | |
537 my $upper_limit = $contiglength - $lower_limit; | |
538 my $upper_main_limit = $contiglength - $lower_main_limit; | |
539 my @readtaglist; | |
540 my $ref; | |
541 my $junk; | |
542 my $current_id; | |
543 my %count; | |
544 my @readid_good; | |
545 my @readid_bad; | |
546 my @reads; | |
547 my @readlist; | |
548 my @readlist_good; | |
549 my @readlist_proofread; | |
550 my @total_readlist; | |
551 my @singleton; | |
552 my @singletons; | |
553 my @taglist; | |
554 my @taglist_line; | |
555 my @readtaglist_lower; | |
556 my @readtaglist_upper; | |
557 my @read_ids_lower; | |
558 my @read_ids_all; | |
559 my @read_ids_upper; | |
560 my %ids =(); | |
561 my @unsorted; | |
562 my $min; | |
563 my $max; | |
564 my $tag; | |
565 | |
566 | |
567 print "\nlower limit: $lower_limit\n"; | |
568 print "upper limit: $upper_limit\n"; | |
569 print "lower main limit: $lower_main_limit\n"; | |
570 print "upper main limit: $upper_main_limit\n\n"; | |
571 open (TAGLIST,"<$readtaglist_FH") or die $!; | |
572 while (<TAGLIST>){ | |
573 push (@readtaglist, "$_"); | |
574 } | |
575 close (TAGLIST); | |
576 | |
577 for (@readtaglist){ | |
578 @taglist_line = split /\t/; | |
579 unless ($taglist_line[0] =~ /#/){ | |
580 $ref = join ("\t", $taglist_line[0], $taglist_line[6]); | |
581 push (@read_ids_all, $ref); | |
582 | |
583 if (($taglist_line[1] <= $lower_limit) || ($taglist_line[2] <= $lower_limit)){ | |
584 # if ((($taglist_line[1] <= $lower_limit) || (($taglist_line[1] >= $coverage_limits_lower[0])&&($taglist_line[1] <= $coverage_limits_lower[1]))) || ($taglist_line[2] <= $lower_limit)){ | |
585 $ref = join ("\t", $taglist_line[0], $taglist_line[6]); | |
586 push (@read_ids_lower, $ref); | |
587 }elsif (($taglist_line[1] >= $upper_limit) || ($taglist_line[2] >= $upper_limit)){ | |
588 $ref = join ("\t", $taglist_line[0], $taglist_line[6]); | |
589 push (@read_ids_upper, $ref); | |
590 } | |
591 } | |
592 } | |
593 %ids = map { $_ => 1 } @read_ids_lower; | |
594 my @unique_lower = keys %ids; | |
595 %ids = map { $_ => 1 } @read_ids_upper; | |
596 my @unique_upper = keys %ids; | |
597 %ids = map { $_ => 1 } @read_ids_all; | |
598 my @unique_all = keys %ids; | |
599 for (@unique_all) { | |
600 my @junk = split /\//; | |
601 push (@reads, $junk[0]); | |
602 @junk = split /\t/; | |
603 push (@total_readlist, $junk[1]); | |
604 } | |
605 | |
606 map { $count{$_}++ } @reads; | |
607 map {if ($count{$_} == 2){ @readid_good = split /\t/; push(@readlist, "$readid_good[1]");} elsif ($count{$_} == 1) { push(@readid_bad, "$_");}} keys (%count); | |
608 @reads = {}; | |
609 undef %count; | |
610 | |
611 for (@readlist){ | |
612 chomp; | |
613 $current_id = $_; | |
614 my @pairs_lower = grep { $_ =~ /$current_id/} @unique_lower; | |
615 my @pairs_upper = grep{ $_ =~ /$current_id/} @unique_upper; | |
616 # print "good id: $current_id\n"; | |
617 my $count_lower = scalar @pairs_lower; | |
618 my $count_upper = scalar @pairs_upper; | |
619 # print "count lower: $count_lower\n"; | |
620 # print "count upper: $count_upper\n"; | |
621 unless ((scalar @pairs_lower == 2) || (scalar @pairs_upper == 2)){ | |
622 push (@readlist_good, "$current_id"); | |
623 } | |
624 } | |
625 for (@readid_bad){ | |
626 chomp; | |
627 @unsorted = (); | |
628 ($junk, $current_id) = split (/\t/); | |
629 print "current ID: $current_id\n"; | |
630 @singleton = grep { $_ =~ /$current_id/} @total_readlist; | |
631 for (@singleton){ | |
632 chomp; | |
633 $tag = $_; | |
634 @taglist = grep { $_ =~ /$tag/} @readtaglist; | |
635 # print "taglist: @taglist\n"; | |
636 } | |
637 for (@taglist) { | |
638 @taglist_line = split /\t/; | |
639 push(@unsorted, $taglist_line[1], $taglist_line[2]); | |
640 $max = max @unsorted; | |
641 $min = min @unsorted; | |
642 } | |
643 # print "unsorted: @unsorted\n"; | |
644 print "read mapping from $min to $max\n"; | |
645 # print "min: $min\n"; | |
646 # print "max: $max\n"; | |
647 | |
648 if ($min <= $lower_limit){ | |
649 print "orphan discarded! min<lowerlimit\n----------------------------------------------\n"; | |
650 }elsif ($max >= $upper_limit){ | |
651 print "orphan discarded! max>upperlimit\n----------------------------------------------\n"; | |
652 }elsif (($min >= $lower_main_limit) && ($max <= $upper_main_limit)){ | |
653 print "orphan discarded! lower_main_limit<min-max<upper_main_limit\n----------------------------------------------\n"; | |
654 }elsif (($min >= $elevated_cov_lower_start) && ($min <= $elevated_cov_lower_end - ($lower_limit / 2))){ | |
655 print "orphan discarded! increased_coverage_lower_start<min<increased_coverage_lower_end\n----------------------------------------------\n"; | |
656 }elsif (($max >= ($elevated_cov_upper_start + ($lower_limit / 2))) && ($max <= $elevated_cov_upper_end)){ | |
657 print "orphan discarded! increased_coverage_upper_start<max<increased_coverage_upper_end\n----------------------------------------------\n"; | |
658 }else { | |
659 push(@singletons, "@singleton\n"); | |
660 print "orphan resurrected! \n----------------------------------------------\n"; | |
661 } | |
662 # print "contiglength: $contiglength\n"; | |
663 } | |
664 | |
665 for (@singletons){ | |
666 my @resurrection = split /\//; | |
667 push (@readlist_good, $resurrection[0]); | |
668 } | |
669 for (@readlist_good){ | |
670 $_ =~ s/$/\/1\n/g; | |
671 push(@readlist_proofread, $_); | |
672 $_ =~ s/1$/2/g; | |
673 push(@readlist_proofread, $_); | |
674 } | |
675 return @readlist_proofread; | |
676 } | |
677 | |
678 sub standard_deviation { | |
679 my(@numbers) = @_; | |
680 #Prevent division by 0 error in case you get junk data | |
681 return undef unless(scalar(@numbers)); | |
682 | |
683 # Step 1, find the mean of the numbers | |
684 my $total1 = 0; | |
685 foreach my $num (@numbers) { | |
686 if (!$num){ | |
687 $num = 0; | |
688 } | |
689 $total1 += $num; | |
690 } | |
691 my $mean1 = $total1 / (scalar @numbers); | |
692 push (my @stdev, "$mean1"); | |
693 # Step 2, find the mean of the squares of the differences between each number and the mean | |
694 my $total2 = 0; | |
695 foreach my $num (@numbers) { | |
696 if (!$num){ | |
697 $num = 0; | |
698 } | |
699 $total2 += ($mean1-$num)**2; | |
700 } | |
701 my $mean2 = $total2 / (scalar @numbers); | |
702 | |
703 # Step 3, standard deviation is the square root of the above mean | |
704 my $std_dev = sqrt($mean2); | |
705 push (@stdev, "$std_dev"); | |
706 return @stdev; | |
707 } | |
708 | |
709 sub assess_coverage{ | |
710 my $readtaglist_FH = $_[0]; | |
711 my @readtaglist; | |
712 my $from =$_[1]; | |
713 my $to = $_[2]; | |
714 my $where = $_[3]; | |
715 my @taglist_line; | |
716 my @coverage_array_lower; | |
717 my @coverage_array_upper; | |
718 my @read_ids_lower; | |
719 my @read_ids_upper; | |
720 my %ids; | |
721 my @taglist; | |
722 my @unsorted; | |
723 my $min; | |
724 my $max; | |
725 my %coverage; | |
726 my @allnums; | |
727 my @coverage_change_position; | |
728 my @coverage_limits; | |
729 | |
730 # print "assessing coverage from position $from to position $to\n"; | |
731 open (TAGLIST,"<$readtaglist_FH") or die $!; | |
732 while (<TAGLIST>){ | |
733 push (@readtaglist, "$_"); | |
734 } | |
735 close (TAGLIST); | |
736 | |
737 for (@readtaglist){ | |
738 @taglist_line = split /\t/; | |
739 unless ($taglist_line[0] =~ /#/){ | |
740 if ((($taglist_line[1] >= $from) && ($taglist_line[1] <= $to)) || (($taglist_line[2] >= $from) && ($taglist_line[2] <= $to))){ | |
741 push (@coverage_array_lower, "$_"); | |
742 push (@read_ids_lower, $taglist_line[6]); | |
743 } | |
744 } | |
745 } | |
746 %ids = map { $_ => 1 } @read_ids_lower; | |
747 my @unique_lower = keys %ids; | |
748 for (@unique_lower){ | |
749 my @current_id = $_; | |
750 chomp; | |
751 @unsorted = (); | |
752 for (@current_id){ | |
753 my $current_id = $_; | |
754 @taglist = grep { $_ =~ /$current_id/} @coverage_array_lower; | |
755 } | |
756 for (@taglist) { | |
757 @taglist_line = split /\t/; | |
758 push(@unsorted, $taglist_line[1], $taglist_line[2]); | |
759 $max = max @unsorted; | |
760 $min = min @unsorted; | |
761 } | |
762 my @nums = ($min .. $max); | |
763 for (@nums){ | |
764 push (@allnums, "$_"); | |
765 } | |
766 } | |
767 %coverage = map { $_ => 0 } @allnums; | |
768 map { $coverage{$_}++ } @allnums; | |
769 open (OUT,">out-$where.csv"); | |
770 | |
771 ########## detecting coverage peak | |
772 my $max_cov = 0; | |
773 my $max_cov_position; | |
774 my @cumulative_coverage; | |
775 map { unless (!$coverage{$_}){print OUT "$_,$coverage{$_}\n"; push (@cumulative_coverage, "$coverage{$_}"); if ($coverage{$_} > $max_cov){$max_cov = $coverage{$_}; $max_cov_position = $_; }}} ($from..$to); | |
776 my @average_coverage = &standard_deviation(@cumulative_coverage); | |
777 my $coverage_factor = $max_cov / $average_coverage[0]; | |
778 open (OUT,">>out-$where.csv"); | |
779 print OUT "\nmaximum coverage is $max_cov at position $max_cov_position\naverge coverage is: $average_coverage[0], sd: $average_coverage[1]\nfactor $coverage_factor\n"; | |
780 close (OUT); | |
781 | |
782 ######### detecting rapid changes in coverage | |
783 for ($from..($to - 10)){ | |
784 my $position = $_; | |
785 my $cov = $coverage{$position}; | |
786 unless (!$cov){ | |
787 my @positions = (); | |
788 push (@positions, "$cov"); | |
789 for (1 .. 10){ | |
790 my $pos_plus = $position + $_; | |
791 if ($coverage{$pos_plus}){ | |
792 push (@positions, "$coverage{$pos_plus}"); | |
793 } | |
794 } | |
795 my @stdev = &standard_deviation(@positions); | |
796 if ($stdev[1] > 6.0){ | |
797 print "positions ($position): @positions -> stdev: $stdev[1]\n"; | |
798 push (@coverage_change_position, $position); | |
799 }elsif ($stdev[1] >= 4.5){ | |
800 print "positions ($position): @positions -> stdev: $stdev[1]\n"; | |
801 } | |
802 } | |
803 } | |
804 if (@coverage_change_position){ | |
805 print "positions with rapidly changing coverage detected: @coverage_change_position\n"; | |
806 my $start = min @coverage_change_position; | |
807 my $end = max @coverage_change_position; | |
808 push (@coverage_limits, "$start", "$end"); | |
809 print "set limits from $coverage_limits[0] to $coverage_limits[1]\n"; | |
810 }else{ | |
811 print "no irregularities in coverage detected\n"; | |
812 push (@coverage_limits, "0", "0"); | |
813 return @coverage_limits; | |
814 } | |
815 | |
816 | |
817 ###### assessing whether coverage peak lies within putative conserved region, if yes accept prediction; if no, reject conserved region | |
818 if (($coverage_factor >= 1.6) && (($coverage_limits[0] < $max_cov_position) && ( $max_cov_position < $coverage_limits[1]))){ | |
819 | |
820 print "suspicious coverage peak detected within the predicted limits\n"; | |
821 }else { | |
822 print "no coverage peak detected within predicted limits - rejecting limits\n"; | |
823 @coverage_limits = (); | |
824 push (@coverage_limits, "0", "0"); | |
825 } | |
826 return @coverage_limits; | |
827 | |
828 } | |
829 | |
830 sub check_ref_length{ | |
831 my $ref=$_[0]; | |
832 my $output_filename=$_[1]; | |
833 my $critical=$_[2]; | |
834 my $kmer = $_[3]; | |
835 my @header; | |
836 my $header_count=0; | |
837 my (@sequence,@temp_output,@final_output); | |
838 my $full_sequence; | |
839 open(REF,"<$ref") or die $!; | |
840 while(<REF>){ | |
841 chomp; | |
842 if ($_ =~ /^>/){ | |
843 push(@header,$_); | |
844 # print "found header:\n$header[$header_count]\n"; | |
845 $header_count++; | |
846 # print "header count: $header_count\n"; | |
847 if (@sequence){ | |
848 @temp_output=&finalize_sequence($critical,$header[-2],$kmer,@sequence); | |
849 for (@temp_output){ | |
850 push(@final_output,$_); | |
851 } | |
852 } | |
853 undef @sequence; | |
854 }elsif ($_ =~ /[a-zA-Z]/){ | |
855 # print "found sequence:\n$_\n"; | |
856 push(@sequence,$_); | |
857 } | |
858 } | |
859 @temp_output=&finalize_sequence($critical,$header[-1],$kmer,@sequence); | |
860 for (@temp_output){ | |
861 push(@final_output,$_); | |
862 } | |
863 # print "result:\n"; | |
864 open (OUT,">$output_filename") or die $!; | |
865 for(@final_output){ | |
866 # print "$_\n"; | |
867 print OUT "$_\n"; | |
868 } | |
869 close REF; | |
870 close OUT; | |
871 | |
872 } | |
873 | |
874 sub finalize_sequence{ | |
875 my $critical = shift; | |
876 my $header = shift; | |
877 my $kmer = shift; | |
878 my $full_sequence=join("",@_); | |
879 my $factor; | |
880 my @output; | |
881 if (!$critical){ | |
882 $factor=0; | |
883 }else{ | |
884 $factor=ceil(length($full_sequence)/$critical); | |
885 } | |
886 if ($factor == 1){ | |
887 push(@output,$header); | |
888 push(@output,$full_sequence); | |
889 }else{ #too long | |
890 print "\nreference is too long for mirabait to be handled in one go -> will be split into sub-sequences\n"; | |
891 $header=substr $header, 1; | |
892 for (my $i=0; $i<$factor; $i++){ | |
893 unless ((length(substr $full_sequence, $i*$critical, $critical+$kmer)-$kmer)<0){ | |
894 push(@output,">sub$i\_" .$header); | |
895 push(@output,substr $full_sequence, $i*$critical, $critical+$kmer); | |
896 } | |
897 } | |
898 } | |
899 return @output; | |
900 } | |
901 | |
902 sub create_manifest { | |
903 my ($iter, $sampleID, $refID, $mmode, $trim, $platform, $solexa_missmatches, $pair, $overhang, $reads, $ref, $redirect, $NFS_warn); | |
904 $iter = $_[0]; | |
905 $sampleID = $_[1]; | |
906 $refID = $_[2]; | |
907 $mmode = $_[3]; | |
908 $trim = $_[4]; | |
909 $platform = $_[5]; | |
910 $solexa_missmatches = $_[6]; | |
911 $pair = $_[7]; | |
912 $overhang = $_[8]; | |
913 $reads = $_[9]; | |
914 $ref = $_[10]; | |
915 $redirect = $_[11]; | |
916 $NFS_warn = $_[12]; | |
917 | |
918 if ($NFS_warn){ | |
919 $NFS_warn = ":cnfs=warn" | |
920 } | |
921 | |
922 open (MANIFEST,">manifest.conf") or die $!; | |
923 print MANIFEST "#manifest file for iteration $iter created by MITObim\n\nproject = $sampleID-$refID | |
924 \njob = genome,$mmode,accurate | |
925 \nparameters = -NW:mrnl=0$NFS_warn -AS:nop=1 $redirect $overhang $platform $trim -CO:msr=no $solexa_missmatches\n"; | |
926 my @technology = split("_",$platform); | |
927 #-notraceinfo - | |
928 if ($mmode eq "mapping"){ | |
929 print MANIFEST "\nreadgroup\nis_reference\ndata = $ref\nstrain = $refID\n"; | |
930 } | |
931 print MANIFEST "\nreadgroup = reads\ndata = $reads\ntechnology = $technology[0]"; | |
932 if ($pair){ | |
933 # print MANIFEST "\nsegmentplacement = ---> <--- infoonly"; | |
934 print MANIFEST "\nsegmentplacement = ---> <--- exclusion_criterion"; | |
935 print MANIFEST "\ntemplatesize = -1 600 exclusion_criterion"; | |
936 } | |
937 print MANIFEST "\nstrain = $sampleID\n"; | |
938 close MANIFEST; | |
939 } | |
940 sub clean { | |
941 my $interval = shift; | |
942 my $cur = shift; | |
943 my $dir = $cur-$interval; | |
944 my $path=abs_path; | |
945 if (-d "$path/../iteration$dir"){ | |
946 print "\nnow removing directory iteration$dir\n"; | |
947 rmtree ("$path/../iteration$dir") or die $!; | |
948 } | |
949 } | |
950 | |
951 sub remove_unmapped_contigs{ | |
952 my $file = shift; | |
953 my $out = shift; | |
954 my ($head, $seq,$length); | |
955 my @split; | |
956 my ($dropped,$split,$trim)=(0,0,0); | |
957 open (FASTA,"<$out") or die $!."\ncould not open $file for reading\n"; | |
958 while (<FASTA>){ | |
959 chomp; | |
960 if ($_ =~ /^>/){ | |
961 $head=$_; | |
962 }else { | |
963 if ($_ =~ /X/){ | |
964 $dropped++; | |
965 print "drop:\n$head\n$_\n"; | |
966 }else{ | |
967 $length=length($_); | |
968 $_ =~ s/^N+//g; #remove all Ns from the beginning of the sequence | |
969 $_ =~ s/N+$//g; #remove all Ns from the end of the sequence | |
970 if ($length != length($_)){ | |
971 $trim++; | |
972 } | |
973 @split=split(/N{3,}/); #split at every occurance of 3 or more Ns | |
974 if (@split > 1){ | |
975 print "contig $head has been split into ".scalar @split." sub-contigs\n"; | |
976 $split++; | |
977 for (my $i=0; $i<@split; $i++){ | |
978 # print "$head\_$i\n$split[$i]\n"; | |
979 $seq.="$head\_$i\n$split[$i]\n"; | |
980 } | |
981 }else{ | |
982 # print "$head\n$_\n"; | |
983 $seq.="$head\n$_\n"; | |
984 } | |
985 } | |
986 } | |
987 } | |
988 close FASTA; | |
989 if (($dropped) || ($split) || ($trim)){ | |
990 print "creating backup of $out before changing it\n"; | |
991 copy $out,$file; | |
992 open (OUT,">$out") or die $!."\ncould not open $out for writing\n"; | |
993 print OUT "$seq\n"; | |
994 close OUT; | |
995 } | |
996 | |
997 if ($dropped){ | |
998 print "\nremoved $dropped contigs from baitfile because they did not receive any hits\n"; | |
999 }else{ | |
1000 print "\nno need to remove any sequences from the baitfile\n"; | |
1001 } | |
1002 | |
1003 if (!$split){ | |
1004 print "\nno need to split any contigs\n"; | |
1005 } | |
1006 if ($trim){ | |
1007 print "\n$trim contigs had N's trimmed off their ends\n"; | |
1008 } | |
1009 } |