6
|
1 #! /usr/bin/perl
|
|
2 #
|
|
3 # MITObim - mitochondrial baiting and iterative mapping
|
|
4 # wrapper script version 1.8
|
|
5 # Author: Christoph Hahn, 2012-2016
|
|
6 # christoph.hahn@nhm.uio.no
|
|
7
|
|
8 use strict;
|
|
9 use warnings;
|
|
10 use Getopt::Long;
|
|
11 use Cwd qw(abs_path);
|
|
12 use File::Copy;
|
|
13 use List::Util qw< min max >;
|
|
14 use POSIX qw(strftime);
|
|
15 use POSIX qw(ceil);
|
|
16 use File::Path 'rmtree';
|
|
17
|
|
18 my $startiteration = 1;
|
|
19 my $enditeration = 1;
|
|
20
|
|
21 my ($quick, $noshow, $help, $strainname, $paired, $mode, $refname, $readpool, $maf, $proofreading, $readlength, $insertsize, $MM, $trim, $trimoverhang, $k_bait, $clean, $clean_interval, $min_contig_cov) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 31, 0, 2, 0);
|
|
22 my ($miramode, $key, $val, $exit, $current_contiglength, $current_number_of_reads, $current_number_of_contigs);
|
|
23 my $splitting = 0;
|
|
24 my $platform = "solexa";
|
|
25 my $platform_settings;# = "SOLEXA";
|
|
26 my $shme = "";
|
|
27 my $trim_off = "";
|
|
28 my $redirect_temp = "";
|
|
29 my $NFS_warn_only = "";
|
|
30 my ($mirapath, $mira, $miraconvert, $mirabait) = ("", "mira", "miraconvert", "mirabait");
|
|
31 my (@reads, @output, @path, @current_contig_stats, @contiglengths, @number_of_reads, @number_of_contigs);
|
|
32 my %hash;
|
|
33 my $PROGRAM = "\nMITObim - mitochondrial baiting and iterative mapping\n";
|
|
34 my $VERSION = "version 1.8\n";
|
|
35 my $AUTHOR = "author: Christoph Hahn, (c) 2012-2016\n";
|
|
36 my $cite = "\nif you found MITObim useful, please cite:
|
|
37 Hahn C, Bachmann L and Chevreux B. (2013) Reconstructing mitochondrial genomes directly from genomic next-generation sequencing reads -
|
|
38 a baiting and iterative mapping approach. Nucl. Acids Res. 41(13):e129. doi: 10.1093/nar/gkt371\n\n";
|
|
39 my $USAGE = "\nusage: ./MITObim.pl <parameters>
|
|
40 \nparameters:
|
|
41 -start <int> iteration to start with, default=1
|
|
42 -end <int> iteration to end with, default=1
|
|
43 -sample <string> sampleID as used in initial MIRA assembly
|
|
44 -ref <string> referenceID as used in initial MIRA assembly
|
|
45 -readpool <FILE> readpool in fastq format (*.gz is also allowed)
|
|
46 -maf <FILE> maf file from previous MIRA assembly
|
|
47 \noptional:
|
|
48 --quick <FILE> starts process with initial baiting using provided fasta reference
|
|
49 --kbait <int> set kmer for baiting stringency (default: 31)
|
|
50 --platform specify sequencing platform (default: 'solexa'; other options: 'iontor', '454', 'pacbio')
|
|
51 --denovo runs MIRA in denovo mode (default: mapping)
|
|
52 --pair finds pairs after baiting (relies on /1 and /2 header convention for read pairs) (default: no)
|
|
53 --verbose show detailed output of MIRA modules (default: no)
|
|
54 --split split reference at positions with more than 5N (default: no)
|
|
55 --help shows this helpful information
|
|
56 --clean retain only the last 2 iteration directories (default: no)
|
|
57 --trimreads trim data (default: no; we recommend to trim beforehand and feed MITObim with pre trimmed data)
|
|
58 --trimoverhang trim overhang up- and downstream of reference (default: no)
|
|
59 --missmatch <int> number of allowed missmatches in mapping - only for illumina data (default: 15% of avg. read length)
|
|
60 --min_cov <int> minimum average coverage of contigs to be retained (default: off)
|
|
61 --mirapath <string> full path to MIRA binaries (only needed if MIRA is not in PATH)
|
|
62 --redirect_tmp redirect temporary output to this location (useful in case you are running MITObim on an NFS mount)
|
|
63 --NFS_warn_only allow MIRA to run on NFS mount without aborting - warn only (expert option - see MIRA documentation 'check_nfs')
|
|
64 \nexamples:
|
|
65 ./MITObim.pl -start 1 -end 5 -sample StrainX -ref reference-mt -readpool illumina_readpool.fastq -maf initial_assembly.maf
|
|
66 ./MITObim.pl -end 10 --quick reference.fasta -sample StrainY -ref reference-mt -readpool illumina_readpool.fastq\n\n";
|
|
67 # --proofread applies proofreading (atm only to be used if starting the process from a single short seed reference)
|
|
68 # --readlength <int> read length of illumina library, default=150, relevant only for proofreading
|
|
69 # --insert <int> insert size of illumina library, default=300, relevant only for proofreading
|
|
70
|
|
71 my $command = $0;
|
|
72 for (@ARGV){
|
|
73 $command .= " $_";
|
|
74 }
|
|
75
|
|
76 GetOptions ( "start=i" => \$startiteration,
|
|
77 "end=i" => \$enditeration,
|
|
78 "kbait=i" => \$k_bait,
|
|
79 "quick=s" => \$quick,
|
|
80 "verbose!" => \$noshow,
|
|
81 "sample=s" => \$strainname,
|
|
82 "paired" => \$paired,
|
|
83 "denovo" => \$mode,
|
|
84 "ref=s" => \$refname,
|
|
85 "readpool=s" => \$readpool,
|
|
86 "help!" => \$help,
|
|
87 "clean!" => \$clean,
|
|
88 "mirapath=s" => \$mirapath,
|
|
89 "maf=s" => \$maf,
|
|
90 # "proofreading!" => \$proofreading,
|
|
91 "trimreads!" => \$trim,
|
|
92 "trimoverhang!" => \$trimoverhang,
|
|
93 "missmatch=i" => \$MM,
|
|
94 "platform=s" => \$platform,
|
|
95 # "readlength=i" => \$readlength,
|
|
96 # "insertsize=i" => \$insertsize,
|
|
97 "split!" => \$splitting,
|
|
98 "min_cov=i" => \$min_contig_cov,
|
|
99 "redirect_tmp=s" => \$redirect_temp,
|
|
100 "NFS_warn_only!" => \$NFS_warn_only) or die "Incorrect usage!\n$USAGE";
|
|
101
|
|
102
|
|
103 print $PROGRAM;
|
|
104 print $VERSION;
|
|
105 print $AUTHOR;
|
|
106
|
|
107 print $USAGE and exit if $help;
|
|
108 print $USAGE and exit if !$readpool;
|
|
109 unless ($quick){
|
|
110 print $USAGE and exit if !$maf;
|
|
111 }
|
|
112 print $USAGE and exit if !$refname;
|
|
113
|
|
114 $readpool=abs_path($readpool);
|
|
115 unless (-e $readpool){
|
|
116 print "Cant find the readpool. Is the path correct?\n";
|
|
117 }
|
|
118 if ($maf){
|
|
119 $maf=abs_path($maf);
|
|
120 unless (-e $maf){
|
|
121 print "Cant find *.maf file. Is the path correct?\n";
|
|
122 }
|
|
123 }
|
|
124
|
|
125 ($platform, $platform_settings) = &set_platform($platform);
|
|
126
|
|
127 if ($quick){
|
|
128 $quick=abs_path($quick);
|
|
129 unless (-e $quick){
|
|
130 print "quick option selected but is the path to the file correct?\n";
|
|
131 exit 1;
|
|
132 }
|
|
133 print "\nquick option selected! -maf option will be ignored (if given)\n";
|
|
134 $maf = 0;
|
|
135 $startiteration = 0;
|
|
136 }
|
|
137 print $USAGE and exit if ($startiteration > $enditeration);
|
|
138 unless (((-e $maf)||($quick)) && (-e $readpool)){
|
|
139 print "\nAre readpool AND maf/reference files there?\n";
|
|
140 exit 1;
|
|
141 }
|
|
142 if ($mirapath){
|
|
143 if (-e "$mirapath/mira"){
|
|
144 print "found executables in the path specified by the user - good!\n";
|
|
145 $mira = "$mirapath/mira";
|
|
146 $miraconvert = "$mirapath/miraconvert";
|
|
147 $mirabait = "$mirapath/mirabait";
|
|
148 }else{
|
|
149 print "somethings wrong with the path to mira.\n";
|
|
150 exit 1;
|
|
151 }
|
|
152 }
|
|
153
|
|
154
|
|
155 ##if not given otherwise, readlength and insertsize are set to default. automatic readlength and insertsize detection will be implemented in time.
|
|
156 #if (!$readlength){
|
|
157 # $readlength = 150;
|
|
158 #}
|
|
159 #if (!$insertsize){
|
|
160 # $insertsize = 300;
|
|
161 #}
|
|
162 if (!$mode){
|
|
163 $miramode = "mapping";
|
|
164 }else {
|
|
165 $miramode = "denovo";
|
|
166 }
|
|
167
|
|
168 if (!$trim){
|
|
169 $trim_off = "\"--noclipping -CL:pec=no\"";
|
|
170 }
|
|
171
|
|
172 my $out_command = $command; $out_command =~ s/\S+galaxy\///g;
|
|
173 print "\nFull command run:\n$out_command\n";
|
|
174 print "\nAll paramters seem to make sense:\n";
|
|
175 print "startiteration: $startiteration\n";
|
|
176 print "enditeration: $enditeration\n";
|
|
177 print "sample: $strainname\n";
|
|
178 print "refname: $refname\n";
|
|
179 my $out_readpool = $readpool; $out_readpool =~ s/.*galaxy\///;
|
|
180 print "readpool: $out_readpool\n";
|
|
181 print "maf: $maf\n";
|
|
182 my $out_quick = $quick; $out_quick =~ s/.*galaxy\///;
|
|
183 print "quick: $out_quick\n";
|
|
184 print "paired: $paired (off=0, on=1)\n";
|
|
185 print "assembly mode: $mode (mapping=0, denovo=1)\n";
|
|
186 print "verbose: $noshow (off=0, on=1)\n";
|
|
187 print "split: $splitting (off=0, on=1)\n";
|
|
188 print "minimum avg. coverage: $min_contig_cov (off=0)\n";
|
|
189 print "clean: $clean (off=0, on=1)\n";
|
|
190 print "trim reads: $trim (off=0, on=1)\n";
|
|
191 print "trim overhang: $trimoverhang (no=0, yes=1)\n";
|
|
192 print "platform: $platform\n";
|
|
193 print "kmer baiting: $k_bait\n";
|
|
194 #print "proofread: $proofreading (off=0, on=1)\n";
|
|
195
|
|
196 if ($proofreading){
|
|
197 print "\nproofreading is not yet enabled in this beta version of MITObim 1.8 - it is currently being optimized for MIRA4. \nplease refer to MITObim 1.6 if you wish to use this option. Sorry for the inconvenience!\n\n";
|
|
198 exit;
|
|
199
|
|
200 print "proofreading: on\n";
|
|
201 print "readlength: $readlength\n";
|
|
202 print "insertsize: $insertsize\n";
|
|
203 $MM = 0;
|
|
204 print "number of allowed missmatches in proofreading assembly: $MM\n";
|
|
205 $shme = "-AL:shme=$MM";
|
|
206 }elsif ((!$proofreading) && (!$mode) && ($platform eq "solexa")){
|
|
207 if ($MM == -1){
|
|
208 print "number of missmatches in mapping assembly: default (15% of average read length loaded)\n";
|
|
209 $shme = "";
|
|
210 }else {
|
|
211 print "number of missmatches in mapping assembly: $MM\n";
|
|
212 $shme = "-AL:shme=$MM";
|
|
213 }
|
|
214 print "proofreading: off\n";
|
|
215 }
|
|
216
|
|
217 if (!$trimoverhang){
|
|
218 $trimoverhang = "-SB:tor=no";
|
|
219 }else {
|
|
220 $trimoverhang = "-SB:tor=yes";
|
|
221 }
|
|
222
|
|
223 if ($redirect_temp) {
|
|
224 $redirect_temp="-DI:trt=$redirect_temp"
|
|
225 }
|
|
226
|
|
227 print "\nStarting MITObim \n";
|
|
228
|
|
229 my @iteration = ($startiteration .. $enditeration);
|
|
230 foreach (@iteration){
|
|
231 chomp;
|
|
232 my $currentiteration = $_;
|
|
233 mkdir "iteration$currentiteration" or die "MITObim will not overwrite an existing directory: iteration$currentiteration\n";
|
|
234 chdir "iteration$currentiteration" or die $!;
|
|
235 print "\n==============\n";
|
|
236 print " ITERATION $currentiteration\n";
|
|
237 print "==============\n";
|
|
238 print strftime("%b %e %H:%M:%S", localtime) . "\n\n";
|
|
239 # if (($proofreading) && ($currentiteration != 0)){
|
|
240 if ($proofreading){
|
|
241 $shme = "-AL:shme=0";
|
|
242 }
|
|
243
|
|
244 if ($maf){
|
|
245 print "\nrecover backbone by running miraconvert on maf file\n\n";
|
|
246 if ($currentiteration<2){
|
|
247 @output= qx($miraconvert -f maf -t fasta -A "$platform_settings -CO:fnicpst=yes" $maf tmp);
|
|
248 }else{
|
|
249 @output= qx($miraconvert -f maf -t fasta -y $min_contig_cov -A "$platform_settings -CO:fnicpst=yes" $maf tmp);
|
|
250 }
|
|
251 $exit = $? >> 8;
|
|
252 unless (!$noshow){
|
|
253 print "@output\n";
|
|
254 }
|
|
255 unless ($exit == 0){
|
|
256 if (!$noshow){
|
|
257 print "@output\n";
|
|
258 }
|
|
259 print "\nmiraconvert seems to have failed - see detailed output above\n";
|
|
260 exit;
|
|
261 }
|
|
262 # if ( ((($mode) && ($currentiteration > 1)) && (!$quick)) || ((($mode) && ($currentiteration >= 1)) && ($quick)) ){
|
|
263 # open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!";
|
|
264 # }else{
|
|
265 open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!\nIs the sampleID identical as in the initial MIRA assembly?\n";
|
|
266 # }
|
|
267 open(FH2,">backbone_it$currentiteration\_initial_$refname.fna") or die $!;
|
|
268 while (<FH1>) {
|
|
269 $_ =~ s/x/N/g;
|
|
270 print FH2 $_;
|
|
271 }
|
|
272 close(FH1);
|
|
273 close(FH2);
|
|
274 unlink glob ("tmp*");
|
|
275 }
|
|
276 MIRABAIT:
|
|
277 unless ($maf){
|
|
278 print "\nquick option baits reads from provided reference in iteration 0\n";
|
|
279 copy("$quick", "backbone_it$currentiteration\_initial_$refname.fna") or die "copy failed: $!";
|
|
280 }
|
|
281 &check_ref_length("backbone_it$currentiteration\_initial_$refname.fna","temp_baitfile.fasta",29800,$k_bait);
|
|
282
|
|
283 if ($splitting){
|
|
284 &remove_unmapped_contigs("temp_baitfile_backup.fasta","temp_baitfile.fasta");
|
|
285 if (-e "temp_baitfile_backup.fasta"){ #this file has only been created if splitting contigs was necessary
|
|
286 copy "temp_baitfile.fasta","backbone_it$currentiteration\_initial_$refname.fna";
|
|
287 }
|
|
288 }
|
|
289 print "\nfishing readpool using mirabait (k = $k_bait)\n\n";
|
|
290
|
|
291 # @output = (`mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-$refname\_in.$platform 2>&1`);
|
|
292 @output = qx($mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-readpool-it$currentiteration);
|
|
293 $exit = $? >> 8;
|
|
294 unless (!$noshow){
|
|
295 print "@output\n";
|
|
296 }
|
|
297 if (!-s "$strainname-readpool-it$currentiteration.fastq"){
|
|
298 print "\nyour readpool does not contain any reads with reasonable match (k = $k_bait) to your reference - Maybe you ll want to different settings or even a different reference?\n\n";
|
|
299 exit;
|
|
300 }
|
|
301 unless ($exit == 0){
|
|
302 if (!$noshow){
|
|
303 print "@output\n";
|
|
304 }
|
|
305 print "\nmirabait seems to have failed - see detailed output above\n";
|
|
306 exit;
|
|
307 }
|
|
308
|
|
309 FINDPAIRS:
|
|
310
|
|
311 unless (!$paired){
|
|
312 print "\nfinding pairs for baited reads\n\n";
|
|
313 copy ("$strainname-readpool-it$currentiteration.fastq", "$strainname-readpool-it$currentiteration-se.fastq") or die "copy failed: $!";
|
|
314 open(FH1,"<$strainname-readpool-it$currentiteration.fastq") or die $!;
|
|
315 open(FH2,">list");
|
|
316 my $index=1;
|
|
317 while (<FH1>) {
|
|
318 if ($index % 8 ==1 || $index % 8 ==5) {
|
|
319 chomp;
|
|
320 $_ =~ s/@//g;
|
|
321 ($key, $val) = split /\//;
|
|
322 $hash{$key} .= exists $hash{$key} ? ",$val" : $val;
|
|
323 }
|
|
324 $index++;
|
|
325 }
|
|
326
|
|
327 for (keys %hash){
|
|
328 $_ =~ s/$/\/1/g;
|
|
329 print FH2 "$_\n";
|
|
330 $_ =~ s/1$/2/g;
|
|
331 print FH2 "$_\n";
|
|
332 }
|
|
333 close(FH1);
|
|
334 close(FH2);
|
|
335 # @output = (`miraconvert -f fastq -t fastq -n list $readpool $strainname-$refname\_in.$platform 2>&1`);
|
|
336 @output = qx($miraconvert -f fastq -t fastq -n list $readpool $strainname-readpool-it$currentiteration );
|
|
337 $exit = $? >> 8;
|
|
338 unless (!$noshow){
|
|
339 print "@output\n";
|
|
340 }
|
|
341 unless ($exit == 0){
|
|
342 if (!$noshow){
|
|
343 print "@output\n";
|
|
344 }
|
|
345 print "\nmiraconvert seems to have failed - see detailed output above\n";
|
|
346 exit;
|
|
347 }
|
|
348 }
|
|
349 unlink("list");
|
|
350
|
|
351 MIRA:
|
|
352 print "\nrunning $miramode assembly using MIRA\n\n";
|
|
353 &create_manifest($currentiteration,$strainname,$refname,$miramode,$trim_off,$platform_settings,$shme,$paired,$trimoverhang,"$strainname-readpool-it$currentiteration.fastq","backbone_it$currentiteration\_initial_$refname.fna", $redirect_temp, $NFS_warn_only);
|
|
354 @output = qx($mira manifest.conf );
|
|
355
|
|
356 $exit = $? >> 8;
|
|
357 unless (!$noshow){
|
|
358 print "@output\n";
|
|
359 }
|
|
360 unless ($exit == 0){
|
|
361 if (!$noshow){
|
|
362 print "@output\n";
|
|
363 }
|
|
364 print "\nMIRA seems to have failed - see detailed output above\n";
|
|
365 exit;
|
|
366 }
|
|
367
|
|
368 if ($redirect_temp) {
|
|
369 my $tmp_link = readlink("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp");
|
|
370 rmtree ("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp") or die $!;
|
|
371 mkdir "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/" or die $1;
|
|
372 my @files = glob("$tmp_link/*");
|
|
373 for (@files) {
|
|
374 move ("$_", "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/") or die $!;
|
|
375 }
|
|
376 rmtree ("$tmp_link") or die $!;
|
|
377 }
|
|
378
|
|
379 @path = abs_path;
|
|
380 push (@path, "/$strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out.maf");
|
|
381 $maf = join("",@path);
|
|
382 unless (-e $maf){
|
|
383 print "maf file is not there \n";
|
|
384 exit;
|
|
385 }
|
|
386
|
|
387 # $current_contiglength = &get_contig_length("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt");
|
|
388 # $current_number_of_reads = (&get_number_of_reads("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt") - 1);
|
|
389
|
|
390 @current_contig_stats = &get_contig_stats("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt");
|
|
391 if (((scalar @current_contig_stats > 3) || ($current_contig_stats[0] > 1)) && ($proofreading)) {
|
|
392 print "assembly consists of more than one contigs - this is atm not permitted in proofreading mode. Sorry!\n\n";
|
|
393 exit 1;
|
|
394 }
|
|
395
|
|
396 PROOFREAD:
|
|
397 # if (($proofreading) && ($currentiteration >= 1)){
|
|
398 if ($proofreading){
|
|
399 print "proofreading option is currently disabled in this beta version of MITObim 1.7 - sorry for the inconvenience!\n\n";
|
|
400 exit;
|
|
401 }
|
|
402
|
|
403 $current_number_of_contigs = shift @current_contig_stats;
|
|
404 $current_number_of_reads = shift @current_contig_stats;
|
|
405 if (!$mode){ #in mapping assemblies the reference is counted as one read
|
|
406 $current_number_of_reads -= $current_number_of_contigs;
|
|
407 }
|
|
408 push (@number_of_contigs, $current_number_of_contigs);
|
|
409 push (@number_of_reads, $current_number_of_reads);
|
|
410 print "readpool contains $current_number_of_reads reads\n";
|
|
411 print "assembly contains $current_number_of_contigs contig(s)\n";
|
|
412 if (scalar @current_contig_stats == 1){
|
|
413 print "contig length: $current_contig_stats[0]\n";
|
|
414 }elsif (scalar @current_contig_stats == 3){
|
|
415 print "min contig length: ".$current_contig_stats[0]." bp\nmax contig length: ".$current_contig_stats[1]." bp\navg contig length: ".sprintf("%.0f", $current_contig_stats[2])." bp\n";
|
|
416 print "find details on individual contigs in: ". abs_path . "/$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt\n";
|
|
417 # printf("%1f",$current_contig_stats[2])."\n";
|
|
418 }else {
|
|
419 print "somethings wrong with your contig stats. Sorry!\n";
|
|
420 exit 1;
|
|
421 }
|
|
422
|
|
423 if ($clean){
|
|
424 &clean($clean_interval, $currentiteration);
|
|
425 }
|
|
426 # my $path=abs_path; print "Now at $path\n";
|
|
427 system("cp $strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out_AllStrains.unpadded.fasta ../finaloutput.fasta");
|
|
428 system("sed -i 's/_bb.*/_mitobim_out_$currentiteration\_iterations/' ../finaloutput.fasta");
|
|
429 if ($number_of_reads[-2]){
|
|
430 if ($number_of_reads[-2] >= $number_of_reads[-1]){
|
|
431 print "\nMITObim has reached a stationary read number after $currentiteration iterations!!\n$cite";
|
|
432 print strftime("%b %e %H:%M:%S", localtime) . "\n\n";
|
|
433 exit;
|
|
434 }
|
|
435 }
|
|
436 chdir ".." or die "Failed to go to parent directory: $!";
|
|
437 }
|
|
438 print "\nsuccessfully completed $enditeration iterations with MITObim! " . strftime("%b %e %H:%M:%S", localtime) . "\n$cite";
|
|
439
|
|
440 #
|
|
441 #
|
|
442 ###SUBROUTINES
|
|
443 #
|
|
444 #
|
|
445 #
|
|
446
|
|
447 sub set_platform{
|
|
448 my @platforms = ('solexa', '454', 'iontorrent', 'pacbio');
|
|
449 my %pf_settings = ("solexa" => "SOLEXA_SETTINGS", "454" => "454_SETTINGS", "iontorrent" => "IONTOR_SETTINGS", "pacbio" => "PCBIOHQ_SETTINGS -CO:mrpg=5");
|
|
450 my $user_setting = shift;
|
|
451 my @user_choice;
|
|
452 for (@platforms){
|
|
453 if ( $_ =~ /^$user_setting/ ){
|
|
454 push(@user_choice, $_);
|
|
455 }
|
|
456 }
|
|
457
|
|
458 if (scalar(@user_choice) > 1){
|
|
459 print "\nYour platform choice was ambiguous: $user_setting could be: @user_choice - Please try again\n";
|
|
460 exit;
|
|
461
|
|
462 } elsif (scalar(@user_choice) == 0){
|
|
463 print "\nPlease specify a valid sequencing platform - not specifying anything will default into: $platforms[0]\n\navailable options are:\n@platforms\n";
|
|
464 exit;
|
|
465 }
|
|
466
|
|
467 return ($user_choice[0], $pf_settings{$user_choice[0]})
|
|
468 }
|
|
469
|
|
470 sub get_contig_stats{
|
|
471 my $contig = $_[0];
|
|
472 my @array;
|
|
473 my @contiglength;
|
|
474 my (@readnumber, @stats);
|
|
475 my $readssum = 0;
|
|
476 open (CONTIGSTATS,"<$contig") or die $!;
|
|
477 while (<CONTIGSTATS>){
|
|
478 unless ($_ =~ /#/){
|
|
479 @array = split /\t/;
|
|
480 push (@contiglength, $array[1]);
|
|
481 push (@readnumber, $array[3]);
|
|
482 }
|
|
483 }
|
|
484 close (CONTIGSTATS);
|
|
485 if (scalar @readnumber == 1){
|
|
486 push (@stats, (scalar @readnumber, $readnumber[0], $contiglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, length of contig
|
|
487 }
|
|
488 elsif (scalar @readnumber > 1){
|
|
489 $readssum += $_ for @readnumber;
|
|
490 my $minlength = min @contiglength;
|
|
491 my $maxlength = max @contiglength;
|
|
492 my @avglength = &standard_deviation(@contiglength);
|
|
493 push (@stats, (scalar @readnumber, $readssum, $minlength, $maxlength, $avglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, minimal, maximal, avg length of contigs
|
|
494 }
|
|
495 return @stats;
|
|
496 }
|
|
497
|
|
498
|
|
499 #sub get_contig_length{
|
|
500 # my $contig = $_[0];
|
|
501 # my @contiglength;
|
|
502 # open (CONTIGSTATS,"<$contig") or die $!;
|
|
503 # while (<CONTIGSTATS>){
|
|
504 # unless ($_ =~ /#/){
|
|
505 # @contigslength = split /\t/;
|
|
506 # }
|
|
507 # }
|
|
508 # close (CONTIGSTATS);
|
|
509 # return $contiglength[1];
|
|
510 #}
|
|
511
|
|
512 #sub get_number_of_reads{
|
|
513 # my $contig = $_[0];
|
|
514 # my @contigstats;
|
|
515 # open (CONTIGSTATS,"<$contig") or die $!;
|
|
516 # while (<CONTIGSTATS>){
|
|
517 # unless ($_ =~ /#/){
|
|
518 # @contigstats = split /\t/;
|
|
519 # }
|
|
520 # }
|
|
521 # close (CONTIGSTATS);
|
|
522 # return $contigstats[3];
|
|
523 #}
|
|
524
|
|
525 sub proofread {
|
|
526 my $zero_MM = $_[0];
|
|
527 my $readtaglist_FH = $_[1];
|
|
528 my $contiglength = $_[2];
|
|
529 my $elevated_cov_lower_start = $_[3];
|
|
530 my $elevated_cov_lower_end = $_[4];
|
|
531 my $elevated_cov_upper_start = $_[5];
|
|
532 my $elevated_cov_upper_end = $_[6];
|
|
533 my $lower_limit = $_[7];
|
|
534 # my $lower_limit = 200;
|
|
535 my $lower_main_limit = $_[8];
|
|
536 # my $lower_main_limit = 500;
|
|
537 my $upper_limit = $contiglength - $lower_limit;
|
|
538 my $upper_main_limit = $contiglength - $lower_main_limit;
|
|
539 my @readtaglist;
|
|
540 my $ref;
|
|
541 my $junk;
|
|
542 my $current_id;
|
|
543 my %count;
|
|
544 my @readid_good;
|
|
545 my @readid_bad;
|
|
546 my @reads;
|
|
547 my @readlist;
|
|
548 my @readlist_good;
|
|
549 my @readlist_proofread;
|
|
550 my @total_readlist;
|
|
551 my @singleton;
|
|
552 my @singletons;
|
|
553 my @taglist;
|
|
554 my @taglist_line;
|
|
555 my @readtaglist_lower;
|
|
556 my @readtaglist_upper;
|
|
557 my @read_ids_lower;
|
|
558 my @read_ids_all;
|
|
559 my @read_ids_upper;
|
|
560 my %ids =();
|
|
561 my @unsorted;
|
|
562 my $min;
|
|
563 my $max;
|
|
564 my $tag;
|
|
565
|
|
566
|
|
567 print "\nlower limit: $lower_limit\n";
|
|
568 print "upper limit: $upper_limit\n";
|
|
569 print "lower main limit: $lower_main_limit\n";
|
|
570 print "upper main limit: $upper_main_limit\n\n";
|
|
571 open (TAGLIST,"<$readtaglist_FH") or die $!;
|
|
572 while (<TAGLIST>){
|
|
573 push (@readtaglist, "$_");
|
|
574 }
|
|
575 close (TAGLIST);
|
|
576
|
|
577 for (@readtaglist){
|
|
578 @taglist_line = split /\t/;
|
|
579 unless ($taglist_line[0] =~ /#/){
|
|
580 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
|
|
581 push (@read_ids_all, $ref);
|
|
582
|
|
583 if (($taglist_line[1] <= $lower_limit) || ($taglist_line[2] <= $lower_limit)){
|
|
584 # if ((($taglist_line[1] <= $lower_limit) || (($taglist_line[1] >= $coverage_limits_lower[0])&&($taglist_line[1] <= $coverage_limits_lower[1]))) || ($taglist_line[2] <= $lower_limit)){
|
|
585 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
|
|
586 push (@read_ids_lower, $ref);
|
|
587 }elsif (($taglist_line[1] >= $upper_limit) || ($taglist_line[2] >= $upper_limit)){
|
|
588 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
|
|
589 push (@read_ids_upper, $ref);
|
|
590 }
|
|
591 }
|
|
592 }
|
|
593 %ids = map { $_ => 1 } @read_ids_lower;
|
|
594 my @unique_lower = keys %ids;
|
|
595 %ids = map { $_ => 1 } @read_ids_upper;
|
|
596 my @unique_upper = keys %ids;
|
|
597 %ids = map { $_ => 1 } @read_ids_all;
|
|
598 my @unique_all = keys %ids;
|
|
599 for (@unique_all) {
|
|
600 my @junk = split /\//;
|
|
601 push (@reads, $junk[0]);
|
|
602 @junk = split /\t/;
|
|
603 push (@total_readlist, $junk[1]);
|
|
604 }
|
|
605
|
|
606 map { $count{$_}++ } @reads;
|
|
607 map {if ($count{$_} == 2){ @readid_good = split /\t/; push(@readlist, "$readid_good[1]");} elsif ($count{$_} == 1) { push(@readid_bad, "$_");}} keys (%count);
|
|
608 @reads = {};
|
|
609 undef %count;
|
|
610
|
|
611 for (@readlist){
|
|
612 chomp;
|
|
613 $current_id = $_;
|
|
614 my @pairs_lower = grep { $_ =~ /$current_id/} @unique_lower;
|
|
615 my @pairs_upper = grep{ $_ =~ /$current_id/} @unique_upper;
|
|
616 # print "good id: $current_id\n";
|
|
617 my $count_lower = scalar @pairs_lower;
|
|
618 my $count_upper = scalar @pairs_upper;
|
|
619 # print "count lower: $count_lower\n";
|
|
620 # print "count upper: $count_upper\n";
|
|
621 unless ((scalar @pairs_lower == 2) || (scalar @pairs_upper == 2)){
|
|
622 push (@readlist_good, "$current_id");
|
|
623 }
|
|
624 }
|
|
625 for (@readid_bad){
|
|
626 chomp;
|
|
627 @unsorted = ();
|
|
628 ($junk, $current_id) = split (/\t/);
|
|
629 print "current ID: $current_id\n";
|
|
630 @singleton = grep { $_ =~ /$current_id/} @total_readlist;
|
|
631 for (@singleton){
|
|
632 chomp;
|
|
633 $tag = $_;
|
|
634 @taglist = grep { $_ =~ /$tag/} @readtaglist;
|
|
635 # print "taglist: @taglist\n";
|
|
636 }
|
|
637 for (@taglist) {
|
|
638 @taglist_line = split /\t/;
|
|
639 push(@unsorted, $taglist_line[1], $taglist_line[2]);
|
|
640 $max = max @unsorted;
|
|
641 $min = min @unsorted;
|
|
642 }
|
|
643 # print "unsorted: @unsorted\n";
|
|
644 print "read mapping from $min to $max\n";
|
|
645 # print "min: $min\n";
|
|
646 # print "max: $max\n";
|
|
647
|
|
648 if ($min <= $lower_limit){
|
|
649 print "orphan discarded! min<lowerlimit\n----------------------------------------------\n";
|
|
650 }elsif ($max >= $upper_limit){
|
|
651 print "orphan discarded! max>upperlimit\n----------------------------------------------\n";
|
|
652 }elsif (($min >= $lower_main_limit) && ($max <= $upper_main_limit)){
|
|
653 print "orphan discarded! lower_main_limit<min-max<upper_main_limit\n----------------------------------------------\n";
|
|
654 }elsif (($min >= $elevated_cov_lower_start) && ($min <= $elevated_cov_lower_end - ($lower_limit / 2))){
|
|
655 print "orphan discarded! increased_coverage_lower_start<min<increased_coverage_lower_end\n----------------------------------------------\n";
|
|
656 }elsif (($max >= ($elevated_cov_upper_start + ($lower_limit / 2))) && ($max <= $elevated_cov_upper_end)){
|
|
657 print "orphan discarded! increased_coverage_upper_start<max<increased_coverage_upper_end\n----------------------------------------------\n";
|
|
658 }else {
|
|
659 push(@singletons, "@singleton\n");
|
|
660 print "orphan resurrected! \n----------------------------------------------\n";
|
|
661 }
|
|
662 # print "contiglength: $contiglength\n";
|
|
663 }
|
|
664
|
|
665 for (@singletons){
|
|
666 my @resurrection = split /\//;
|
|
667 push (@readlist_good, $resurrection[0]);
|
|
668 }
|
|
669 for (@readlist_good){
|
|
670 $_ =~ s/$/\/1\n/g;
|
|
671 push(@readlist_proofread, $_);
|
|
672 $_ =~ s/1$/2/g;
|
|
673 push(@readlist_proofread, $_);
|
|
674 }
|
|
675 return @readlist_proofread;
|
|
676 }
|
|
677
|
|
678 sub standard_deviation {
|
|
679 my(@numbers) = @_;
|
|
680 #Prevent division by 0 error in case you get junk data
|
|
681 return undef unless(scalar(@numbers));
|
|
682
|
|
683 # Step 1, find the mean of the numbers
|
|
684 my $total1 = 0;
|
|
685 foreach my $num (@numbers) {
|
|
686 if (!$num){
|
|
687 $num = 0;
|
|
688 }
|
|
689 $total1 += $num;
|
|
690 }
|
|
691 my $mean1 = $total1 / (scalar @numbers);
|
|
692 push (my @stdev, "$mean1");
|
|
693 # Step 2, find the mean of the squares of the differences between each number and the mean
|
|
694 my $total2 = 0;
|
|
695 foreach my $num (@numbers) {
|
|
696 if (!$num){
|
|
697 $num = 0;
|
|
698 }
|
|
699 $total2 += ($mean1-$num)**2;
|
|
700 }
|
|
701 my $mean2 = $total2 / (scalar @numbers);
|
|
702
|
|
703 # Step 3, standard deviation is the square root of the above mean
|
|
704 my $std_dev = sqrt($mean2);
|
|
705 push (@stdev, "$std_dev");
|
|
706 return @stdev;
|
|
707 }
|
|
708
|
|
709 sub assess_coverage{
|
|
710 my $readtaglist_FH = $_[0];
|
|
711 my @readtaglist;
|
|
712 my $from =$_[1];
|
|
713 my $to = $_[2];
|
|
714 my $where = $_[3];
|
|
715 my @taglist_line;
|
|
716 my @coverage_array_lower;
|
|
717 my @coverage_array_upper;
|
|
718 my @read_ids_lower;
|
|
719 my @read_ids_upper;
|
|
720 my %ids;
|
|
721 my @taglist;
|
|
722 my @unsorted;
|
|
723 my $min;
|
|
724 my $max;
|
|
725 my %coverage;
|
|
726 my @allnums;
|
|
727 my @coverage_change_position;
|
|
728 my @coverage_limits;
|
|
729
|
|
730 # print "assessing coverage from position $from to position $to\n";
|
|
731 open (TAGLIST,"<$readtaglist_FH") or die $!;
|
|
732 while (<TAGLIST>){
|
|
733 push (@readtaglist, "$_");
|
|
734 }
|
|
735 close (TAGLIST);
|
|
736
|
|
737 for (@readtaglist){
|
|
738 @taglist_line = split /\t/;
|
|
739 unless ($taglist_line[0] =~ /#/){
|
|
740 if ((($taglist_line[1] >= $from) && ($taglist_line[1] <= $to)) || (($taglist_line[2] >= $from) && ($taglist_line[2] <= $to))){
|
|
741 push (@coverage_array_lower, "$_");
|
|
742 push (@read_ids_lower, $taglist_line[6]);
|
|
743 }
|
|
744 }
|
|
745 }
|
|
746 %ids = map { $_ => 1 } @read_ids_lower;
|
|
747 my @unique_lower = keys %ids;
|
|
748 for (@unique_lower){
|
|
749 my @current_id = $_;
|
|
750 chomp;
|
|
751 @unsorted = ();
|
|
752 for (@current_id){
|
|
753 my $current_id = $_;
|
|
754 @taglist = grep { $_ =~ /$current_id/} @coverage_array_lower;
|
|
755 }
|
|
756 for (@taglist) {
|
|
757 @taglist_line = split /\t/;
|
|
758 push(@unsorted, $taglist_line[1], $taglist_line[2]);
|
|
759 $max = max @unsorted;
|
|
760 $min = min @unsorted;
|
|
761 }
|
|
762 my @nums = ($min .. $max);
|
|
763 for (@nums){
|
|
764 push (@allnums, "$_");
|
|
765 }
|
|
766 }
|
|
767 %coverage = map { $_ => 0 } @allnums;
|
|
768 map { $coverage{$_}++ } @allnums;
|
|
769 open (OUT,">out-$where.csv");
|
|
770
|
|
771 ########## detecting coverage peak
|
|
772 my $max_cov = 0;
|
|
773 my $max_cov_position;
|
|
774 my @cumulative_coverage;
|
|
775 map { unless (!$coverage{$_}){print OUT "$_,$coverage{$_}\n"; push (@cumulative_coverage, "$coverage{$_}"); if ($coverage{$_} > $max_cov){$max_cov = $coverage{$_}; $max_cov_position = $_; }}} ($from..$to);
|
|
776 my @average_coverage = &standard_deviation(@cumulative_coverage);
|
|
777 my $coverage_factor = $max_cov / $average_coverage[0];
|
|
778 open (OUT,">>out-$where.csv");
|
|
779 print OUT "\nmaximum coverage is $max_cov at position $max_cov_position\naverge coverage is: $average_coverage[0], sd: $average_coverage[1]\nfactor $coverage_factor\n";
|
|
780 close (OUT);
|
|
781
|
|
782 ######### detecting rapid changes in coverage
|
|
783 for ($from..($to - 10)){
|
|
784 my $position = $_;
|
|
785 my $cov = $coverage{$position};
|
|
786 unless (!$cov){
|
|
787 my @positions = ();
|
|
788 push (@positions, "$cov");
|
|
789 for (1 .. 10){
|
|
790 my $pos_plus = $position + $_;
|
|
791 if ($coverage{$pos_plus}){
|
|
792 push (@positions, "$coverage{$pos_plus}");
|
|
793 }
|
|
794 }
|
|
795 my @stdev = &standard_deviation(@positions);
|
|
796 if ($stdev[1] > 6.0){
|
|
797 print "positions ($position): @positions -> stdev: $stdev[1]\n";
|
|
798 push (@coverage_change_position, $position);
|
|
799 }elsif ($stdev[1] >= 4.5){
|
|
800 print "positions ($position): @positions -> stdev: $stdev[1]\n";
|
|
801 }
|
|
802 }
|
|
803 }
|
|
804 if (@coverage_change_position){
|
|
805 print "positions with rapidly changing coverage detected: @coverage_change_position\n";
|
|
806 my $start = min @coverage_change_position;
|
|
807 my $end = max @coverage_change_position;
|
|
808 push (@coverage_limits, "$start", "$end");
|
|
809 print "set limits from $coverage_limits[0] to $coverage_limits[1]\n";
|
|
810 }else{
|
|
811 print "no irregularities in coverage detected\n";
|
|
812 push (@coverage_limits, "0", "0");
|
|
813 return @coverage_limits;
|
|
814 }
|
|
815
|
|
816
|
|
817 ###### assessing whether coverage peak lies within putative conserved region, if yes accept prediction; if no, reject conserved region
|
|
818 if (($coverage_factor >= 1.6) && (($coverage_limits[0] < $max_cov_position) && ( $max_cov_position < $coverage_limits[1]))){
|
|
819
|
|
820 print "suspicious coverage peak detected within the predicted limits\n";
|
|
821 }else {
|
|
822 print "no coverage peak detected within predicted limits - rejecting limits\n";
|
|
823 @coverage_limits = ();
|
|
824 push (@coverage_limits, "0", "0");
|
|
825 }
|
|
826 return @coverage_limits;
|
|
827
|
|
828 }
|
|
829
|
|
830 sub check_ref_length{
|
|
831 my $ref=$_[0];
|
|
832 my $output_filename=$_[1];
|
|
833 my $critical=$_[2];
|
|
834 my $kmer = $_[3];
|
|
835 my @header;
|
|
836 my $header_count=0;
|
|
837 my (@sequence,@temp_output,@final_output);
|
|
838 my $full_sequence;
|
|
839 open(REF,"<$ref") or die $!;
|
|
840 while(<REF>){
|
|
841 chomp;
|
|
842 if ($_ =~ /^>/){
|
|
843 push(@header,$_);
|
|
844 # print "found header:\n$header[$header_count]\n";
|
|
845 $header_count++;
|
|
846 # print "header count: $header_count\n";
|
|
847 if (@sequence){
|
|
848 @temp_output=&finalize_sequence($critical,$header[-2],$kmer,@sequence);
|
|
849 for (@temp_output){
|
|
850 push(@final_output,$_);
|
|
851 }
|
|
852 }
|
|
853 undef @sequence;
|
|
854 }elsif ($_ =~ /[a-zA-Z]/){
|
|
855 # print "found sequence:\n$_\n";
|
|
856 push(@sequence,$_);
|
|
857 }
|
|
858 }
|
|
859 @temp_output=&finalize_sequence($critical,$header[-1],$kmer,@sequence);
|
|
860 for (@temp_output){
|
|
861 push(@final_output,$_);
|
|
862 }
|
|
863 # print "result:\n";
|
|
864 open (OUT,">$output_filename") or die $!;
|
|
865 for(@final_output){
|
|
866 # print "$_\n";
|
|
867 print OUT "$_\n";
|
|
868 }
|
|
869 close REF;
|
|
870 close OUT;
|
|
871
|
|
872 }
|
|
873
|
|
874 sub finalize_sequence{
|
|
875 my $critical = shift;
|
|
876 my $header = shift;
|
|
877 my $kmer = shift;
|
|
878 my $full_sequence=join("",@_);
|
|
879 my $factor;
|
|
880 my @output;
|
|
881 if (!$critical){
|
|
882 $factor=0;
|
|
883 }else{
|
|
884 $factor=ceil(length($full_sequence)/$critical);
|
|
885 }
|
|
886 if ($factor == 1){
|
|
887 push(@output,$header);
|
|
888 push(@output,$full_sequence);
|
|
889 }else{ #too long
|
|
890 print "\nreference is too long for mirabait to be handled in one go -> will be split into sub-sequences\n";
|
|
891 $header=substr $header, 1;
|
|
892 for (my $i=0; $i<$factor; $i++){
|
|
893 unless ((length(substr $full_sequence, $i*$critical, $critical+$kmer)-$kmer)<0){
|
|
894 push(@output,">sub$i\_" .$header);
|
|
895 push(@output,substr $full_sequence, $i*$critical, $critical+$kmer);
|
|
896 }
|
|
897 }
|
|
898 }
|
|
899 return @output;
|
|
900 }
|
|
901
|
|
902 sub create_manifest {
|
|
903 my ($iter, $sampleID, $refID, $mmode, $trim, $platform, $solexa_missmatches, $pair, $overhang, $reads, $ref, $redirect, $NFS_warn);
|
|
904 $iter = $_[0];
|
|
905 $sampleID = $_[1];
|
|
906 $refID = $_[2];
|
|
907 $mmode = $_[3];
|
|
908 $trim = $_[4];
|
|
909 $platform = $_[5];
|
|
910 $solexa_missmatches = $_[6];
|
|
911 $pair = $_[7];
|
|
912 $overhang = $_[8];
|
|
913 $reads = $_[9];
|
|
914 $ref = $_[10];
|
|
915 $redirect = $_[11];
|
|
916 $NFS_warn = $_[12];
|
|
917
|
|
918 if ($NFS_warn){
|
|
919 $NFS_warn = ":cnfs=warn"
|
|
920 }
|
|
921
|
|
922 open (MANIFEST,">manifest.conf") or die $!;
|
|
923 print MANIFEST "#manifest file for iteration $iter created by MITObim\n\nproject = $sampleID-$refID
|
|
924 \njob = genome,$mmode,accurate
|
|
925 \nparameters = -NW:mrnl=0$NFS_warn -AS:nop=1 $redirect $overhang $platform $trim -CO:msr=no $solexa_missmatches\n";
|
|
926 my @technology = split("_",$platform);
|
|
927 #-notraceinfo -
|
|
928 if ($mmode eq "mapping"){
|
|
929 print MANIFEST "\nreadgroup\nis_reference\ndata = $ref\nstrain = $refID\n";
|
|
930 }
|
|
931 print MANIFEST "\nreadgroup = reads\ndata = $reads\ntechnology = $technology[0]";
|
|
932 if ($pair){
|
|
933 # print MANIFEST "\nsegmentplacement = ---> <--- infoonly";
|
|
934 print MANIFEST "\nsegmentplacement = ---> <--- exclusion_criterion";
|
|
935 print MANIFEST "\ntemplatesize = -1 600 exclusion_criterion";
|
|
936 }
|
|
937 print MANIFEST "\nstrain = $sampleID\n";
|
|
938 close MANIFEST;
|
|
939 }
|
|
940 sub clean {
|
|
941 my $interval = shift;
|
|
942 my $cur = shift;
|
|
943 my $dir = $cur-$interval;
|
|
944 my $path=abs_path;
|
|
945 if (-d "$path/../iteration$dir"){
|
|
946 print "\nnow removing directory iteration$dir\n";
|
|
947 rmtree ("$path/../iteration$dir") or die $!;
|
|
948 }
|
|
949 }
|
|
950
|
|
951 sub remove_unmapped_contigs{
|
|
952 my $file = shift;
|
|
953 my $out = shift;
|
|
954 my ($head, $seq,$length);
|
|
955 my @split;
|
|
956 my ($dropped,$split,$trim)=(0,0,0);
|
|
957 open (FASTA,"<$out") or die $!."\ncould not open $file for reading\n";
|
|
958 while (<FASTA>){
|
|
959 chomp;
|
|
960 if ($_ =~ /^>/){
|
|
961 $head=$_;
|
|
962 }else {
|
|
963 if ($_ =~ /X/){
|
|
964 $dropped++;
|
|
965 print "drop:\n$head\n$_\n";
|
|
966 }else{
|
|
967 $length=length($_);
|
|
968 $_ =~ s/^N+//g; #remove all Ns from the beginning of the sequence
|
|
969 $_ =~ s/N+$//g; #remove all Ns from the end of the sequence
|
|
970 if ($length != length($_)){
|
|
971 $trim++;
|
|
972 }
|
|
973 @split=split(/N{3,}/); #split at every occurance of 3 or more Ns
|
|
974 if (@split > 1){
|
|
975 print "contig $head has been split into ".scalar @split." sub-contigs\n";
|
|
976 $split++;
|
|
977 for (my $i=0; $i<@split; $i++){
|
|
978 # print "$head\_$i\n$split[$i]\n";
|
|
979 $seq.="$head\_$i\n$split[$i]\n";
|
|
980 }
|
|
981 }else{
|
|
982 # print "$head\n$_\n";
|
|
983 $seq.="$head\n$_\n";
|
|
984 }
|
|
985 }
|
|
986 }
|
|
987 }
|
|
988 close FASTA;
|
|
989 if (($dropped) || ($split) || ($trim)){
|
|
990 print "creating backup of $out before changing it\n";
|
|
991 copy $out,$file;
|
|
992 open (OUT,">$out") or die $!."\ncould not open $out for writing\n";
|
|
993 print OUT "$seq\n";
|
|
994 close OUT;
|
|
995 }
|
|
996
|
|
997 if ($dropped){
|
|
998 print "\nremoved $dropped contigs from baitfile because they did not receive any hits\n";
|
|
999 }else{
|
|
1000 print "\nno need to remove any sequences from the baitfile\n";
|
|
1001 }
|
|
1002
|
|
1003 if (!$split){
|
|
1004 print "\nno need to split any contigs\n";
|
|
1005 }
|
|
1006 if ($trim){
|
|
1007 print "\n$trim contigs had N's trimmed off their ends\n";
|
|
1008 }
|
|
1009 }
|