6
+ − 1 #! /usr/bin/perl
+ − 2 #
+ − 3 # MITObim - mitochondrial baiting and iterative mapping
+ − 4 # wrapper script version 1.8
+ − 5 # Author: Christoph Hahn, 2012-2016
+ − 6 # christoph.hahn@nhm.uio.no
+ − 7
+ − 8 use strict;
+ − 9 use warnings;
+ − 10 use Getopt::Long;
+ − 11 use Cwd qw(abs_path);
+ − 12 use File::Copy;
+ − 13 use List::Util qw< min max >;
+ − 14 use POSIX qw(strftime);
+ − 15 use POSIX qw(ceil);
+ − 16 use File::Path 'rmtree';
+ − 17
+ − 18 my $startiteration = 1;
+ − 19 my $enditeration = 1;
+ − 20
+ − 21 my ($quick, $noshow, $help, $strainname, $paired, $mode, $refname, $readpool, $maf, $proofreading, $readlength, $insertsize, $MM, $trim, $trimoverhang, $k_bait, $clean, $clean_interval, $min_contig_cov) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 31, 0, 2, 0);
+ − 22 my ($miramode, $key, $val, $exit, $current_contiglength, $current_number_of_reads, $current_number_of_contigs);
+ − 23 my $splitting = 0;
+ − 24 my $platform = "solexa";
+ − 25 my $platform_settings;# = "SOLEXA";
+ − 26 my $shme = "";
+ − 27 my $trim_off = "";
+ − 28 my $redirect_temp = "";
+ − 29 my $NFS_warn_only = "";
+ − 30 my ($mirapath, $mira, $miraconvert, $mirabait) = ("", "mira", "miraconvert", "mirabait");
+ − 31 my (@reads, @output, @path, @current_contig_stats, @contiglengths, @number_of_reads, @number_of_contigs);
+ − 32 my %hash;
+ − 33 my $PROGRAM = "\nMITObim - mitochondrial baiting and iterative mapping\n";
+ − 34 my $VERSION = "version 1.8\n";
+ − 35 my $AUTHOR = "author: Christoph Hahn, (c) 2012-2016\n";
+ − 36 my $cite = "\nif you found MITObim useful, please cite:
+ − 37 Hahn C, Bachmann L and Chevreux B. (2013) Reconstructing mitochondrial genomes directly from genomic next-generation sequencing reads -
+ − 38 a baiting and iterative mapping approach. Nucl. Acids Res. 41(13):e129. doi: 10.1093/nar/gkt371\n\n";
+ − 39 my $USAGE = "\nusage: ./MITObim.pl <parameters>
+ − 40 \nparameters:
+ − 41 -start <int> iteration to start with, default=1
+ − 42 -end <int> iteration to end with, default=1
+ − 43 -sample <string> sampleID as used in initial MIRA assembly
+ − 44 -ref <string> referenceID as used in initial MIRA assembly
+ − 45 -readpool <FILE> readpool in fastq format (*.gz is also allowed)
+ − 46 -maf <FILE> maf file from previous MIRA assembly
+ − 47 \noptional:
+ − 48 --quick <FILE> starts process with initial baiting using provided fasta reference
+ − 49 --kbait <int> set kmer for baiting stringency (default: 31)
+ − 50 --platform specify sequencing platform (default: 'solexa'; other options: 'iontor', '454', 'pacbio')
+ − 51 --denovo runs MIRA in denovo mode (default: mapping)
+ − 52 --pair finds pairs after baiting (relies on /1 and /2 header convention for read pairs) (default: no)
+ − 53 --verbose show detailed output of MIRA modules (default: no)
+ − 54 --split split reference at positions with more than 5N (default: no)
+ − 55 --help shows this helpful information
+ − 56 --clean retain only the last 2 iteration directories (default: no)
+ − 57 --trimreads trim data (default: no; we recommend to trim beforehand and feed MITObim with pre trimmed data)
+ − 58 --trimoverhang trim overhang up- and downstream of reference (default: no)
+ − 59 --missmatch <int> number of allowed missmatches in mapping - only for illumina data (default: 15% of avg. read length)
+ − 60 --min_cov <int> minimum average coverage of contigs to be retained (default: off)
+ − 61 --mirapath <string> full path to MIRA binaries (only needed if MIRA is not in PATH)
+ − 62 --redirect_tmp redirect temporary output to this location (useful in case you are running MITObim on an NFS mount)
+ − 63 --NFS_warn_only allow MIRA to run on NFS mount without aborting - warn only (expert option - see MIRA documentation 'check_nfs')
+ − 64 \nexamples:
+ − 65 ./MITObim.pl -start 1 -end 5 -sample StrainX -ref reference-mt -readpool illumina_readpool.fastq -maf initial_assembly.maf
+ − 66 ./MITObim.pl -end 10 --quick reference.fasta -sample StrainY -ref reference-mt -readpool illumina_readpool.fastq\n\n";
+ − 67 # --proofread applies proofreading (atm only to be used if starting the process from a single short seed reference)
+ − 68 # --readlength <int> read length of illumina library, default=150, relevant only for proofreading
+ − 69 # --insert <int> insert size of illumina library, default=300, relevant only for proofreading
+ − 70
+ − 71 my $command = $0;
+ − 72 for (@ARGV){
+ − 73 $command .= " $_";
+ − 74 }
+ − 75
+ − 76 GetOptions ( "start=i" => \$startiteration,
+ − 77 "end=i" => \$enditeration,
+ − 78 "kbait=i" => \$k_bait,
+ − 79 "quick=s" => \$quick,
+ − 80 "verbose!" => \$noshow,
+ − 81 "sample=s" => \$strainname,
+ − 82 "paired" => \$paired,
+ − 83 "denovo" => \$mode,
+ − 84 "ref=s" => \$refname,
+ − 85 "readpool=s" => \$readpool,
+ − 86 "help!" => \$help,
+ − 87 "clean!" => \$clean,
+ − 88 "mirapath=s" => \$mirapath,
+ − 89 "maf=s" => \$maf,
+ − 90 # "proofreading!" => \$proofreading,
+ − 91 "trimreads!" => \$trim,
+ − 92 "trimoverhang!" => \$trimoverhang,
+ − 93 "missmatch=i" => \$MM,
+ − 94 "platform=s" => \$platform,
+ − 95 # "readlength=i" => \$readlength,
+ − 96 # "insertsize=i" => \$insertsize,
+ − 97 "split!" => \$splitting,
+ − 98 "min_cov=i" => \$min_contig_cov,
+ − 99 "redirect_tmp=s" => \$redirect_temp,
+ − 100 "NFS_warn_only!" => \$NFS_warn_only) or die "Incorrect usage!\n$USAGE";
+ − 101
+ − 102
+ − 103 print $PROGRAM;
+ − 104 print $VERSION;
+ − 105 print $AUTHOR;
+ − 106
+ − 107 print $USAGE and exit if $help;
+ − 108 print $USAGE and exit if !$readpool;
+ − 109 unless ($quick){
+ − 110 print $USAGE and exit if !$maf;
+ − 111 }
+ − 112 print $USAGE and exit if !$refname;
+ − 113
+ − 114 $readpool=abs_path($readpool);
+ − 115 unless (-e $readpool){
+ − 116 print "Cant find the readpool. Is the path correct?\n";
+ − 117 }
+ − 118 if ($maf){
+ − 119 $maf=abs_path($maf);
+ − 120 unless (-e $maf){
+ − 121 print "Cant find *.maf file. Is the path correct?\n";
+ − 122 }
+ − 123 }
+ − 124
+ − 125 ($platform, $platform_settings) = &set_platform($platform);
+ − 126
+ − 127 if ($quick){
+ − 128 $quick=abs_path($quick);
+ − 129 unless (-e $quick){
+ − 130 print "quick option selected but is the path to the file correct?\n";
+ − 131 exit 1;
+ − 132 }
+ − 133 print "\nquick option selected! -maf option will be ignored (if given)\n";
+ − 134 $maf = 0;
+ − 135 $startiteration = 0;
+ − 136 }
+ − 137 print $USAGE and exit if ($startiteration > $enditeration);
+ − 138 unless (((-e $maf)||($quick)) && (-e $readpool)){
+ − 139 print "\nAre readpool AND maf/reference files there?\n";
+ − 140 exit 1;
+ − 141 }
+ − 142 if ($mirapath){
+ − 143 if (-e "$mirapath/mira"){
+ − 144 print "found executables in the path specified by the user - good!\n";
+ − 145 $mira = "$mirapath/mira";
+ − 146 $miraconvert = "$mirapath/miraconvert";
+ − 147 $mirabait = "$mirapath/mirabait";
+ − 148 }else{
+ − 149 print "somethings wrong with the path to mira.\n";
+ − 150 exit 1;
+ − 151 }
+ − 152 }
+ − 153
+ − 154
+ − 155 ##if not given otherwise, readlength and insertsize are set to default. automatic readlength and insertsize detection will be implemented in time.
+ − 156 #if (!$readlength){
+ − 157 # $readlength = 150;
+ − 158 #}
+ − 159 #if (!$insertsize){
+ − 160 # $insertsize = 300;
+ − 161 #}
+ − 162 if (!$mode){
+ − 163 $miramode = "mapping";
+ − 164 }else {
+ − 165 $miramode = "denovo";
+ − 166 }
+ − 167
+ − 168 if (!$trim){
+ − 169 $trim_off = "\"--noclipping -CL:pec=no\"";
+ − 170 }
+ − 171
+ − 172 my $out_command = $command; $out_command =~ s/\S+galaxy\///g;
+ − 173 print "\nFull command run:\n$out_command\n";
+ − 174 print "\nAll paramters seem to make sense:\n";
+ − 175 print "startiteration: $startiteration\n";
+ − 176 print "enditeration: $enditeration\n";
+ − 177 print "sample: $strainname\n";
+ − 178 print "refname: $refname\n";
+ − 179 my $out_readpool = $readpool; $out_readpool =~ s/.*galaxy\///;
+ − 180 print "readpool: $out_readpool\n";
+ − 181 print "maf: $maf\n";
+ − 182 my $out_quick = $quick; $out_quick =~ s/.*galaxy\///;
+ − 183 print "quick: $out_quick\n";
+ − 184 print "paired: $paired (off=0, on=1)\n";
+ − 185 print "assembly mode: $mode (mapping=0, denovo=1)\n";
+ − 186 print "verbose: $noshow (off=0, on=1)\n";
+ − 187 print "split: $splitting (off=0, on=1)\n";
+ − 188 print "minimum avg. coverage: $min_contig_cov (off=0)\n";
+ − 189 print "clean: $clean (off=0, on=1)\n";
+ − 190 print "trim reads: $trim (off=0, on=1)\n";
+ − 191 print "trim overhang: $trimoverhang (no=0, yes=1)\n";
+ − 192 print "platform: $platform\n";
+ − 193 print "kmer baiting: $k_bait\n";
+ − 194 #print "proofread: $proofreading (off=0, on=1)\n";
+ − 195
+ − 196 if ($proofreading){
+ − 197 print "\nproofreading is not yet enabled in this beta version of MITObim 1.8 - it is currently being optimized for MIRA4. \nplease refer to MITObim 1.6 if you wish to use this option. Sorry for the inconvenience!\n\n";
+ − 198 exit;
+ − 199
+ − 200 print "proofreading: on\n";
+ − 201 print "readlength: $readlength\n";
+ − 202 print "insertsize: $insertsize\n";
+ − 203 $MM = 0;
+ − 204 print "number of allowed missmatches in proofreading assembly: $MM\n";
+ − 205 $shme = "-AL:shme=$MM";
+ − 206 }elsif ((!$proofreading) && (!$mode) && ($platform eq "solexa")){
+ − 207 if ($MM == -1){
+ − 208 print "number of missmatches in mapping assembly: default (15% of average read length loaded)\n";
+ − 209 $shme = "";
+ − 210 }else {
+ − 211 print "number of missmatches in mapping assembly: $MM\n";
+ − 212 $shme = "-AL:shme=$MM";
+ − 213 }
+ − 214 print "proofreading: off\n";
+ − 215 }
+ − 216
+ − 217 if (!$trimoverhang){
+ − 218 $trimoverhang = "-SB:tor=no";
+ − 219 }else {
+ − 220 $trimoverhang = "-SB:tor=yes";
+ − 221 }
+ − 222
+ − 223 if ($redirect_temp) {
+ − 224 $redirect_temp="-DI:trt=$redirect_temp"
+ − 225 }
+ − 226
+ − 227 print "\nStarting MITObim \n";
+ − 228
+ − 229 my @iteration = ($startiteration .. $enditeration);
+ − 230 foreach (@iteration){
+ − 231 chomp;
+ − 232 my $currentiteration = $_;
+ − 233 mkdir "iteration$currentiteration" or die "MITObim will not overwrite an existing directory: iteration$currentiteration\n";
+ − 234 chdir "iteration$currentiteration" or die $!;
+ − 235 print "\n==============\n";
+ − 236 print " ITERATION $currentiteration\n";
+ − 237 print "==============\n";
+ − 238 print strftime("%b %e %H:%M:%S", localtime) . "\n\n";
+ − 239 # if (($proofreading) && ($currentiteration != 0)){
+ − 240 if ($proofreading){
+ − 241 $shme = "-AL:shme=0";
+ − 242 }
+ − 243
+ − 244 if ($maf){
+ − 245 print "\nrecover backbone by running miraconvert on maf file\n\n";
+ − 246 if ($currentiteration<2){
+ − 247 @output= qx($miraconvert -f maf -t fasta -A "$platform_settings -CO:fnicpst=yes" $maf tmp);
+ − 248 }else{
+ − 249 @output= qx($miraconvert -f maf -t fasta -y $min_contig_cov -A "$platform_settings -CO:fnicpst=yes" $maf tmp);
+ − 250 }
+ − 251 $exit = $? >> 8;
+ − 252 unless (!$noshow){
+ − 253 print "@output\n";
+ − 254 }
+ − 255 unless ($exit == 0){
+ − 256 if (!$noshow){
+ − 257 print "@output\n";
+ − 258 }
+ − 259 print "\nmiraconvert seems to have failed - see detailed output above\n";
+ − 260 exit;
+ − 261 }
+ − 262 # if ( ((($mode) && ($currentiteration > 1)) && (!$quick)) || ((($mode) && ($currentiteration >= 1)) && ($quick)) ){
+ − 263 # open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!";
+ − 264 # }else{
+ − 265 open(FH1,"<tmp_$strainname.unpadded.fasta") or die "$!\nIs the sampleID identical as in the initial MIRA assembly?\n";
+ − 266 # }
+ − 267 open(FH2,">backbone_it$currentiteration\_initial_$refname.fna") or die $!;
+ − 268 while (<FH1>) {
+ − 269 $_ =~ s/x/N/g;
+ − 270 print FH2 $_;
+ − 271 }
+ − 272 close(FH1);
+ − 273 close(FH2);
+ − 274 unlink glob ("tmp*");
+ − 275 }
+ − 276 MIRABAIT:
+ − 277 unless ($maf){
+ − 278 print "\nquick option baits reads from provided reference in iteration 0\n";
+ − 279 copy("$quick", "backbone_it$currentiteration\_initial_$refname.fna") or die "copy failed: $!";
+ − 280 }
+ − 281 &check_ref_length("backbone_it$currentiteration\_initial_$refname.fna","temp_baitfile.fasta",29800,$k_bait);
+ − 282
+ − 283 if ($splitting){
+ − 284 &remove_unmapped_contigs("temp_baitfile_backup.fasta","temp_baitfile.fasta");
+ − 285 if (-e "temp_baitfile_backup.fasta"){ #this file has only been created if splitting contigs was necessary
+ − 286 copy "temp_baitfile.fasta","backbone_it$currentiteration\_initial_$refname.fna";
+ − 287 }
+ − 288 }
+ − 289 print "\nfishing readpool using mirabait (k = $k_bait)\n\n";
+ − 290
+ − 291 # @output = (`mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-$refname\_in.$platform 2>&1`);
+ − 292 @output = qx($mirabait -k $k_bait -n 1 temp_baitfile.fasta $readpool $strainname-readpool-it$currentiteration);
+ − 293 $exit = $? >> 8;
+ − 294 unless (!$noshow){
+ − 295 print "@output\n";
+ − 296 }
+ − 297 if (!-s "$strainname-readpool-it$currentiteration.fastq"){
+ − 298 print "\nyour readpool does not contain any reads with reasonable match (k = $k_bait) to your reference - Maybe you ll want to different settings or even a different reference?\n\n";
+ − 299 exit;
+ − 300 }
+ − 301 unless ($exit == 0){
+ − 302 if (!$noshow){
+ − 303 print "@output\n";
+ − 304 }
+ − 305 print "\nmirabait seems to have failed - see detailed output above\n";
+ − 306 exit;
+ − 307 }
+ − 308
+ − 309 FINDPAIRS:
+ − 310
+ − 311 unless (!$paired){
+ − 312 print "\nfinding pairs for baited reads\n\n";
+ − 313 copy ("$strainname-readpool-it$currentiteration.fastq", "$strainname-readpool-it$currentiteration-se.fastq") or die "copy failed: $!";
+ − 314 open(FH1,"<$strainname-readpool-it$currentiteration.fastq") or die $!;
+ − 315 open(FH2,">list");
+ − 316 my $index=1;
+ − 317 while (<FH1>) {
+ − 318 if ($index % 8 ==1 || $index % 8 ==5) {
+ − 319 chomp;
+ − 320 $_ =~ s/@//g;
+ − 321 ($key, $val) = split /\//;
+ − 322 $hash{$key} .= exists $hash{$key} ? ",$val" : $val;
+ − 323 }
+ − 324 $index++;
+ − 325 }
+ − 326
+ − 327 for (keys %hash){
+ − 328 $_ =~ s/$/\/1/g;
+ − 329 print FH2 "$_\n";
+ − 330 $_ =~ s/1$/2/g;
+ − 331 print FH2 "$_\n";
+ − 332 }
+ − 333 close(FH1);
+ − 334 close(FH2);
+ − 335 # @output = (`miraconvert -f fastq -t fastq -n list $readpool $strainname-$refname\_in.$platform 2>&1`);
+ − 336 @output = qx($miraconvert -f fastq -t fastq -n list $readpool $strainname-readpool-it$currentiteration );
+ − 337 $exit = $? >> 8;
+ − 338 unless (!$noshow){
+ − 339 print "@output\n";
+ − 340 }
+ − 341 unless ($exit == 0){
+ − 342 if (!$noshow){
+ − 343 print "@output\n";
+ − 344 }
+ − 345 print "\nmiraconvert seems to have failed - see detailed output above\n";
+ − 346 exit;
+ − 347 }
+ − 348 }
+ − 349 unlink("list");
+ − 350
+ − 351 MIRA:
+ − 352 print "\nrunning $miramode assembly using MIRA\n\n";
+ − 353 &create_manifest($currentiteration,$strainname,$refname,$miramode,$trim_off,$platform_settings,$shme,$paired,$trimoverhang,"$strainname-readpool-it$currentiteration.fastq","backbone_it$currentiteration\_initial_$refname.fna", $redirect_temp, $NFS_warn_only);
+ − 354 @output = qx($mira manifest.conf );
+ − 355
+ − 356 $exit = $? >> 8;
+ − 357 unless (!$noshow){
+ − 358 print "@output\n";
+ − 359 }
+ − 360 unless ($exit == 0){
+ − 361 if (!$noshow){
+ − 362 print "@output\n";
+ − 363 }
+ − 364 print "\nMIRA seems to have failed - see detailed output above\n";
+ − 365 exit;
+ − 366 }
+ − 367
+ − 368 if ($redirect_temp) {
+ − 369 my $tmp_link = readlink("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp");
+ − 370 rmtree ("$strainname-$refname\_assembly/$strainname-$refname\_d_tmp") or die $!;
+ − 371 mkdir "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/" or die $1;
+ − 372 my @files = glob("$tmp_link/*");
+ − 373 for (@files) {
+ − 374 move ("$_", "$strainname-$refname\_assembly/$strainname-$refname\_d_tmp/") or die $!;
+ − 375 }
+ − 376 rmtree ("$tmp_link") or die $!;
+ − 377 }
+ − 378
+ − 379 @path = abs_path;
+ − 380 push (@path, "/$strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out.maf");
+ − 381 $maf = join("",@path);
+ − 382 unless (-e $maf){
+ − 383 print "maf file is not there \n";
+ − 384 exit;
+ − 385 }
+ − 386
+ − 387 # $current_contiglength = &get_contig_length("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt");
+ − 388 # $current_number_of_reads = (&get_number_of_reads("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt") - 1);
+ − 389
+ − 390 @current_contig_stats = &get_contig_stats("$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt");
+ − 391 if (((scalar @current_contig_stats > 3) || ($current_contig_stats[0] > 1)) && ($proofreading)) {
+ − 392 print "assembly consists of more than one contigs - this is atm not permitted in proofreading mode. Sorry!\n\n";
+ − 393 exit 1;
+ − 394 }
+ − 395
+ − 396 PROOFREAD:
+ − 397 # if (($proofreading) && ($currentiteration >= 1)){
+ − 398 if ($proofreading){
+ − 399 print "proofreading option is currently disabled in this beta version of MITObim 1.7 - sorry for the inconvenience!\n\n";
+ − 400 exit;
+ − 401 }
+ − 402
+ − 403 $current_number_of_contigs = shift @current_contig_stats;
+ − 404 $current_number_of_reads = shift @current_contig_stats;
+ − 405 if (!$mode){ #in mapping assemblies the reference is counted as one read
+ − 406 $current_number_of_reads -= $current_number_of_contigs;
+ − 407 }
+ − 408 push (@number_of_contigs, $current_number_of_contigs);
+ − 409 push (@number_of_reads, $current_number_of_reads);
+ − 410 print "readpool contains $current_number_of_reads reads\n";
+ − 411 print "assembly contains $current_number_of_contigs contig(s)\n";
+ − 412 if (scalar @current_contig_stats == 1){
+ − 413 print "contig length: $current_contig_stats[0]\n";
+ − 414 }elsif (scalar @current_contig_stats == 3){
+ − 415 print "min contig length: ".$current_contig_stats[0]." bp\nmax contig length: ".$current_contig_stats[1]." bp\navg contig length: ".sprintf("%.0f", $current_contig_stats[2])." bp\n";
+ − 416 print "find details on individual contigs in: ". abs_path . "/$strainname-$refname\_assembly/$strainname-$refname\_d_info/$strainname-$refname\_info_contigstats.txt\n";
+ − 417 # printf("%1f",$current_contig_stats[2])."\n";
+ − 418 }else {
+ − 419 print "somethings wrong with your contig stats. Sorry!\n";
+ − 420 exit 1;
+ − 421 }
+ − 422
+ − 423 if ($clean){
+ − 424 &clean($clean_interval, $currentiteration);
+ − 425 }
+ − 426 # my $path=abs_path; print "Now at $path\n";
+ − 427 system("cp $strainname-$refname\_assembly/$strainname-$refname\_d_results/$strainname-$refname\_out_AllStrains.unpadded.fasta ../finaloutput.fasta");
+ − 428 system("sed -i 's/_bb.*/_mitobim_out_$currentiteration\_iterations/' ../finaloutput.fasta");
+ − 429 if ($number_of_reads[-2]){
+ − 430 if ($number_of_reads[-2] >= $number_of_reads[-1]){
+ − 431 print "\nMITObim has reached a stationary read number after $currentiteration iterations!!\n$cite";
+ − 432 print strftime("%b %e %H:%M:%S", localtime) . "\n\n";
+ − 433 exit;
+ − 434 }
+ − 435 }
+ − 436 chdir ".." or die "Failed to go to parent directory: $!";
+ − 437 }
+ − 438 print "\nsuccessfully completed $enditeration iterations with MITObim! " . strftime("%b %e %H:%M:%S", localtime) . "\n$cite";
+ − 439
+ − 440 #
+ − 441 #
+ − 442 ###SUBROUTINES
+ − 443 #
+ − 444 #
+ − 445 #
+ − 446
+ − 447 sub set_platform{
+ − 448 my @platforms = ('solexa', '454', 'iontorrent', 'pacbio');
+ − 449 my %pf_settings = ("solexa" => "SOLEXA_SETTINGS", "454" => "454_SETTINGS", "iontorrent" => "IONTOR_SETTINGS", "pacbio" => "PCBIOHQ_SETTINGS -CO:mrpg=5");
+ − 450 my $user_setting = shift;
+ − 451 my @user_choice;
+ − 452 for (@platforms){
+ − 453 if ( $_ =~ /^$user_setting/ ){
+ − 454 push(@user_choice, $_);
+ − 455 }
+ − 456 }
+ − 457
+ − 458 if (scalar(@user_choice) > 1){
+ − 459 print "\nYour platform choice was ambiguous: $user_setting could be: @user_choice - Please try again\n";
+ − 460 exit;
+ − 461
+ − 462 } elsif (scalar(@user_choice) == 0){
+ − 463 print "\nPlease specify a valid sequencing platform - not specifying anything will default into: $platforms[0]\n\navailable options are:\n@platforms\n";
+ − 464 exit;
+ − 465 }
+ − 466
+ − 467 return ($user_choice[0], $pf_settings{$user_choice[0]})
+ − 468 }
+ − 469
+ − 470 sub get_contig_stats{
+ − 471 my $contig = $_[0];
+ − 472 my @array;
+ − 473 my @contiglength;
+ − 474 my (@readnumber, @stats);
+ − 475 my $readssum = 0;
+ − 476 open (CONTIGSTATS,"<$contig") or die $!;
+ − 477 while (<CONTIGSTATS>){
+ − 478 unless ($_ =~ /#/){
+ − 479 @array = split /\t/;
+ − 480 push (@contiglength, $array[1]);
+ − 481 push (@readnumber, $array[3]);
+ − 482 }
+ − 483 }
+ − 484 close (CONTIGSTATS);
+ − 485 if (scalar @readnumber == 1){
+ − 486 push (@stats, (scalar @readnumber, $readnumber[0], $contiglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, length of contig
+ − 487 }
+ − 488 elsif (scalar @readnumber > 1){
+ − 489 $readssum += $_ for @readnumber;
+ − 490 my $minlength = min @contiglength;
+ − 491 my $maxlength = max @contiglength;
+ − 492 my @avglength = &standard_deviation(@contiglength);
+ − 493 push (@stats, (scalar @readnumber, $readssum, $minlength, $maxlength, $avglength[0])); #@stats contains: number of contigs, total number of reads used to build the contigs, minimal, maximal, avg length of contigs
+ − 494 }
+ − 495 return @stats;
+ − 496 }
+ − 497
+ − 498
+ − 499 #sub get_contig_length{
+ − 500 # my $contig = $_[0];
+ − 501 # my @contiglength;
+ − 502 # open (CONTIGSTATS,"<$contig") or die $!;
+ − 503 # while (<CONTIGSTATS>){
+ − 504 # unless ($_ =~ /#/){
+ − 505 # @contigslength = split /\t/;
+ − 506 # }
+ − 507 # }
+ − 508 # close (CONTIGSTATS);
+ − 509 # return $contiglength[1];
+ − 510 #}
+ − 511
+ − 512 #sub get_number_of_reads{
+ − 513 # my $contig = $_[0];
+ − 514 # my @contigstats;
+ − 515 # open (CONTIGSTATS,"<$contig") or die $!;
+ − 516 # while (<CONTIGSTATS>){
+ − 517 # unless ($_ =~ /#/){
+ − 518 # @contigstats = split /\t/;
+ − 519 # }
+ − 520 # }
+ − 521 # close (CONTIGSTATS);
+ − 522 # return $contigstats[3];
+ − 523 #}
+ − 524
+ − 525 sub proofread {
+ − 526 my $zero_MM = $_[0];
+ − 527 my $readtaglist_FH = $_[1];
+ − 528 my $contiglength = $_[2];
+ − 529 my $elevated_cov_lower_start = $_[3];
+ − 530 my $elevated_cov_lower_end = $_[4];
+ − 531 my $elevated_cov_upper_start = $_[5];
+ − 532 my $elevated_cov_upper_end = $_[6];
+ − 533 my $lower_limit = $_[7];
+ − 534 # my $lower_limit = 200;
+ − 535 my $lower_main_limit = $_[8];
+ − 536 # my $lower_main_limit = 500;
+ − 537 my $upper_limit = $contiglength - $lower_limit;
+ − 538 my $upper_main_limit = $contiglength - $lower_main_limit;
+ − 539 my @readtaglist;
+ − 540 my $ref;
+ − 541 my $junk;
+ − 542 my $current_id;
+ − 543 my %count;
+ − 544 my @readid_good;
+ − 545 my @readid_bad;
+ − 546 my @reads;
+ − 547 my @readlist;
+ − 548 my @readlist_good;
+ − 549 my @readlist_proofread;
+ − 550 my @total_readlist;
+ − 551 my @singleton;
+ − 552 my @singletons;
+ − 553 my @taglist;
+ − 554 my @taglist_line;
+ − 555 my @readtaglist_lower;
+ − 556 my @readtaglist_upper;
+ − 557 my @read_ids_lower;
+ − 558 my @read_ids_all;
+ − 559 my @read_ids_upper;
+ − 560 my %ids =();
+ − 561 my @unsorted;
+ − 562 my $min;
+ − 563 my $max;
+ − 564 my $tag;
+ − 565
+ − 566
+ − 567 print "\nlower limit: $lower_limit\n";
+ − 568 print "upper limit: $upper_limit\n";
+ − 569 print "lower main limit: $lower_main_limit\n";
+ − 570 print "upper main limit: $upper_main_limit\n\n";
+ − 571 open (TAGLIST,"<$readtaglist_FH") or die $!;
+ − 572 while (<TAGLIST>){
+ − 573 push (@readtaglist, "$_");
+ − 574 }
+ − 575 close (TAGLIST);
+ − 576
+ − 577 for (@readtaglist){
+ − 578 @taglist_line = split /\t/;
+ − 579 unless ($taglist_line[0] =~ /#/){
+ − 580 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
+ − 581 push (@read_ids_all, $ref);
+ − 582
+ − 583 if (($taglist_line[1] <= $lower_limit) || ($taglist_line[2] <= $lower_limit)){
+ − 584 # if ((($taglist_line[1] <= $lower_limit) || (($taglist_line[1] >= $coverage_limits_lower[0])&&($taglist_line[1] <= $coverage_limits_lower[1]))) || ($taglist_line[2] <= $lower_limit)){
+ − 585 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
+ − 586 push (@read_ids_lower, $ref);
+ − 587 }elsif (($taglist_line[1] >= $upper_limit) || ($taglist_line[2] >= $upper_limit)){
+ − 588 $ref = join ("\t", $taglist_line[0], $taglist_line[6]);
+ − 589 push (@read_ids_upper, $ref);
+ − 590 }
+ − 591 }
+ − 592 }
+ − 593 %ids = map { $_ => 1 } @read_ids_lower;
+ − 594 my @unique_lower = keys %ids;
+ − 595 %ids = map { $_ => 1 } @read_ids_upper;
+ − 596 my @unique_upper = keys %ids;
+ − 597 %ids = map { $_ => 1 } @read_ids_all;
+ − 598 my @unique_all = keys %ids;
+ − 599 for (@unique_all) {
+ − 600 my @junk = split /\//;
+ − 601 push (@reads, $junk[0]);
+ − 602 @junk = split /\t/;
+ − 603 push (@total_readlist, $junk[1]);
+ − 604 }
+ − 605
+ − 606 map { $count{$_}++ } @reads;
+ − 607 map {if ($count{$_} == 2){ @readid_good = split /\t/; push(@readlist, "$readid_good[1]");} elsif ($count{$_} == 1) { push(@readid_bad, "$_");}} keys (%count);
+ − 608 @reads = {};
+ − 609 undef %count;
+ − 610
+ − 611 for (@readlist){
+ − 612 chomp;
+ − 613 $current_id = $_;
+ − 614 my @pairs_lower = grep { $_ =~ /$current_id/} @unique_lower;
+ − 615 my @pairs_upper = grep{ $_ =~ /$current_id/} @unique_upper;
+ − 616 # print "good id: $current_id\n";
+ − 617 my $count_lower = scalar @pairs_lower;
+ − 618 my $count_upper = scalar @pairs_upper;
+ − 619 # print "count lower: $count_lower\n";
+ − 620 # print "count upper: $count_upper\n";
+ − 621 unless ((scalar @pairs_lower == 2) || (scalar @pairs_upper == 2)){
+ − 622 push (@readlist_good, "$current_id");
+ − 623 }
+ − 624 }
+ − 625 for (@readid_bad){
+ − 626 chomp;
+ − 627 @unsorted = ();
+ − 628 ($junk, $current_id) = split (/\t/);
+ − 629 print "current ID: $current_id\n";
+ − 630 @singleton = grep { $_ =~ /$current_id/} @total_readlist;
+ − 631 for (@singleton){
+ − 632 chomp;
+ − 633 $tag = $_;
+ − 634 @taglist = grep { $_ =~ /$tag/} @readtaglist;
+ − 635 # print "taglist: @taglist\n";
+ − 636 }
+ − 637 for (@taglist) {
+ − 638 @taglist_line = split /\t/;
+ − 639 push(@unsorted, $taglist_line[1], $taglist_line[2]);
+ − 640 $max = max @unsorted;
+ − 641 $min = min @unsorted;
+ − 642 }
+ − 643 # print "unsorted: @unsorted\n";
+ − 644 print "read mapping from $min to $max\n";
+ − 645 # print "min: $min\n";
+ − 646 # print "max: $max\n";
+ − 647
+ − 648 if ($min <= $lower_limit){
+ − 649 print "orphan discarded! min<lowerlimit\n----------------------------------------------\n";
+ − 650 }elsif ($max >= $upper_limit){
+ − 651 print "orphan discarded! max>upperlimit\n----------------------------------------------\n";
+ − 652 }elsif (($min >= $lower_main_limit) && ($max <= $upper_main_limit)){
+ − 653 print "orphan discarded! lower_main_limit<min-max<upper_main_limit\n----------------------------------------------\n";
+ − 654 }elsif (($min >= $elevated_cov_lower_start) && ($min <= $elevated_cov_lower_end - ($lower_limit / 2))){
+ − 655 print "orphan discarded! increased_coverage_lower_start<min<increased_coverage_lower_end\n----------------------------------------------\n";
+ − 656 }elsif (($max >= ($elevated_cov_upper_start + ($lower_limit / 2))) && ($max <= $elevated_cov_upper_end)){
+ − 657 print "orphan discarded! increased_coverage_upper_start<max<increased_coverage_upper_end\n----------------------------------------------\n";
+ − 658 }else {
+ − 659 push(@singletons, "@singleton\n");
+ − 660 print "orphan resurrected! \n----------------------------------------------\n";
+ − 661 }
+ − 662 # print "contiglength: $contiglength\n";
+ − 663 }
+ − 664
+ − 665 for (@singletons){
+ − 666 my @resurrection = split /\//;
+ − 667 push (@readlist_good, $resurrection[0]);
+ − 668 }
+ − 669 for (@readlist_good){
+ − 670 $_ =~ s/$/\/1\n/g;
+ − 671 push(@readlist_proofread, $_);
+ − 672 $_ =~ s/1$/2/g;
+ − 673 push(@readlist_proofread, $_);
+ − 674 }
+ − 675 return @readlist_proofread;
+ − 676 }
+ − 677
+ − 678 sub standard_deviation {
+ − 679 my(@numbers) = @_;
+ − 680 #Prevent division by 0 error in case you get junk data
+ − 681 return undef unless(scalar(@numbers));
+ − 682
+ − 683 # Step 1, find the mean of the numbers
+ − 684 my $total1 = 0;
+ − 685 foreach my $num (@numbers) {
+ − 686 if (!$num){
+ − 687 $num = 0;
+ − 688 }
+ − 689 $total1 += $num;
+ − 690 }
+ − 691 my $mean1 = $total1 / (scalar @numbers);
+ − 692 push (my @stdev, "$mean1");
+ − 693 # Step 2, find the mean of the squares of the differences between each number and the mean
+ − 694 my $total2 = 0;
+ − 695 foreach my $num (@numbers) {
+ − 696 if (!$num){
+ − 697 $num = 0;
+ − 698 }
+ − 699 $total2 += ($mean1-$num)**2;
+ − 700 }
+ − 701 my $mean2 = $total2 / (scalar @numbers);
+ − 702
+ − 703 # Step 3, standard deviation is the square root of the above mean
+ − 704 my $std_dev = sqrt($mean2);
+ − 705 push (@stdev, "$std_dev");
+ − 706 return @stdev;
+ − 707 }
+ − 708
+ − 709 sub assess_coverage{
+ − 710 my $readtaglist_FH = $_[0];
+ − 711 my @readtaglist;
+ − 712 my $from =$_[1];
+ − 713 my $to = $_[2];
+ − 714 my $where = $_[3];
+ − 715 my @taglist_line;
+ − 716 my @coverage_array_lower;
+ − 717 my @coverage_array_upper;
+ − 718 my @read_ids_lower;
+ − 719 my @read_ids_upper;
+ − 720 my %ids;
+ − 721 my @taglist;
+ − 722 my @unsorted;
+ − 723 my $min;
+ − 724 my $max;
+ − 725 my %coverage;
+ − 726 my @allnums;
+ − 727 my @coverage_change_position;
+ − 728 my @coverage_limits;
+ − 729
+ − 730 # print "assessing coverage from position $from to position $to\n";
+ − 731 open (TAGLIST,"<$readtaglist_FH") or die $!;
+ − 732 while (<TAGLIST>){
+ − 733 push (@readtaglist, "$_");
+ − 734 }
+ − 735 close (TAGLIST);
+ − 736
+ − 737 for (@readtaglist){
+ − 738 @taglist_line = split /\t/;
+ − 739 unless ($taglist_line[0] =~ /#/){
+ − 740 if ((($taglist_line[1] >= $from) && ($taglist_line[1] <= $to)) || (($taglist_line[2] >= $from) && ($taglist_line[2] <= $to))){
+ − 741 push (@coverage_array_lower, "$_");
+ − 742 push (@read_ids_lower, $taglist_line[6]);
+ − 743 }
+ − 744 }
+ − 745 }
+ − 746 %ids = map { $_ => 1 } @read_ids_lower;
+ − 747 my @unique_lower = keys %ids;
+ − 748 for (@unique_lower){
+ − 749 my @current_id = $_;
+ − 750 chomp;
+ − 751 @unsorted = ();
+ − 752 for (@current_id){
+ − 753 my $current_id = $_;
+ − 754 @taglist = grep { $_ =~ /$current_id/} @coverage_array_lower;
+ − 755 }
+ − 756 for (@taglist) {
+ − 757 @taglist_line = split /\t/;
+ − 758 push(@unsorted, $taglist_line[1], $taglist_line[2]);
+ − 759 $max = max @unsorted;
+ − 760 $min = min @unsorted;
+ − 761 }
+ − 762 my @nums = ($min .. $max);
+ − 763 for (@nums){
+ − 764 push (@allnums, "$_");
+ − 765 }
+ − 766 }
+ − 767 %coverage = map { $_ => 0 } @allnums;
+ − 768 map { $coverage{$_}++ } @allnums;
+ − 769 open (OUT,">out-$where.csv");
+ − 770
+ − 771 ########## detecting coverage peak
+ − 772 my $max_cov = 0;
+ − 773 my $max_cov_position;
+ − 774 my @cumulative_coverage;
+ − 775 map { unless (!$coverage{$_}){print OUT "$_,$coverage{$_}\n"; push (@cumulative_coverage, "$coverage{$_}"); if ($coverage{$_} > $max_cov){$max_cov = $coverage{$_}; $max_cov_position = $_; }}} ($from..$to);
+ − 776 my @average_coverage = &standard_deviation(@cumulative_coverage);
+ − 777 my $coverage_factor = $max_cov / $average_coverage[0];
+ − 778 open (OUT,">>out-$where.csv");
+ − 779 print OUT "\nmaximum coverage is $max_cov at position $max_cov_position\naverge coverage is: $average_coverage[0], sd: $average_coverage[1]\nfactor $coverage_factor\n";
+ − 780 close (OUT);
+ − 781
+ − 782 ######### detecting rapid changes in coverage
+ − 783 for ($from..($to - 10)){
+ − 784 my $position = $_;
+ − 785 my $cov = $coverage{$position};
+ − 786 unless (!$cov){
+ − 787 my @positions = ();
+ − 788 push (@positions, "$cov");
+ − 789 for (1 .. 10){
+ − 790 my $pos_plus = $position + $_;
+ − 791 if ($coverage{$pos_plus}){
+ − 792 push (@positions, "$coverage{$pos_plus}");
+ − 793 }
+ − 794 }
+ − 795 my @stdev = &standard_deviation(@positions);
+ − 796 if ($stdev[1] > 6.0){
+ − 797 print "positions ($position): @positions -> stdev: $stdev[1]\n";
+ − 798 push (@coverage_change_position, $position);
+ − 799 }elsif ($stdev[1] >= 4.5){
+ − 800 print "positions ($position): @positions -> stdev: $stdev[1]\n";
+ − 801 }
+ − 802 }
+ − 803 }
+ − 804 if (@coverage_change_position){
+ − 805 print "positions with rapidly changing coverage detected: @coverage_change_position\n";
+ − 806 my $start = min @coverage_change_position;
+ − 807 my $end = max @coverage_change_position;
+ − 808 push (@coverage_limits, "$start", "$end");
+ − 809 print "set limits from $coverage_limits[0] to $coverage_limits[1]\n";
+ − 810 }else{
+ − 811 print "no irregularities in coverage detected\n";
+ − 812 push (@coverage_limits, "0", "0");
+ − 813 return @coverage_limits;
+ − 814 }
+ − 815
+ − 816
+ − 817 ###### assessing whether coverage peak lies within putative conserved region, if yes accept prediction; if no, reject conserved region
+ − 818 if (($coverage_factor >= 1.6) && (($coverage_limits[0] < $max_cov_position) && ( $max_cov_position < $coverage_limits[1]))){
+ − 819
+ − 820 print "suspicious coverage peak detected within the predicted limits\n";
+ − 821 }else {
+ − 822 print "no coverage peak detected within predicted limits - rejecting limits\n";
+ − 823 @coverage_limits = ();
+ − 824 push (@coverage_limits, "0", "0");
+ − 825 }
+ − 826 return @coverage_limits;
+ − 827
+ − 828 }
+ − 829
+ − 830 sub check_ref_length{
+ − 831 my $ref=$_[0];
+ − 832 my $output_filename=$_[1];
+ − 833 my $critical=$_[2];
+ − 834 my $kmer = $_[3];
+ − 835 my @header;
+ − 836 my $header_count=0;
+ − 837 my (@sequence,@temp_output,@final_output);
+ − 838 my $full_sequence;
+ − 839 open(REF,"<$ref") or die $!;
+ − 840 while(<REF>){
+ − 841 chomp;
+ − 842 if ($_ =~ /^>/){
+ − 843 push(@header,$_);
+ − 844 # print "found header:\n$header[$header_count]\n";
+ − 845 $header_count++;
+ − 846 # print "header count: $header_count\n";
+ − 847 if (@sequence){
+ − 848 @temp_output=&finalize_sequence($critical,$header[-2],$kmer,@sequence);
+ − 849 for (@temp_output){
+ − 850 push(@final_output,$_);
+ − 851 }
+ − 852 }
+ − 853 undef @sequence;
+ − 854 }elsif ($_ =~ /[a-zA-Z]/){
+ − 855 # print "found sequence:\n$_\n";
+ − 856 push(@sequence,$_);
+ − 857 }
+ − 858 }
+ − 859 @temp_output=&finalize_sequence($critical,$header[-1],$kmer,@sequence);
+ − 860 for (@temp_output){
+ − 861 push(@final_output,$_);
+ − 862 }
+ − 863 # print "result:\n";
+ − 864 open (OUT,">$output_filename") or die $!;
+ − 865 for(@final_output){
+ − 866 # print "$_\n";
+ − 867 print OUT "$_\n";
+ − 868 }
+ − 869 close REF;
+ − 870 close OUT;
+ − 871
+ − 872 }
+ − 873
+ − 874 sub finalize_sequence{
+ − 875 my $critical = shift;
+ − 876 my $header = shift;
+ − 877 my $kmer = shift;
+ − 878 my $full_sequence=join("",@_);
+ − 879 my $factor;
+ − 880 my @output;
+ − 881 if (!$critical){
+ − 882 $factor=0;
+ − 883 }else{
+ − 884 $factor=ceil(length($full_sequence)/$critical);
+ − 885 }
+ − 886 if ($factor == 1){
+ − 887 push(@output,$header);
+ − 888 push(@output,$full_sequence);
+ − 889 }else{ #too long
+ − 890 print "\nreference is too long for mirabait to be handled in one go -> will be split into sub-sequences\n";
+ − 891 $header=substr $header, 1;
+ − 892 for (my $i=0; $i<$factor; $i++){
+ − 893 unless ((length(substr $full_sequence, $i*$critical, $critical+$kmer)-$kmer)<0){
+ − 894 push(@output,">sub$i\_" .$header);
+ − 895 push(@output,substr $full_sequence, $i*$critical, $critical+$kmer);
+ − 896 }
+ − 897 }
+ − 898 }
+ − 899 return @output;
+ − 900 }
+ − 901
+ − 902 sub create_manifest {
+ − 903 my ($iter, $sampleID, $refID, $mmode, $trim, $platform, $solexa_missmatches, $pair, $overhang, $reads, $ref, $redirect, $NFS_warn);
+ − 904 $iter = $_[0];
+ − 905 $sampleID = $_[1];
+ − 906 $refID = $_[2];
+ − 907 $mmode = $_[3];
+ − 908 $trim = $_[4];
+ − 909 $platform = $_[5];
+ − 910 $solexa_missmatches = $_[6];
+ − 911 $pair = $_[7];
+ − 912 $overhang = $_[8];
+ − 913 $reads = $_[9];
+ − 914 $ref = $_[10];
+ − 915 $redirect = $_[11];
+ − 916 $NFS_warn = $_[12];
+ − 917
+ − 918 if ($NFS_warn){
+ − 919 $NFS_warn = ":cnfs=warn"
+ − 920 }
+ − 921
+ − 922 open (MANIFEST,">manifest.conf") or die $!;
+ − 923 print MANIFEST "#manifest file for iteration $iter created by MITObim\n\nproject = $sampleID-$refID
+ − 924 \njob = genome,$mmode,accurate
+ − 925 \nparameters = -NW:mrnl=0$NFS_warn -AS:nop=1 $redirect $overhang $platform $trim -CO:msr=no $solexa_missmatches\n";
+ − 926 my @technology = split("_",$platform);
+ − 927 #-notraceinfo -
+ − 928 if ($mmode eq "mapping"){
+ − 929 print MANIFEST "\nreadgroup\nis_reference\ndata = $ref\nstrain = $refID\n";
+ − 930 }
+ − 931 print MANIFEST "\nreadgroup = reads\ndata = $reads\ntechnology = $technology[0]";
+ − 932 if ($pair){
+ − 933 # print MANIFEST "\nsegmentplacement = ---> <--- infoonly";
+ − 934 print MANIFEST "\nsegmentplacement = ---> <--- exclusion_criterion";
+ − 935 print MANIFEST "\ntemplatesize = -1 600 exclusion_criterion";
+ − 936 }
+ − 937 print MANIFEST "\nstrain = $sampleID\n";
+ − 938 close MANIFEST;
+ − 939 }
+ − 940 sub clean {
+ − 941 my $interval = shift;
+ − 942 my $cur = shift;
+ − 943 my $dir = $cur-$interval;
+ − 944 my $path=abs_path;
+ − 945 if (-d "$path/../iteration$dir"){
+ − 946 print "\nnow removing directory iteration$dir\n";
+ − 947 rmtree ("$path/../iteration$dir") or die $!;
+ − 948 }
+ − 949 }
+ − 950
+ − 951 sub remove_unmapped_contigs{
+ − 952 my $file = shift;
+ − 953 my $out = shift;
+ − 954 my ($head, $seq,$length);
+ − 955 my @split;
+ − 956 my ($dropped,$split,$trim)=(0,0,0);
+ − 957 open (FASTA,"<$out") or die $!."\ncould not open $file for reading\n";
+ − 958 while (<FASTA>){
+ − 959 chomp;
+ − 960 if ($_ =~ /^>/){
+ − 961 $head=$_;
+ − 962 }else {
+ − 963 if ($_ =~ /X/){
+ − 964 $dropped++;
+ − 965 print "drop:\n$head\n$_\n";
+ − 966 }else{
+ − 967 $length=length($_);
+ − 968 $_ =~ s/^N+//g; #remove all Ns from the beginning of the sequence
+ − 969 $_ =~ s/N+$//g; #remove all Ns from the end of the sequence
+ − 970 if ($length != length($_)){
+ − 971 $trim++;
+ − 972 }
+ − 973 @split=split(/N{3,}/); #split at every occurance of 3 or more Ns
+ − 974 if (@split > 1){
+ − 975 print "contig $head has been split into ".scalar @split." sub-contigs\n";
+ − 976 $split++;
+ − 977 for (my $i=0; $i<@split; $i++){
+ − 978 # print "$head\_$i\n$split[$i]\n";
+ − 979 $seq.="$head\_$i\n$split[$i]\n";
+ − 980 }
+ − 981 }else{
+ − 982 # print "$head\n$_\n";
+ − 983 $seq.="$head\n$_\n";
+ − 984 }
+ − 985 }
+ − 986 }
+ − 987 }
+ − 988 close FASTA;
+ − 989 if (($dropped) || ($split) || ($trim)){
+ − 990 print "creating backup of $out before changing it\n";
+ − 991 copy $out,$file;
+ − 992 open (OUT,">$out") or die $!."\ncould not open $out for writing\n";
+ − 993 print OUT "$seq\n";
+ − 994 close OUT;
+ − 995 }
+ − 996
+ − 997 if ($dropped){
+ − 998 print "\nremoved $dropped contigs from baitfile because they did not receive any hits\n";
+ − 999 }else{
+ − 1000 print "\nno need to remove any sequences from the baitfile\n";
+ − 1001 }
+ − 1002
+ − 1003 if (!$split){
+ − 1004 print "\nno need to split any contigs\n";
+ − 1005 }
+ − 1006 if ($trim){
+ − 1007 print "\n$trim contigs had N's trimmed off their ends\n";
+ − 1008 }
+ − 1009 }