comparison abacas.1.1.pl @ 0:a1fdc6925620 draft default tip

"planemo upload for repository https://github.com/phac-nml/abacas commit f6856961094e89e4cad0ee7df6c2a49bf005e4bf"
author nml
date Thu, 21 Nov 2019 12:53:20 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a1fdc6925620
1 #!/usr/bin/env perl
2 # Copyright (C) 2008 Genome Research Limited. All Rights Reserved.
3 #
4 # This program is free software; you can redistribute it and/or
5 # modify it under the terms of the GNU General Public License
6 # as published by the Free Software Foundation; either version 2
7 # of the License, or (at your option) any later version.
8 #This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
12 #
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16
17
18 use strict;
19 use warnings;
20 use POSIX qw(ceil floor);
21 use Getopt::Std;
22
23 #-------------------------------------------------------------------------------
24
25 if (@ARGV < 1) { usage();}
26
27 my ($help, $reference, $query_file, $choice, $sen, $seed, $mlfas, $fasta_bin, $avoid_Ns,
28 $tbx, $min_id, $min_cov, $diff_cov, $min_len,$add_bin_2ps, $pick_primer,
29 $flank, $chk_uniq,$redoMummer, $is_circular,$escapeToPrimers, $debug, $gaps_2file, $prefix)
30 =checkUserInput( @ARGV );
31
32 my $ref_inline;
33 if ($escapeToPrimers ==1)
34 {
35 pickPrimers ($reference, $query_file, $flank, $chk_uniq);
36 exit;
37 }
38 #BEGIN
39 #-------------------------------------------------------------------------------
40 print_header();
41
42 $ref_inline = Ref_Inline($reference);
43 #Get length of the reference sequence
44 my $ref_len = length($ref_inline);
45 print "PREPARING DATA FOR $choice \n";
46
47 ###################
48 # Running MUMmer #
49 ###################
50 my ($path_dir, $run_mum, $path_toPass);
51 if ($debug)
52 {
53 print "the seed is $seed \n";
54 print "RedoMummer= ",$redoMummer."\n";
55 }
56 my @do_mum_return;
57 if ($redoMummer)
58 {
59 @do_mum_return = doMummer($reference, $query_file, $choice,$sen,$seed,$min_id, $min_cov, $diff_cov,$min_len, $debug, $is_circular) or die "Couldn't run MUMmer\n";
60 }
61
62 ####################################
63 # Processing tiling output #
64 ####################################
65 if ($debug) {
66 print "Do tiling...\n";
67 }
68
69 #--------------------------------
70 my $mummer_tiling = $do_mum_return[0];
71 $path_dir = $do_mum_return[2];
72 $path_toPass = $do_mum_return[1];
73 ##################
74 #Do Tiling
75 #-------------------------------------------
76 doTiling ($mummer_tiling, $path_toPass, $path_dir,$reference, $query_file, $choice, $prefix,$mlfas, $fasta_bin, $avoid_Ns, $ref_len, $gaps_2file, $ref_inline, $add_bin_2ps, $pick_primer, $flank, $chk_uniq, $tbx);
77
78
79
80
81
82 ##########################################################################################################################################################
83 #################################Contig ordering ##########################################################
84 ###########################
85
86 sub help
87 {
88
89 die <<EOF
90
91 ***********************************************************************************
92 * ABACAS: Algorithm Based Automatic Contiguation of Assembled Sequences *
93 * *
94 * *
95 * Copyright (C) 2009 The Wellcome Trust Sanger Institute, Cambridge, UK. *
96 * All Rights Reserved. *
97 * *
98 ***********************************************************************************
99
100 USAGE
101 abacas.pl -r <reference file: single fasta> -q <query sequence file: fasta> -p <nucmer/promer> [OPTIONS]
102
103 -r reference sequence in a single fasta file
104 -q contigs in multi-fasta format
105 -p MUMmer program to use: 'nucmer' or 'promer'
106 OR
107 abacas.pl -r <reference file: single fasta> -q <pseudomolecule/ordered sequence file: fasta> -e
108 OPTIONS
109 -h print usage
110 -d use default nucmer/promer parameters
111 -s int minimum length of exact matching word (nucmer default = 12, promer default = 4)
112 -m print ordered contigs to file in multifasta format
113 -b print contigs in bin to file
114 -N print a pseudomolecule without "N"s
115 -i int mimimum percent identity [default 40]
116 -v int mimimum contig coverage [default 40]
117 -V int minimum contig coverage difference [default 1]
118 -l int minimum contig length [default 1]
119 -t run tblastx on contigs that are not mapped
120 -g string (file name) print gaps on reference to file name
121 -a append contigs in bin to the pseudomolecule
122 -o prefix output files will have this prefix
123 -P pick primer sets to close gaps
124 -f int number of flanking bases on either side of a gap for primer design (default 350)
125 -R Redo mummer [default 1, use -R 0 to avoid running mummer]
126 -e Escape contig ordering i.e. go to primer design
127 -c Reference sequence is circular
128
129 EOF
130 }
131 ########
132 sub usage
133 {
134
135 die <<EOF
136
137 USAGE
138 abacas.pl -r <reference file: single fasta> -q <query sequence file: fasta> -p <nucmer/promer> [OPTIONS]
139 -r reference sequence in a single fasta file
140 -q contigs in multi-fasta format
141 -p MUMmer program to use: 'nucmer' or 'promer'
142 for contig ordering and primer design
143
144 OR
145 abacas.pl -r <reference file: single fasta> -q <pseudomolecule/ordered sequence file: fasta> -e 1
146 to escape contig ordering and go directly to primer design
147
148 OR
149 abacas.pl -h for help
150
151 EOF
152 }
153 ########
154 ##################
155 sub print_header
156 {
157 print "
158 ***********************************************************************************
159 * ABACAS: Algorithm Based Automatic Contiguation of Assembled Sequences *
160 * *
161 * *
162 * Copyright (C) 2009 The Wellcome Trust Sanger Institute, Cambridge, UK. *
163 * All Rights Reserved. *
164 * *
165 ***********************************************************************************
166 \n";
167 }
168 #########################################
169 sub checkUserInput{
170 my %options;
171 getopts('hr:q:p:ds:mbNi:v:V:l:tg:ao:Pf:R:u:ecD', \%options);
172
173 my ($help, $reference, $query_file, $choice, $sen, $seed, $mlfas, $fasta_bin, $avoid_Ns,
174 $tbx, $min_id, $min_cov, $diff_cov, $min_len,$add_bin_2ps, $pick_primer,
175 $flank, $chk_uniq,$redoMummer, $is_circular,$escapeToPrimers, $debug, $gaps_2file, $prefix);
176
177 if($options{h}) {
178 $help = $options{h};
179 help();
180 }
181 else{
182 $help =0;
183 }
184 if ($options{r} && $options{q} ){
185 ($reference, $query_file) = ($options{r},$options{q});
186
187 }
188 else
189 {
190 usage() unless $options{e};
191 }
192
193 if ($options{p})
194 {
195 $choice = $options{p};
196 unless ($choice eq "nucmer" || $choice eq "promer"){
197 print "Unknown MuMmer function\n Please use nucmer or promer\n";
198 exit;
199 }
200 }
201 else
202 {
203 usage() unless $options{e};
204 }
205 if ($options{e}) #$escapeToPrimers)
206 {
207 print_header();
208 print "Primer design selected,... escaping contig ordering\n";
209 $escapeToPrimers = 1;
210 $chk_uniq = "nucmer";
211 $choice = "";
212 }
213 else
214 {
215 $escapeToPrimers = 0;
216 }
217 if ($options{d}) {$sen =1;} else {$sen =0;} #print $sen , " ---sen\n";
218 #print $options{t}, "\n"; exit;
219 if($options{t}) {$tbx = 1;} else {$tbx = 0;} #print $tbx, " ---tbx\n"; #
220 unless ($options{s}){
221 if ($choice eq "nucmer"){
222 $seed = 12;
223 }
224 else{
225 $seed =4;
226 }
227 }
228 if ($options{m}){$mlfas =1;} else { $mlfas =0; } #print $mlfas , " ---mlfasta\n";
229 if ($options{b}) { $fasta_bin =1;} else {$fasta_bin =0;}
230 if ($options {N}) {$avoid_Ns =1;} else {$avoid_Ns=0;}
231
232 unless($options{i}) {$min_id =40;}
233 unless ($options{v}) {$min_cov =40;}
234 unless ($options{V}) {$diff_cov =1;}
235 unless ($options {l}) {$min_len = 1;}
236 if ($options{a}) {$add_bin_2ps = 1; }else {$add_bin_2ps =0;}
237 if ($options{P}) {$pick_primer=1;} else {$pick_primer =0;}
238 if ($options{f}) {$flank = $options{f};} else {$flank = 1000;}
239 if ($options{u}) {$chk_uniq = $options{u}; } else {$chk_uniq = "nucmer";}
240
241 if($options{R}) {$redoMummer = $options{R}; }else {$redoMummer = 1;}
242 unless ($options{c}) {$is_circular = 0;}
243 if ($options{g}) {$gaps_2file = $options{g};} else {$gaps_2file ="";}
244 if($options{o}) {$prefix = $options{o};} else {$prefix = "";}
245 unless ($options{D}) {$debug =0};
246 if ($tbx ==1 && $fasta_bin !=1)
247 {
248 print "ERROR: Please use -t -b if you want to run tblastx on contigs in bin\n";
249 exit;
250 }
251 #print $tbx, "\n"; exit;
252 return ($help, $reference, $query_file, $choice, $sen, $seed, $mlfas, $fasta_bin, $avoid_Ns,
253 $tbx, $min_id, $min_cov, $diff_cov, $min_len,$add_bin_2ps, $pick_primer,
254 $flank, $chk_uniq,$redoMummer, $is_circular,$escapeToPrimers, $debug, $gaps_2file, $prefix);
255
256 }
257 #############
258 ## get the reference sequence in one line
259 #--------------------------------------------------
260 sub Ref_Inline
261 {
262 my $ref = shift;
263 open (refFH, $ref) or die "Could not open file\n";
264 my $seq ="";
265 my @r = <refFH>;
266 my $num_chr =0;
267 foreach(@r){
268 if ($_ =~ /\>/){
269 $num_chr +=1;
270 }
271 }
272 if ($num_chr > 1){
273 print "\nERROR: Please use a single fasta reference file. You can simply merge chromosomes in to a union fasta file.\n\n";
274 exit;
275 }
276 shift @r;
277 foreach(@r){
278 chomp;
279 $seq = $seq.$_;
280 }
281 return $seq;
282 }
283 ################
284 # Run mummer
285 #--------------------------------------------
286 sub doMummer
287 {
288 my ($reference, $query_file, $choice, $sen,$seed,$min_id, $min_cov, $diff_cov,$min_len, $debug, $is_circular ) = @_;
289
290 my $df = 'delta-filter';
291 my $st = 'show-tiling';
292 my $ask = 'which';
293 my ($path_toPass, $run_mum); # params to return...
294 my ($command, $Path, $dir) = checkProg($choice);
295 my ($run_df, $df_path, $df_dir) = checkProg($df);
296 my ($run_st, $st_path, $st_dir) = checkProg($st);
297 my (@running, @deltaRes, @coordsRes);
298 if ($choice eq "nucmer")
299 {
300 if ($sen ==0)
301 {
302 @running = `$command --maxmatch -l $seed -p $choice $reference $query_file &> /dev/null`;
303 @deltaRes = `$run_df -q $choice.delta >$choice.filtered.delta`;
304 if ($is_circular == 1)
305 {
306 @coordsRes = `$run_st -c -i $min_id -v $min_cov -V $diff_cov -l $min_len -R -u unused_contigs.out $choice.filtered.delta > $choice.tiling`;
307 }
308 else
309 {
310 @coordsRes = `$run_st -i $min_id -v $min_cov -V $diff_cov -l $min_len -R -u unused_contigs.out $choice.filtered.delta > $choice.tiling`;
311
312 }
313 }
314 else
315 {
316 @running = `$command -p $choice $reference $query_file &> /dev/null`;
317 @deltaRes = `$run_df -q $choice.delta >$choice.filtered.delta`;
318 if ($is_circular ==1) {@coordsRes = `$run_st -c -i $min_id -v $min_cov -V $diff_cov -l $min_len -R -u unused_contigs.out $choice.filtered.delta > $choice.tiling`;}
319 else { @coordsRes = `$run_st -i $min_id -v $min_cov -V $diff_cov -l $min_len -R -u unused_contigs.out $choice.filtered.delta > $choice.tiling`;}
320 }
321
322 }
323 else
324 {
325 if ($sen ==0)
326 {
327 @running = `$command --maxmatch -l $seed -x 1 -p $choice $reference $query_file &> /dev/null`;
328 @deltaRes = `$run_df -q $choice.delta >$choice.filtered.delta`;
329 if ($is_circular == 1)
330 {
331 @coordsRes= `$run_st -c -i $min_id -v $min_cov -V $diff_cov -l $min_len -R -u unused_contigs.out $choice.filtered.delta > $choice.tiling`;
332 }
333 else
334 {
335 @coordsRes= `$run_st -i $min_id -v $min_cov -V $diff_cov -l $min_len -R -u unused_contigs.out $choice.filtered.delta > $choice.tiling`;
336 }
337 }
338 else
339 {
340 @running = `$command -l $seed -p $choice $reference $query_file &> /dev/null`;
341 @deltaRes = `$run_df -q $choice.delta >$choice.filtered.delta`;
342 if ($is_circular == 1) {
343 @coordsRes= `$run_st -c -i $min_id -v $min_cov -V $diff_cov -l $min_len -R -u unused_contigs.out $choice.filtered.delta > $choice.tiling`;
344 }
345 else
346 {
347 @coordsRes= `$run_st -i $min_id -v $min_cov -V $diff_cov -l $min_len -R -u unused_contigs.out $choice.filtered.delta > $choice.tiling`;
348 }
349 }
350 }
351 my $Coordsfull= "$choice.tiling";
352 return ($Coordsfull,$Path, $dir);
353 }
354
355 ## #############################################################################
356 sub checkProg{ #checks if a given excutable is in the path...
357 my $prog = shift;
358 my $ask = 'which';
359 my @check_prog = `$ask $prog`;
360 my $path_toPass;
361 my $path_dir;
362 my $command;
363 if (defined $check_prog[0] && $check_prog[0] =~ /$prog$/)
364 {
365 $path_toPass = $prog;
366 $command = $prog;
367 }
368 else
369 {
370 print "\nENTER the directory for your ", $prog, " executables [default ./]: ";
371 my $path=<STDIN>;
372 chomp $path;
373 $path_dir = $path;
374 if ($path_dir =~/\/$/)
375 {
376 $path_dir = $path_dir;
377 }
378 else
379 {
380 $path_dir = $path_dir."/";
381 }
382 my @final_check = `$ask $command`;
383 if (exists $final_check[0] && $final_check[0] =~ /$prog$/)
384 {
385 $command = $path_dir.$prog;
386 $path_toPass = $command;
387 }
388 else
389 {
390 print "ERROR: Could not run ", $prog, ", please check if it is installed in your path\n or provide the directory \n";
391 exit;
392 }
393 }
394
395 return ($command, $path_toPass, $path_dir);
396 }
397
398 ##############################
399 # converts a fasta file to an ordered single line
400 #--------------------------------------------------
401 sub Fasta2ordered
402 {
403 if( @_ != 1 )
404 {
405 print "Usage: Fasta2ordered <fasta_file>\n";
406 exit;
407 }
408 my $fast = shift; #print $fast; exit;
409 my @fasta = split (/\n/, $fast);
410 if ($fasta[0] =~ /\>/ ) #remove chromosome name if exists in the onput sequence.
411 {
412 my $ch_name = $fasta[0];
413 shift @fasta;
414 }
415 #print $fasta[0]; exit;
416 foreach(@fasta){chomp;}
417 my $num_lines = scalar(@fasta);
418 my $dna = '';
419 for(my $i=0; $i< $num_lines; $i+=1)
420 {
421 $dna = $dna.$fasta[$i];
422 }
423 my $ordered_dna = $dna;
424 return $ordered_dna;
425 }
426 ##############################################################
427 # Hash input contigs
428 #------------------------------------------------
429
430 sub hash_contigs {
431 if( @_ != 1 )
432 {
433 print "Usage: hash_contigs contigs_file";
434 exit;
435 }
436 my $contigs_file = shift;
437 if( $contigs_file =~ /\.gz$/ )
438 {
439 open(CONTIGS, $contigs_file, "gunzip -c $contigs_file |" ) or die "Cant open contigs file: $!\n";
440 }
441 else
442 {
443 open( CONTIGS, $contigs_file) or die "Cant open contigs file: $!\n";
444 }
445
446 my %contigs_hash; # hash to store contig names and sequences
447 my $contigName;
448
449
450 while (<CONTIGS>) ##add error checking...
451 {
452 if (/^>(\S+)/) {
453 $contigName=$1;
454 }
455 else {
456 chomp;
457 $contigs_hash{$contigName} .= $_;
458 }
459 }
460 close(CONTIGS);
461 #tdo
462 ## check if qual exists
463 my %contigs_qual_hash;
464
465
466 if (-r "$contigs_file.qual" or -r "$contigs_file.qual.gz") {
467 if( -r "$contigs_file.qual.gz" )
468 {
469 open(CONTIGS, "$contigs_file.qual.gz", "gunzip -c $contigs_file.qual.gz |" ) or die "Cant open contigs file: $!\n";
470 }
471 else
472 {
473 open( CONTIGS, "$contigs_file.qual" ) or die "Cant open contigs file: $!\n";
474 }
475
476
477 while (<CONTIGS>) {
478 if (/^>(\S+)/) {
479 $contigName=$1;
480 }
481 else {
482 chomp;
483 $contigs_qual_hash{$contigName} .= $_;
484 }
485 }
486
487 } # end tdo # end if exist
488
489
490 return (\%contigs_hash,\%contigs_qual_hash);
491 }
492 #######################
493 ##############################
494 ### it gets a delta name
495 sub getMummerComparison{
496 my $deltaName = shift;
497
498
499 ### transform the delta file to coords
500 my $call ="show-coords -H -T -q $deltaName > $deltaName.coords ";
501 !system(" $call") or die "Problems doing the show-coords comparison: $call $!\n";
502
503
504 ### willh old results
505 my %h;
506
507 ### has as index the postion with the max hits
508 my %h_max;
509 my $tmp=0;
510 my $tmp_index;
511 my $key='';
512 my $is_promer =0;
513 if ($deltaName =~/^promer/)
514 {
515 $is_promer =1;
516 }
517
518 open (F,"$deltaName.coords") or die "Problem in getComparisonFile to open file $deltaName.coords\n";
519 my @File=<F>;
520 my @a;
521 @a=split(/\s+/,$File[0]);
522 $tmp=$a[5];
523 $tmp_index=$a[0];
524 if ($is_promer ==1)
525 {
526 $key=$a[12]; ## nucmer: $key = $a[8]
527 }
528 else
529 {
530 $key = $a[8];
531 }
532
533 foreach (@File) {
534 @a=split(/\s+/);
535
536 if ($is_promer ==1)
537 {
538 push @{ $h{$a[12]}}, "$a[12]\t$a[11]\t$a[7]\t$a[5]\t0\t0\t$a[2]\t$a[3]\t$a[0]\t$a[1]\t1\t$a[5]\n";
539 if ($key eq $a[12] and $a[5]>$tmp)
540 {
541 $tmp=$a[5]; # length
542 $tmp_index=$a[0]; # position reference
543 }
544 elsif ($key ne $a[12]) {
545 ### here possible bugg...
546 $h_max{$tmp_index}=$key;
547 $key=$a[12];
548 $tmp=$a[5]; # length
549 $tmp_index=$a[0]; # position reference
550
551 }
552 } #end if
553 #else i.e. if nucmer
554 else
555 {
556 push @{ $h{$a[8]}}, "$a[8]\t$a[6]\t$a[6]\t$a[5]\t0\t0\t$a[2]\t$a[3]\t$a[0]\t$a[1]\t1\t$a[5]\n";
557 if ($key eq $a[8] and $a[5]>$tmp)
558 {
559 $tmp=$a[5]; # length
560 $tmp_index=$a[0]; # position reference
561 }
562 elsif ($key ne $a[8]) {
563 ### here possible bugg...
564 $h_max{$tmp_index}=$key;
565 $key=$a[8];
566 $tmp=$a[5]; # length
567 $tmp_index=$a[0]; # position reference
568
569 }
570 }
571
572 }
573 $h_max{$tmp_index}=$key;
574 # print Dumper %h_max;
575
576 return (\%h,\%h_max);
577 }
578 ##################################
579 ###########################
580 sub writeBinContigs2Ref{
581 my $nameBin = shift;
582 my $name = shift;
583
584 open (F, "$nameBin") or die "Couldn't find file $nameBin: $!\n";
585
586 my @ar;
587
588 my $count=0;
589
590 while (<F>) {
591 push @ar, $_;
592 $count++;
593 }
594 #### sa4: added error checking:- if file is empty
595 if (scalar(@ar) < 1)
596 {
597 print "No contigs in unusedcontigs file\n";
598 $count = 0;
599
600 }
601 else
602 {
603 open (F, "> $name.notMapped.contigs.tab") or die "Couldn't write file $name.tab: $!\n";
604 print F doArt(\@ar);
605 close(F);
606 }
607 return $count;
608
609 }
610
611
612 ##############################
613 sub doArt{
614 my ($ref) = @_;
615
616
617 ## hash of array with all positions of the contig
618 my %Pos;
619
620 ## Hash with note of result line of nucmer
621 my %lines;
622
623 foreach (@$ref) {
624 chomp;
625 my @ar=split(/\t/);
626 push @{ $Pos{$ar[12]}}, "$ar[0]..$ar[1]";
627 $lines{$ar[12]} .= "FT $_\n";
628 }
629
630 my $res;
631
632 foreach my $contig (keys %lines) {
633
634 if (scalar(@{ $Pos{$contig} } >1)) {
635 my $tmp;
636
637 foreach (@{ $Pos{$contig} }) {
638 $tmp.="$_,";
639 }
640 $tmp =~ s/,$//g; # get away last comma
641 $res .= "FT contig join($tmp)\n";
642 }
643 else {
644 $res .= "FT contig $Pos{$contig}[0]\n";
645 }
646 $res .= "FT /systematic_id=\"$contig\"\n";
647 $res .= "FT /note=\"Contig $contig couldn't map perfectly.\n";
648 $res .= $lines{$contig};
649 $res .= "FT \"\n";
650
651 }
652
653 return $res;
654 }
655
656 ##########################################################################
657 #---------------------------
658 sub makeN #creates a string of Ns
659 {
660 my $n = shift;
661 my $Ns= "N";
662 for (my $i =1; $i < $n; $i+=1)
663 {
664 $Ns = $Ns."N";
665 }
666 return $Ns;
667 }
668
669 ###########################################################################
670 ## reverse complement a sequence
671 #---------------------------------------------
672 sub revComp {
673 my $dna = shift;
674 my $revcomp = reverse($dna);
675
676 $revcomp =~ tr/ACGTacgt/TGCAtgca/;
677
678 return $revcomp;
679 }
680
681 ################################################################################
682 ### function to visualize rep. regions in Reference genome.
683 sub findRepeats
684 {
685 my $reference = shift;
686 my $name = shift;
687 my $path_prog = shift;
688
689 # get path
690 my ($path_coords) = $path_prog;
691
692 $path_coords =~ s/nucmer/show-coords/;
693
694 my $call = "$path_prog --maxmatch -c 100 -b 40 -p $name.repeats -l 25 $reference $reference &> /dev/null ";
695 !system("$call") or die "Problems doing the nucmer comparison: $call $!\n";
696 $call ="$path_coords -r -c -l $name.repeats.delta > $name.repeats.coords ";
697 !system(" $call") or die "Problems doing the show-coords comparison: $call $!\n";
698
699 my @Res;
700 open (F, "$name.repeats.coords" ) or die "Problems to open file $reference.repeats.coords: Is MUMmer installed correctly and inserted in the PATH environment variable? ($!)\n";
701 $_=<F>; $_=<F>; $_=<F>; $_=<F>;$_=<F>;
702 while (<F>) {
703 my @ar = split(/\s+/);
704 if (!($ar[1] == $ar[4] or $ar[2] == $ar[5] or $ar[7] > 100000)) { # to exclude self alignment
705
706 foreach ($ar[1]..$ar[2]) {
707 $Res[($_-1)]++;
708 }
709 }
710 }
711
712 ### write the result to the plot file
713
714 my $res;
715 foreach (@Res){
716 if (defined($_)) {
717 $res .= "$_\n";
718 }
719 else {
720 $res .= "0\n";
721 }
722 }
723
724 open (F, "> $name.Repeats.plot") or die "Couldn't open file $name.plot to write: $! \n";
725 print F $res;
726 close(F);
727
728 ### delete files
729 unlink("$name.repeats.delta");
730 unlink("$name.repeats.coords");
731 unlink("$name.repeats.cluster");
732 }
733
734 ################################################################################
735 # reverse a list of qualities
736 #------------------------------
737 sub reverseQual{
738 my $str = shift;
739
740 $str =~ s/\s+$//g;
741
742 my @ar = split(/\s/,$str);
743 my $res;
744
745 for (my $i=(scalar(@ar)-1);$i>=0;$i--) {
746 $res.="$ar[$i] ";
747 }
748 return $res;
749 }
750
751 ##############################################################################
752 # ----------------
753 sub getPosCoords{
754 my $ref_ar = shift;
755 my $contig = shift;
756
757 my $offset = shift;
758 my $res;
759 # print Dumper $$ref_ar{$contig};
760 foreach (@{$$ref_ar{$contig}}) {
761 print "in getPos Coords: $_\n";;
762
763 my @ar=split(/\t/);
764 $ar[6]+=$offset;
765 $ar[7]+=$offset;
766 $res .= join("\t",@ar);;
767 }
768
769 return $res;
770
771 }
772 #############################################################################
773 # -----------------------------
774 sub getPosCoordsTurn{
775 my $ref_ar = shift;
776 my $contig = shift;
777
778 my $offset = shift;
779
780
781 my $res;
782
783 # print Dumper $$ref_ar{$contig};
784 foreach (@{$$ref_ar{$contig}}) {
785 my @ar=split(/\t/);
786 my $tmp_8=$ar[8];
787 my $tmp_9=$ar[9];
788
789 $ar[8]=$ar[6]+$offset;
790 $ar[9]=$ar[7]+$offset;
791 $ar[6]=$tmp_8;
792 $ar[7]=$tmp_9;
793
794 ## change query subject
795
796 $res .= join("\t",@ar);;
797 }
798
799 return $res;
800
801 }
802
803 ############################################################################
804 #------------------------
805 sub printStats{
806
807 my ($num_fortillingcontigs,$num_notsetTilling,$num_mapped, $num_contigs, $num_inComparisoncontigs, $ref_len, $total_bases_mpd) = @_;
808 $num_fortillingcontigs=$num_notsetTilling+$num_mapped;
809 my $res;
810 $res.= "Short statistics of run.\n";
811 $res.= "$num_contigs\t\tcontigs entered to be mapped against the reference.\n";
812 $res.= sprintf("$num_inComparisoncontigs\t%0.2f \%\tmade a hit against the reference to the given parameter (-s -d etc)\n",($num_inComparisoncontigs*100/$num_contigs));
813 $res.= sprintf("$num_fortillingcontigs\t%0.2f \%\twere considered for the tilling graph (-s -d etc)\n",($num_fortillingcontigs*100/$num_contigs));
814 $res.= sprintf("$num_mapped\t%0.2f \%\tare mapped in the tilling graph (-s -d etc)\n",($num_mapped*100/$num_contigs));
815 $res.= sprintf("\nCoverage: The reference is $ref_len long. Up to $total_bases_mpd bp (%0.2f \%) are covered by the contigs (This doesn't mean that these regions are really similar...)\n",($total_bases_mpd*100/$ref_len));
816
817 print $res;
818
819 }
820 ##################################################
821 #### Do Tiling
822 #----------------------------------------------------
823 sub doTiling {
824
825 my ($mummer_tiling, $path_toPass, $path_dir,$reference, $query_file, $choice, $prefix,$mlfas, $fasta_bin, $avoid_Ns, $ref_len, $gaps_2file, $ref_inline, $add_bin_2ps, $pick_primer, $flank, $chk_uniq, $run_blast) = @_;
826
827 ### these are also defined in the main script.... to be changed!!
828 my ($num_contigs , $num_inbincontigs, $avg_cont_size,$num_overlaps , $num_gaps,
829 $num_mapped,$total_bases_mpd,$p_ref_covered,$num_ambigus,$num_inComparisoncontigs,
830 $num_fortillingcontigs,$num_notsetTilling)= (0,0,0,0,0,0,0,0,0,0,0,0);
831
832 my ($href, $ref_contigs_qual) = hash_contigs($query_file);
833 my $qualAvailable=0;
834 my %contigs_hash = %{$href};
835 my @c_keys = keys %contigs_hash;
836 $num_contigs = scalar(@c_keys);
837 my @cont_lens;
838 my (@ids ,$id, $id_len);
839 my (@Rs, @Re, @G, @Len, @cov, @pid, @orient, @Cid);
840 my (@Ps, @Pe);
841 my ($total);
842 my $g; #define gap size between contigs
843 my $tiling_gap; #gap size from tiling graph
844 open (TIL, $mummer_tiling) or die "Could not open $mummer_tiling: $!";
845 while (<TIL>)
846 {
847 chomp;
848 if ($_ =~/^>/)
849 {
850 my $line = substr $_, 1;
851 my @splits = split /\s+/, $line;
852 $id = $splits[0];
853 push @ids, $id;
854 $id_len= $splits[1];
855 }
856 else
857 {
858 my @splits = split /\s+/, $_;
859 push @Rs, $splits[0];
860 push @Re, $splits[1];
861 push @G, $splits[2];
862 push @Len, $splits[3];
863 push @cov, $splits[4];
864 push @pid, $splits[5];
865 push @orient, $splits[6];
866 push @Cid, $splits[7];
867 }
868
869 }
870 close (TIL);
871 if (scalar(@Rs) != scalar(@Re))
872 {
873 print "ERROR: unequal array size\n";
874 exit;
875 }
876 else
877 {
878 $total = scalar(@Rs);
879 $num_mapped = scalar(@Rs);
880 }
881 my $ref_loc = $reference; # get locations of reference and query files
882 my $qry_loc = $query_file;
883 my $dif_dir =0; #assume query and reference are in the working directory
884 my @splits_reference = split (/\//, $reference);
885 my $new_reference_file = $splits_reference[(scalar(@splits_reference)-1)];
886 my @splits_query = split (/\//, $query_file);
887 my $new_query_file = $splits_query[(scalar(@splits_query)-1)];
888 if ($prefix eq "")
889 {
890 $prefix = $new_query_file."_".$new_reference_file;
891 }
892 #-------------------------------------------------------------------
893 #define file handles for output files and open files to write output
894 #-------------------------------------------------------------------
895 my ($seqFH,$tabFH,$binFH,$crunchFH, $gapFH, $mlFH, $dbinFH, $avoidNFH);
896 open ($seqFH, '>', $prefix . '.fasta') or die "Could not open file $prefix.fasta for write: $!\n";
897 open ($tabFH, '>', $prefix . '.tab') or die "Could not open file $prefix.tab for write: $!\n";
898 open ($binFH, '>', $prefix . '.bin') or die "Could not open file $prefix.bin for write: $!\n";
899 open ($crunchFH, '>', $prefix . '.crunch') or die "Could not open file $prefix.crunch for write: $!\n";
900 open ($gapFH, '>', $prefix . '.gaps') or die "Could not open file $prefix.gaps for write: $!\n";
901 if ($mlfas ==1)
902 {
903 open ($mlFH, '>', $prefix . '.contigs.fas') or die "Could not open file $prefix.contigs.fas for write: $!\n";
904 }
905 if ($fasta_bin ==1)
906 {
907 open ($dbinFH, '>', $prefix . '.contigsInbin.fas') or die "Could not open file $prefix.contigsInbin.fas for write: $!\n";
908 }
909 if ($avoid_Ns ==1)
910 {
911 open ($avoidNFH, '>', $prefix .'.NoNs.fasta') or die "Could not open file $prefix.NoNs.fasta for write: $!\n";
912 }
913 #-------------------------------------------------------------------------
914 # Writing tiling graph and generating a pseudomolecule
915 # Note use use ps for pseudomolecule
916 #@Ps = start of ps, and @Pe = end of ps
917 my $ps_start =1;
918 $Ps[0] = 1;
919 $Pe[0] = $Ps[0] + $Len[0] -1;
920 my $tmp_qual;
921 my $tmp_nqual;
922 my $tmp_seq ="";
923 my $tmp_nseq ="";
924 print "Total contigs = $total \n";
925
926 #------------------------------------------------------------
927 # The 'for loop' loops over each contig in the Tiling output
928 #Writing to file is done for each contig to speed up the process
929 #This part could potentially be a separate subroutine
930
931 print $tabFH "ID ",$id, "\n";
932 print $seqFH ">", "ordered_", $id, "\n";
933 for (my $i=1; $i <= $total; $i+=1)
934 {
935 my $covv =sprintf("%.0f",$cov[$i -1]); #ROUNDING
936 my $pidd = sprintf("%.0f", $pid[$i -1]);
937 my ($contig_coord, $color, $contig_seq);
938 my $contig_qual='';
939 $tiling_gap = $G[$i -1];
940 if ($tiling_gap <= 5){ #insert 100Ns for overlaps and gaps of size less than or equal to 5bp
941 $g = 99; # default gap size to
942 }
943 else{
944 $g = $tiling_gap;
945 }
946 if (defined($Len[$i]))
947 {
948 $Ps[$i] = $Pe[$i-1] +$g +1;
949 $Pe[$i] = $Ps[$i] + $Len[$i] -1;
950 $total_bases_mpd+=$Len[$i];
951 }
952
953 if ($Rs[$i -1] <0) #check if a reference starting position is less than 0
954 {
955 $Rs[$i -1] =1;
956 }
957
958 if($orient[$i-1] eq "+")
959 {
960 $contig_coord = $Ps[$i -1]."..".$Pe[$i-1];
961 $color = 4;
962 $contig_seq = $contigs_hash{$Cid[$i-1]};
963 }
964 else
965 {
966 $contig_coord = "complement(".$Ps[$i -1]."..".$Pe[$i-1].")";
967 $color =3;
968 $contig_seq = revComp($contigs_hash{$Cid[$i-1]}); #REVERSE COMPLEMENT A SEQUENCE
969
970 }
971 push (@cont_lens, length($contig_seq));
972
973 # tdo
974 if (defined($$ref_contigs_qual{$Cid[$i-1]})) {
975 ## flag to know, that the qual exists
976 $qualAvailable=1;
977 $contig_qual = $$ref_contigs_qual{$Cid[$i-1]};
978 }
979 $tmp_qual .= $contig_qual;
980 $tmp_seq .= $contig_seq;
981 if ($avoid_Ns ==1)
982 {
983 $tmp_nseq.= $contig_seq;
984 #tdo
985 $tmp_nqual .= $contig_qual;
986 }
987 if ($mlfas ==1)
988 {
989 my $head = $Cid[$i-1];
990 my $multifasta_seq = write_Fasta_headers ($contig_seq,$head);
991 print $mlFH $multifasta_seq;
992 }
993 if ($Re[$i -1] > $ref_len)
994 {
995 $Re[$i -1] = $ref_len -1;
996 }
997 if ($Pe[$i -1] > length($tmp_seq))
998 {
999 $Pe[$i -1] = length($tmp_seq);
1000 }
1001
1002 #-----------------------------------------
1003 print $crunchFH $covv, " ", $pidd, " ", $Ps[$i -1], " ", $Pe[$i -1], " ", $Cid[$i -1], " ", $Rs[$i -1], " ", $Re[$i-1], " ", "unknown NONE\n";
1004
1005 #WRITE FEATURE FILE
1006 print $tabFH "FT contig ",$contig_coord, "\n";
1007 print $tabFH "FT ", "/systematic_id=\"", $Cid[$i-1],"\"","\n";
1008 print $tabFH "FT ", "/method=\"", "mummer\"", "\n";
1009 print $tabFH "FT ", "/Contig_coverage=\"",$cov[$i -1], "\"", "\n";
1010 print $tabFH "FT ", "/Percent_identity=\"",$pid[$i -1], "\"", "\n";
1011 if ($tiling_gap > 1) #WRITE GAP LOCATIONS AND SIZE TO FILE
1012 {
1013 my $gap_start = $Pe[$i -1] +1;
1014 my $gap_end = "";
1015 if (defined $Ps[$i])
1016 {
1017 $gap_end = $Ps[$i] -1;
1018 }
1019 my $ref_start = $Re[$i -1] +1;
1020 my $ref_end;
1021 if (defined $Rs[$i])
1022 {
1023 $ref_end =$Rs[$i]-1;
1024 }
1025 else
1026 {
1027 $ref_end = "END";
1028 }
1029 print $gapFH "Gap\t",$tiling_gap, "\t", $gap_start, "\t", $gap_end, "\t", $ref_start, "\t", $ref_end,"\n";
1030 if ($gaps_2file ne "" && $ref_start < $ref_len)
1031 {
1032 my $ref_gapsFH;
1033 open ($ref_gapsFH, '>', $gaps_2file.'.Gaps_onRef') or die "Could not open file $gaps_2file.Gaps_onRef for write: $!\n";
1034 my $gapOnref = substr ($ref_inline, $ref_start, $g);
1035 print $ref_gapsFH ">",$g,"_",$ref_start, "\n";
1036 my $file_toPrint = write_Fasta ($gapOnref);
1037 print $ref_gapsFH $file_toPrint;
1038 }
1039 }
1040 else
1041 {
1042 $color = 5;
1043 print $tabFH "FT ", "/Overlapping=\"", "YES\"", "\n";
1044 }
1045 my $ns = makeN($g);
1046 $tmp_seq = $tmp_seq.$ns;
1047 #tdo
1048 for (1..length($ns)) {
1049 $tmp_qual .= "0 ";
1050 }
1051 print $tabFH "FT ", "/colour=\"",$color, "\"", "\n";
1052 }
1053 #------------------------------------------------------------------
1054 #tdo
1055 my @Quality_Array;
1056 if ($qualAvailable) {
1057 @Quality_Array = split(/\s/,$tmp_qual);
1058 my $res;
1059 foreach (@Quality_Array) {
1060 $res .= "$_\n";
1061 }
1062 ## get name
1063 my @splits_query = split (/\//, $query_file);
1064 $new_query_file = $splits_query[(scalar(@splits_query)-1)];
1065 open (F,"> $new_query_file.qual.plot") or die "problems\n";
1066 print F $res;
1067 close(F);
1068 }
1069 ##WRITE PSEUDOMOLECULE WITHOUT 'N's
1070 #--------------------------------------
1071 if ($avoid_Ns ==1)
1072 {
1073 print $avoidNFH ">", "ordered_", $id, "without 'N's","\n";
1074 my $toWrite = write_Fasta ($tmp_nseq);
1075 print $avoidNFH $toWrite;
1076 }
1077 ####################################
1078 #WRITE CONTIGS WITH NO HIT TO FILE #
1079 #################################
1080 my %Cids;
1081
1082 foreach(@Cid)
1083 {
1084 chomp;
1085 $Cids{$_} = 1;
1086 }
1087 my @contigs_2bin = ();
1088 my %h_contigs_2bin;
1089
1090 foreach (@c_keys)
1091 {
1092 push(@contigs_2bin, $_) unless exists $Cids{$_};
1093
1094 }
1095 foreach(@contigs_2bin)
1096 {
1097 $h_contigs_2bin{$_}=1;
1098
1099 print $binFH "$_ \n";
1100
1101 }
1102 $num_inbincontigs= scalar(@contigs_2bin);
1103
1104 ########
1105 # WRITE PSEUDOMOLECULE TO FILE
1106 #----------------------------------
1107 my $new_seq = $tmp_seq;
1108 my $prev_len = length($tmp_seq);
1109 my $total_len = $prev_len;
1110 foreach (@contigs_2bin)
1111 {
1112 chomp;
1113 #my $binseq = $contigs_hash{$contigs_2bin[$i]};
1114 my $l = length ($contigs_hash{$_});
1115 $total_len +=$l;
1116 }
1117 if ($add_bin_2ps ==1) #appending unmapped contigs to pseudomolecule
1118 {
1119
1120 for (my $i =0; $i < scalar(@contigs_2bin); $i+=1)
1121 {
1122 my $binseq = $contigs_hash{$contigs_2bin[$i]};
1123 $new_seq .=$contigs_hash{$contigs_2bin[$i]};
1124 my $len_current_contig = length($binseq);
1125 my $start = $prev_len +1;
1126 my $end = $start + $len_current_contig -1;
1127 my $col = 7;
1128 if ($start > $total_len)
1129 {
1130 $start = $total_len;
1131 }
1132 if ($end >$total_len)
1133 {
1134 $end = $total_len;
1135 }
1136 my $co_cord = $start."..".$end;
1137 my $note = "NO_HIT";
1138 print $tabFH "FT contig ",$co_cord, "\n";
1139 print $tabFH "FT ", "/systematic_id=\"", $contigs_2bin[$i],"\"","\n";
1140 print $tabFH "FT ", "/method=\"", "mummer\"", "\n";
1141 print $tabFH "FT ", "/colour=\"",$col, "\"", "\n";
1142 print $tabFH "FT ", "/", $note, "\n";
1143 $prev_len= $end;
1144 }
1145 }
1146 my $to_write = write_Fasta ($new_seq);
1147 print $seqFH $to_write;
1148 ########
1149 #WRITE CONTIGS IN BIN TO FILE #
1150 #------------------------------------------------------
1151 if ($fasta_bin ==1)
1152 {
1153 foreach(@contigs_2bin)
1154 {
1155 print $dbinFH ">", $_, "\n";
1156 my $to_write = write_Fasta($contigs_hash{$_});
1157 print $dbinFH $to_write;
1158 }
1159 }
1160 #unlink ("$choice.delta");
1161 #unlink ("$choice.filtered.delta");
1162 #unlink ("$choice.cluster");
1163 #unlink ("$choice.tiling");
1164 #PRINT FINAL MESSAGE
1165 print " FINISHED CONTIG ORDERING\n";
1166 print "\nTo view your results in ACT\n\t\t Sequence file 1: $new_reference_file\n\t\t Comparison file 1: $prefix.crunch\n\t\t Sequence file 2: $prefix.fasta\n
1167 \t\tACT feature file is: $prefix.tab\n
1168 \t\tContigs bin file is: $prefix.bin\n
1169 \t\tGaps in pseudomolecule are in: $prefix.gaps\n\n";
1170
1171 #Run tblastx....
1172 if ($run_blast ==1)
1173 {
1174 print "Running tblastx on contigs in bin...\nThis may take several minutes ...\n";
1175 my $formatdb = 'formatdb -p F -i' ;
1176 # my @formating = `
1177 !system("$formatdb $new_reference_file") or die "ERROR: Could not find 'formatdb' for blast\n";
1178 my $blast_opt = 'blastall -m 9 -p tblastx -d ';
1179 my $contigs_inBin = $prefix.'.contigsInbin.fas';
1180 # my @bigger_b = `
1181 !system("$blast_opt $new_reference_file -i $contigs_inBin -o blast.out") or die "ERROR: Could not find 'blastall' , please install blast in your working path (other dir==0)\n$blast_opt $new_reference_file -i $contigs_inBin -o blast.out\n \n";
1182 }
1183
1184
1185 if ($pick_primer == 1)
1186 {
1187 print " DESIGNING PRIMERS FOR GAP CLOSURE...\n";
1188 my $qq = "$prefix.fasta";
1189 pickPrimers($qq, $reference, $flank, $path_toPass, $chk_uniq,$qualAvailable,@Quality_Array);
1190 }
1191 }
1192
1193
1194 #------------------------------
1195 sub write_Fasta {
1196 my $sequence = shift;
1197 my $fasta_seq ="";
1198 my $length = length($sequence);
1199 if ($length <= 60)
1200 {
1201 $fasta_seq = $sequence ;
1202 }
1203 elsif ($length> 60 )
1204 {
1205 for (my $i =0; $i < $length; $i+=60)
1206 {
1207 my $tmp_s = substr $sequence, $i, 60;
1208 $fasta_seq .= $tmp_s."\n";
1209 }
1210 }
1211
1212 return $fasta_seq;
1213 }
1214
1215 sub write_Fasta_headers {
1216 my $sequence = shift;
1217 my $header = shift;
1218 my $fasta_seq =">$header\n";
1219 my $length = length($sequence);
1220 if ($length <= 60)
1221 {
1222 $fasta_seq.= $sequence ;
1223 }
1224 elsif ($length> 60 )
1225 {
1226 for (my $i =0; $i < $length; $i+=60)
1227 {
1228 my $tmp_s = substr $sequence, $i, 60;
1229 $fasta_seq .= $tmp_s."\n";
1230 }
1231 }
1232
1233 return $fasta_seq;
1234 }
1235
1236 #---------------------------------------------- END OF CONTIG ORDERING SUBROUTINES----------------------------------------------------
1237
1238 #----------------------------------------------- PRIMER DESIGN ---------------------------------------------------
1239 sub pickPrimers
1240 {
1241 #$ps = pseudo molecule,$rf = reference, $flan = flanking region size
1242 my ($ps,$rf, $flan, $passed_path, $chk_uniq,$qualAvailable, @Quality_Array);
1243 if (@_==4){
1244 ($rf,$ps, $flan, $chk_uniq) = @_;
1245 print "Primers without ordering..\n";
1246 print $rf;
1247 $passed_path = "nucmer";
1248 $qualAvailable =0;
1249 @Quality_Array = [];
1250 }
1251 else #(@_ == 7)
1252 {
1253 ($ps,$rf, $flan, $passed_path, $chk_uniq, $qualAvailable, @Quality_Array) = @_;
1254 }
1255
1256 my $dna='';
1257 my @gappedSeq;
1258 my $records='';
1259 my @sequence;
1260 my $input='';
1261 #tdo
1262 my @gappedQual;
1263 #my $quality='';
1264 my $path_toPass = $passed_path;
1265 my @fasta;
1266 my $query='';
1267 my @exc_regions;
1268 my $ch_name;
1269 #my $flank = $flan;
1270 open (FH, $rf) or die "Could not open reference file\n";
1271 open (FH2, $ps) or die "Could not open query/pseudomolecule file\n";
1272 my $ref; #print ".... ", $rf; exit;
1273 my @r = <FH>;
1274 my @qry = <FH2>;
1275 my $dn = join ("", @qry);
1276 $ref = join ("", @r);
1277 $dna = Fasta2ordered ($dn);
1278 #check if primer3 is installed
1279 my $pr3 = "primer3_core";
1280 my ($pr3_path, $pr3_Path, $pr3_path_dir) = checkProg ($pr3);
1281 #my @check_prog = `which primer3_core`;
1282 open (PRI, '>primer3.summary.out') or die "Could not open file for write\n";
1283
1284 #
1285
1286 #print $ref; exit;
1287 #PARSING FOR PRIMER3 INPUT
1288 my ($opt,$min,$max,$optTemp,$minTemp,$maxTemp,$flank,$lowRange,$maxRange,$gcMin,$gcOpt,$gcMax,$gclamp,$exclude,$quality) = getPoptions($qualAvailable);
1289 my ($gap_position,@positions, %seq_hash);
1290
1291 my $exc1 = $flank -$exclude; #start of left exclude
1292 print "Please wait... extracting target regions ...\n";
1293 #regular expression extracts dna sequence before and after gaps in sequence (defined by N)
1294 while($dna=~ /([atgc]{$flank,$flank}N+[atgc]{$flank,$flank})/gi)
1295 {
1296 $records= $1;
1297 push (@gappedSeq, $records);
1298 $gap_position = index($dna, $records);
1299 push @positions, $gap_position;
1300 $seq_hash{$gap_position}=$records;
1301 #dna
1302 if ($qualAvailable) {
1303 my $res;
1304 for (my $nn=($gap_position-1); $nn <= ($gap_position-1+length($records)-1); $nn++) {
1305 $res.="$Quality_Array[$nn] ";
1306 }
1307 push @gappedQual, $res;
1308 }
1309 }
1310 #loop prints out primer targets into a file format accepted by primer3
1311 my $count=1;
1312 my $identify='';
1313 my $seq_num = scalar @gappedSeq;
1314 my $name= " ";
1315
1316 my ($totalp, @left_name, @right_name, @left_seq, @right_seq);
1317
1318 my ($leftP_names, $rightP_names, $leftP_seqs, $rightP_seqs, $leftP_start, $leftP_lens, $rightP_ends, $rightP_lens,$left_Exclude,$right_Exclude, $primers_toExc, $prod_size)=
1319 ("","","","","","","","","","", "", "");
1320
1321 print $seq_num, " gaps found in target sequence\n";
1322 print "Please wait...\nLooking for primers...\n";
1323 print "Running Primer3 and checking uniquness of primers...\nCounting left and right primer hits from a nucmer mapping (-c 15 -l 15)\n";
1324
1325 for (my $i=0; $i<$seq_num; $i+=1)
1326 {
1327 $identify = $count++;
1328 if (defined $ch_name)
1329 {
1330 $name = $ch_name;
1331 }
1332 my $len = length($gappedSeq[$i]);
1333 my $exc2 = $len - $flank;
1334 open(FILE, '>data') or die "Could not open file\n";
1335 #tdo
1336 my $qual='';
1337 if ($qualAvailable) {
1338 $qual="PRIMER_SEQUENCE_QUALITY=$gappedQual[$i]\nPRIMER_MIN_QUALITY=$quality\n";
1339 }
1340
1341 #WARNING: indenting the following lines may cause problems in primer3
1342 print FILE "PRIMER_SEQUENCE_ID=Starting_Pos $positions[$i]
1343 SEQUENCE=$gappedSeq[$i]
1344 PRIMER_OPT_SIZE=$opt
1345 PRIMER_MIN_SIZE=$min
1346 PRIMER_MAX_SIZE=$max
1347 PRIMER_OPT_TM=$optTemp
1348 PRIMER_MIN_TM=$minTemp
1349 PRIMER_MAX_TM=$maxTemp
1350 PRIMER_NUM_NS_ACCEPTED=1
1351 PRIMER_PRODUCT_SIZE_RANGE=$lowRange-$maxRange
1352 PRIMER_MIN_GC=$gcMin
1353 PRIMER_GC_CLAMP =$gclamp
1354 PRIMER_OPT_GC_PERCENT=$gcOpt
1355 PRIMER_MAX_GC=$gcMax
1356 PRIMER_INTERNAL_OLIGO_EXCLUDED_REGION=$exc1,$exclude $exc2,$exclude
1357 ".$qual."Number To Return=1
1358 =\n";
1359 close FILE;
1360
1361 #runs primer3 from commandline
1362
1363 ################# NOTE: PRIMER3 SHOULD BE IN YOUR WORKING PATH #########
1364
1365 my @Pr3_output = `$pr3_path -format_output <data`;
1366 #print $positions[$i], "\t", $i, " ", $path_toPass, " ", $rf, $exc1, " ",$exc2, "\n";
1367 my $fil = join (":%:", @Pr3_output);
1368 my ($uniq_primer, $string,$left_nm,$right_nm,$left_sq, $right_sq,$left_strt,$left_ln, $right_End,$right_ln,$primers_toExclude, $product_size)
1369 = check_Primers ($fil, $positions[$i], $i,$path_toPass, $rf, $exc1, $exc2);
1370
1371 print PRI $string;
1372 if ($uniq_primer ==1)
1373 {
1374 $leftP_names.=$left_nm."\n";
1375 $rightP_names.=$right_nm."\n";
1376 $leftP_seqs.=$left_sq."\n";
1377 $rightP_seqs.=$right_sq."\n";
1378 $leftP_start.=$left_strt."\n";
1379 $leftP_lens.=$left_ln."\n";
1380 $rightP_ends.=$right_End."\n";
1381 $rightP_lens.=$right_ln."\n";
1382 $left_Exclude.=$exc1."\n";
1383 $right_Exclude.=$exc2."\n";
1384 $prod_size.=$product_size."\n";
1385 }
1386 if ($primers_toExclude ne "")
1387 {
1388 $primers_toExc.= $primers_toExclude; #."\n";
1389 }
1390
1391 }
1392 write_Primers ($leftP_names, $rightP_names, $leftP_seqs, $rightP_seqs, $leftP_start, $leftP_lens, $rightP_ends, $rightP_lens,$primers_toExc,$left_Exclude,$right_Exclude, $prod_size);
1393 #write_Primers (@left_name, @right_name, @left_seq, @right_seq,@left_start, @left_len, @right_end, @right_len, @left_exclude, @right_exclude, $primers_toExclude);
1394
1395 }
1396
1397 #checks the uniqueness of primers
1398 #input an array with promer3 output for each gap
1399 sub check_Primers
1400 {
1401
1402 my ($fil, $position, $index,$path_toPass, $rf, $exc1, $exc2) = @_;
1403 my @Pr3_output = split /:%:/, $fil;
1404 my ($left_name, $right_name, $left_seq, $right_seq, $left_start,$left_len,$right_end,$right_len,$left_exclude,$right_exclude) = ("", "", "", "", "", "", "", "", "", "");
1405 my $primers_toExclude ="";
1406 my $product_size ="";
1407 my $string ="";
1408 my $uniq_primer = 0;
1409 $string.="=========================================================================================\n";
1410 $string.="Primer set for region starting at ".$position."\n";
1411
1412 if (defined $Pr3_output[5] && defined $Pr3_output[6])
1413 {
1414 if ($Pr3_output[5]=~ /LEFT PRIMER/)
1415 {
1416 # print $Pr3_output[5];
1417 #check uniquness of primer against the genome
1418 my @splits_1 = split (/\s+/, $Pr3_output[5]);
1419 my $left_primer = $splits_1[8];
1420 my $left_st = $splits_1[2];
1421 my $left_length = $splits_1[3];
1422
1423 my @splits_2 = split (/\s+/, $Pr3_output[6]);
1424 my $right_primer = $splits_2[8];
1425 my $right_st = $splits_2[2];
1426 my $right_length = $splits_2[3];
1427
1428 open (QRY_1, '>./left_query'); # open a file for left primers
1429 print QRY_1 ">left_01\n"; #
1430 print QRY_1 $left_primer,"\n";
1431 open (QRY_2, '>./right_query');
1432 print QRY_2 ">right_01\n";
1433 print QRY_2 $right_primer,"\n";
1434
1435 my ($left_count, $right_count);
1436 #if ($chk_uniq eq "nucmer")
1437 #{
1438 my $options = "-c 15 --coords -l 15 ";
1439 my $rq = "right_query";
1440 my $lq = "left_query";
1441 my (@right_ps, @left_ps);
1442 # print $path_toPass, "\t", $options, "\n";
1443
1444
1445 my @Rrun = `$path_toPass $options -p R $rf $rq &> /dev/null`;
1446 print ".";
1447 my $f1 = "R.coords";
1448 open (RP, $f1) or die "Could not open file $f1 while checking uniqueness of right primer\n";
1449 while (<RP>)
1450 {
1451 chomp;
1452 if ($_ =~ /right_01$/)
1453 {
1454 push @right_ps, $_;
1455 }
1456 }
1457 close (RP);
1458 my @Lrun = `$path_toPass $options -p L $rf $lq &> /dev/null`;
1459 print ".";
1460 my $f2 = "L.coords";
1461 open (LQ, $f2) or die "Could not open file $f2\n";
1462 while (<LQ>)
1463 {
1464 chomp;
1465 if ($_ =~ /left_01$/)
1466 {
1467 push @left_ps, $_;
1468 }
1469 }
1470 close (LQ);
1471 $right_count = scalar (@right_ps);
1472 $left_count = scalar(@left_ps);
1473 #check if a primer is not in the excluded region::
1474 my $primer_NearEnd =0;
1475 if ($left_st > $exc1 || $right_st < $exc2)
1476 {
1477 $primer_NearEnd = 1;
1478 }
1479
1480 if ($left_count < 2 && $right_count<2 && $primer_NearEnd ==0)
1481 {
1482 $string.=$left_count."\t".$Pr3_output[5]."\n";
1483 $string.=$right_count."\t".$Pr3_output[6]."\n";
1484 $string.="***************************** PRIMER3 OUTPUT **************************\n";
1485 foreach (@Pr3_output) {$string.=$_;}
1486
1487 my @prod_size_split = split /\s+/, $Pr3_output[10];
1488
1489 $product_size = substr($prod_size_split[2], 0, -1);
1490 $left_name = $position;
1491 $right_name = $position;
1492 my $lp_uc = uc ($left_primer);
1493 my $rp_uc = uc($right_primer);
1494 #print $left_count, "..", $right_count, "\t";
1495 $left_seq = $lp_uc;
1496 $right_seq= $rp_uc;
1497
1498 $left_start= $left_st;
1499 $left_len = $left_length;
1500
1501 $right_end = $right_st;
1502 $right_len = $right_length;
1503
1504 $left_exclude = $exc1;
1505 $right_exclude =$exc2;
1506 $uniq_primer =1;
1507 }
1508 else
1509 {
1510 if ($primer_NearEnd ==1)
1511 {
1512 $string.="One of the oligos is near the end of a contig\n";
1513 }
1514 else
1515 {
1516 $string.="Primer set not unique\n";
1517 }
1518 $primers_toExclude.=">L.".$position."\n".$left_primer."\n";
1519 $primers_toExclude.=">R.".$position."\n".$right_primer."\n";
1520 }
1521
1522 }
1523 else
1524 {
1525 $string.="No Primers found\n";
1526 }
1527 }
1528
1529 return ($uniq_primer, $string,$left_name,$right_name,$left_seq, $right_seq,$left_start,$left_len, $right_end,$right_len,$primers_toExclude, $product_size);
1530
1531
1532 }
1533
1534 ###------------------------------------
1535 # Writes primers and their regions to file
1536 sub write_Primers {
1537 my ($leftP_names, $rightP_names, $leftP_seqs, $rightP_seqs, $leftP_start, $leftP_lens, $rightP_ends, $rightP_lens,$primers_toExclude,$left_Exclude,$right_Exclude, $product_sizes) = @_;
1538 my (@left_name, @right_name, @left_seq, @right_seq, @left_start, @left_len, @right_end, @right_len, @left_exclude, @right_exclude, @product_size);
1539
1540 #open files to read
1541 @left_name = split /\n/, $leftP_names;
1542 @right_name= split /\n/, $rightP_names;
1543 @left_seq = split /\n/, $leftP_seqs;
1544 @right_seq = split /\n/, $rightP_seqs;
1545
1546 @left_start = split /\n/, $leftP_start;
1547 @left_len = split /\n/, $leftP_lens;
1548 @right_end = split/\n/, $rightP_ends;
1549 @right_len = split /\n/, $rightP_lens;
1550 @left_exclude = split /\n/, $left_Exclude;
1551 @right_exclude = split /\n/,$right_Exclude;
1552 @product_size = split /\n/, $product_sizes;
1553
1554 my $primers_withSize ="";
1555 open (SEN, '>sense_primers.out') or die "Could not open file for write\n";
1556 open (ASEN, '>antiSense_primers.out') or die "Could not open file for write\n";
1557 open (REG_1, '>sense_regions.out') or die "Could not open file for write\n";
1558 open (REG_2, '>antiSense_regions.out') or die "Could not open file for write\n";
1559
1560 if ($primers_toExclude ne "")
1561 {
1562 open (PEX, '>primers_toExclude.out') or die "Could not open file for write\n";
1563 print PEX $primers_toExclude;
1564 }
1565
1566
1567 my $totalp = scalar (@left_name);
1568
1569 #print $totalp, "\n"; exit;
1570
1571 my $well_pos;
1572 my $max_plates = ceil($totalp/96);
1573 #print "MAX Ps ", $max_plates, "\n";
1574 my $plate=1;
1575 my $sen ="";
1576 my $asen ="";
1577 my $plate_counter =0;
1578 my $wells = 96;
1579 for (my $index =0; $index < $totalp; $index += $wells)
1580 {
1581 my $do = $index;
1582 my $upper_bound= $index + $wells;
1583 if ($upper_bound > $totalp)
1584 {
1585 $upper_bound = $totalp;
1586 }
1587
1588 for (my $j=$index; $j <= ($upper_bound-1); $j+=1)
1589 {
1590 my $i = $j;
1591 if ($j < 96)
1592 {
1593 $well_pos = get_WellPosition ($j);
1594 }
1595 else
1596 {
1597 $well_pos = get_WellPosition ($j - $wells)
1598 }
1599
1600 #$primers_withSize.=$product_size[$i]."\t"."Plate_".$plate. "\t\tS.".$i."\tS.".$left_name[$i]."\t".
1601 print SEN "Plate_".$plate, "\t\t","S.", $i, "\tS.", $left_name[$i], "\t", $left_seq[$i], "\t\t+", "\t", $well_pos, "\n";
1602 print ASEN "Plate_".$plate, "\t\t","AS.", $i, "\tAS.", $right_name[$i], "\t", $right_seq[$i], "\t\t-","\t", $well_pos,"\n";
1603 print REG_1 "Plate_".$plate, "\t\t","S.", $i, "\t", $left_start[$i], "\t", $left_len[$i], "\n";
1604 print REG_2 "Plate_".$plate, "\t\t","AS.", $i, "\t", $right_end[$i], "\t",$right_len[$i], "\n";
1605
1606 }
1607 $plate +=1;
1608 }
1609
1610 #delete tmp. files
1611 #my $rm = "rm -f";
1612 system ("rm -f data left_query right_query R.delta R.cluster R.coords L.delta L.cluster L.coords");
1613 print "\nPRIMER DESIGN DONE\n\n";
1614 # end of primer design program
1615 }#//
1616 #####
1617 # returns a well position for oligos
1618 sub get_WellPosition{
1619
1620 my $j = shift;
1621 my $well_pos;
1622 if ($j < 12)
1623 {
1624 $well_pos = "a".($j+1);
1625 }
1626 elsif ($j>11 && $j<24) {
1627 $well_pos = "b". (($j+1) -12);
1628 }
1629 elsif ($j>23 && $j<36) {
1630 $well_pos = "c". (($j+1) -24);
1631 }
1632 elsif ($j>35 && $j<48) {
1633 $well_pos = "d". (($j+1) - 36);
1634 }
1635 elsif($j>47 && $j<60) {
1636 $well_pos = "e". (($j+1) -48);
1637 }
1638 elsif ($j>59 && $j<72)
1639 {
1640 $well_pos = "f". (($j+1) - 60);
1641 }
1642 elsif ($j>71 && $j< 84)
1643 {
1644 $well_pos = "g". (($j+1) - 72);
1645 }
1646 elsif ($j>83 && $j<96)
1647 {
1648 $well_pos = "h". (($j+1) - 84);
1649 }
1650 return $well_pos;
1651 }
1652
1653
1654 ####################################################################
1655 #get options for primer design
1656 #----------------------
1657 sub getPoptions{
1658
1659 my $qualAvailable = shift;
1660 #### USER INPUTS ##########
1661 #ask for optimum primer size
1662 print "\nEnter Optimum Primer size (default 20 bases):";
1663 my $opt=<STDIN>;
1664 chomp $opt;
1665 if($opt eq '')
1666 {
1667 $opt = 20;
1668 }
1669 #ask for minimum primer size
1670 print "\nEnter Minimum Primer size (default 18 bases):";
1671 my $min=<STDIN>;
1672 chomp $min;
1673 if($min eq '')
1674 {
1675 $min = 18;
1676 }
1677 #ask for maximum primer size
1678 print "\nEnter Maximum Primer size (default 27 bases):";
1679 my $max= <STDIN>;
1680 chomp $max;
1681 if($max eq '')
1682 {
1683 $max= 27;
1684 }
1685 #ask for optimum primer temperature
1686 print "\nEnter Optimum melting temperature (Celcius) for a primer oligo (default 60.0C):";
1687 my $optTemp=<STDIN>;
1688 chomp $optTemp;
1689 if($optTemp eq '')
1690 {
1691 $optTemp = 60.0;
1692 }
1693 #ask for minimum primer temperature
1694 print "\nEnter Minimum melting temperature (Celcius) for a primer oligo (default 57.0C):";
1695 my $minTemp=<STDIN>;
1696 chomp $minTemp;
1697 if($minTemp eq '')
1698 {
1699 $minTemp = 57.0;
1700 }
1701 #ask for maximum primer temperature
1702 print "\nEnter Maximum melting temperature (Celcius) for a primer oligo (default 63.0C):";
1703 my $maxTemp=<STDIN>;
1704 chomp $maxTemp;
1705 if($maxTemp eq '')
1706 {
1707 $maxTemp = 63.0;
1708 }
1709 print "\nEnter flanking region size (default 1000 bases): ";
1710 my $flank=<STDIN>;
1711 chomp $flank;
1712 if ($flank eq '')
1713 {
1714 $flank = 1000;
1715 }
1716 #ask for primer product range
1717 print "\nEnter minimum product size produced by primers (default =flanking size):";
1718 my $lowRange=<STDIN>;
1719 chomp $lowRange;
1720 if($lowRange eq '')
1721 {
1722 $lowRange = $flank;
1723 }
1724 print "\nEnter maxmimum product size produced by primers (default 7000):";
1725 my $maxRange=<STDIN>;
1726 chomp $maxRange;
1727 if($maxRange eq '')
1728 {
1729 $maxRange = 7000;
1730 }
1731 #ask for minimum GC content in primers
1732 print "\nEnter minimum GC content in primers (default 20%):";
1733 my $gcMin=<STDIN>;
1734 chomp $gcMin;
1735 if($gcMin eq '')
1736 {
1737 $gcMin = 20.0;
1738 }
1739 #ask for optimum GC content in primers
1740 print "\nEnter optimum GC content in primers (default 50%):";
1741 my $gcOpt=<STDIN>;
1742 chomp $gcOpt;
1743 if($gcOpt eq '')
1744 {
1745 $gcOpt = 50.0;
1746 }
1747 #ask for maximum GC content in primers
1748 print "\nEnter maximum GC content in primers (default 80%):";
1749 my $gcMax=<STDIN>;
1750 chomp $gcMax;
1751 if($gcMax eq '')
1752 {
1753 $gcMax = 80.0;
1754 }
1755 print "\nEnter GC clamp (default 1):";
1756 my $gclamp=<STDIN>;
1757 chomp $gclamp;
1758 if($gclamp eq '')
1759 {
1760 $gclamp = 1;
1761 }
1762 print "\nEnter size of region to exclude at the end of contigs (default 100 bases):";
1763 my $exclude=<STDIN>;
1764 chomp $exclude;
1765 if ($exclude eq '')
1766 {
1767 $exclude = 100;
1768 }
1769
1770
1771 #tdo
1772 my $quality='';
1773 if ($qualAvailable)
1774 {
1775
1776 print "\nEnter minimum quality for primer pick (default 40):";
1777 $quality=<STDIN>;
1778 chomp $quality;
1779 if($quality eq '')
1780 {
1781 $quality = 40;
1782 }
1783 }
1784
1785
1786 return ($opt,$min,$max,$optTemp,$minTemp,$maxTemp,$flank,$lowRange,$maxRange,$gcMin,$gcOpt,$gcMax,$gclamp,$exclude, $quality);
1787
1788 }
1789 ###############
1790 #-----------------------------------------------------END of PRIMER DESIGN ----------------------------------------------------------------
1791 #-----------------------------------------------------END OF ABACAS -----------------------------------------------------------------------
1792
1793