comparison phyloconversion/seqConverterG.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/perl -w
2 #
3 # seqConverter.pl v1.2
4 # Last modified August 12, 2010 21:09
5 # Minor modifications for running in Galaxy by TH Oakley Jan 2012
6 # (c) Olaf R.P. Bininda-Emonds
7 #
8 # Input:
9 # Sequence data in any of fasta, GenBank, nexus, (classic or extended) phylip,
10 # or Se-Al formats
11 #
12 # Output:
13 # Sequence data in any of fasta, nexus, (classic or extended) phylip, and/or
14 # Se-Al formats.
15 #
16 # Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]
17 # [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]
18 # [-u] [-v] [-h]
19 # [-O]<outfile> specify outfile
20 # options: -a = print out accession numbers instead of taxon labels for nexus
21 # and phylip output
22 # -c<number> = global genetic code (to be used for translation unless
23 # otherwise specified) (default = standard code; end
24 # with * to override local settings)
25 # -d<filename> = file containing raw sequence information; * = batch
26 # convert all specified file types in working
27 # directory (suffixes must be .fasta / .fas, .gb,
28 # .nexus / .nex, .phylip / .ph, or .seal as
29 # appropriate)
30 # -g = do not convert ~ gap characters (e.g., from BioEdit) to -
31 # -G = convert flanking gaps to Ns
32 # -H = convert sequence data to haplotype data
33 # -i<f|g|n|p|s> = format of sequence file (fasta (f), GenBank (g),
34 # nexus (n), phylip (p), or Se-Al (s)); default =
35 # autodetect
36 # -j = produce jackknifed data sets each with a single sequence
37 # deleted
38 # -l<number> = interleave nexus- and/or phylip-formatted output
39 # (default = sequential) with specified sequence length
40 # (between 10 and 100 inclusive; default = 80);
41 # NOTE: also sets default interleave length for fasta
42 # output
43 # -n = convert ambiguous nucleotides to Ns
44 # -o<f|n|pc|pe|s> = output results in any or all of fasta (f), nexus
45 # (n), classic or extended phylip (pc or pe),
46 # and/or Se-Al (s) formats (NOTE: flag can be
47 # invoked multiple times)
48 # -r<a|i> = order sequences in final output alphabetically by name
49 # (a; default) or in input order from file (i)
50 # -s = split data set into individual partitions according to nexus
51 # charset statements
52 # -t = translate sequences into amino acids
53 # -u = interactive user-input mode
54 # -h = print this message and quit
55 # -v = verbose output
56
57 use strict;
58 use POSIX;
59 #THO added ability for Galaxy -G to specify outfile name
60 my $OutFile;
61
62 # Set user-input defaults and associated parameters
63 # Data set variables
64 my $inputType = ""; # Options are "fasta", "GenBank", "nexus", "phylip", and "Se-Al"
65 my ($readFile, @seqFiles);
66 my $dataSource;
67 my $seqType = "nucleotide";
68 my $globalGenCode = 0;
69 my $globalOverride = 0;
70
71 my (@accNum, %nameLabel, %sequence, %geneticCode, %accPresent);
72 my (@charsetList, %charsetStart, %charsetEnd);
73 my (%deletedSeq, %finalSeq);
74 my $seqCount;
75 my $ntax;
76 my $nameClean = 1;
77 my $skipDuplicates = 1;
78
79 # User input variables
80 my $seqOrder = "alphabetical"; # Options are "alphabetical" (default) and "input"
81 my $seqSplit = 0;
82 my $jackknife = 0;
83 my $gapChange = 1;
84 my $ambigChange = 0;
85 my $translateSeqs = 0;
86 my $haploTyping = 0;
87 my $haploFile;
88 my $flankGap = 0;
89
90 # Translation variables and genetic codes
91 my %transTable = ('1' => 'standard',
92 '2' => 'vertebrate mitochondrial',
93 '3' => 'yeast mitochondrial',
94 '4' => 'mold, protozoan and colenterate mitochondrial and mycoplasam/spiroplasma',
95 '5' => 'invertebrate mitochondrial',
96 '6' => 'ciliate, dasycladacean and hexamita nuclear',
97 '9' => 'echinoderm mitochondrial',
98 '10' => 'euplotid nuclear',
99 '11' => 'bacterial and plant plastid',
100 '12' => 'alternative yeast nuclear',
101 '13' => 'ascidian mitochondrial',
102 '14' => 'alternative flatworm mitochondrial',
103 '15' => 'Blepharisma nuclear',
104 '16' => 'chlorophycean mitochondrial',
105 '21' => 'trematode mitochondrial',
106 '22' => 'Scenedesmus obliquus mitochondrial',
107 '23' => 'Thraustochytrium mitochondrial');
108 my %gb2seal = ('1' => '0',
109 '2' => '1',
110 '3' => '2',
111 '4' => '3',
112 '4' => '4',
113 '5' => '5',
114 '6' => '6',
115 '9' => '7',
116 '10' => '8',
117 '11' => '9',
118 '12' => '10',
119 '13' => '11',
120 '14' => '12',
121 '15' => '13',
122 '16' => '1',
123 '21' => '1',
124 '22' => '1',
125 '23' => '1');
126 my %seal2gb = ('0' => '1',
127 '1' => '2',
128 '2' => '3',
129 '3' => '4',
130 '4' => '4',
131 '5' => '5',
132 '6' => '6',
133 '7' => '9',
134 '8' => '10',
135 '9' => '11',
136 '10' => '12',
137 '11' => '13',
138 '12' => '14',
139 '13' => '15');
140 my %genCodes;
141 foreach my $code (qw(0 1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23))
142 {
143 $genCodes{$code} = 1;
144 }
145 my %DNAtoAA;
146 my @ambigList = qw(A C G T M R W S Y K V H D B N);
147 my %ambigCode = ('A' => 'A', 'C' => 'C', 'G' => 'G', 'T' => 'T',
148 'AC' => 'M', 'AG' => 'R', 'AT' => 'W', 'CG' => 'S', 'CT' => 'Y', 'GT' => 'K',
149 'ACG' => 'V', 'ACT' => 'H', 'AGT' => 'D', 'CGT' => 'B', 'ACGT' => 'N');
150 my (%constitNTlist);
151 while (my ($nt, $code) = each %ambigCode) # Where $nt = key and $code = value
152 {
153 push @{$constitNTlist{$code}}, $_ foreach (split("",$nt));
154 }
155
156 # Output variables
157 my $maxLength;
158 my $fastaPrint = 0;
159 my $fastaOut;
160 my $nexusPrint = 0;
161 my $nexusOut;
162 my ($phylipTradPrint, $phylipExtPrint) = (0, 0);
163 my $phylipOut;
164 my $sealPrint = 0;
165 my $sealOut;
166 my $outFormat = "sequential";
167 my $interleaveLength = 80;
168 my $fastaLength = 80;
169 my $accPrint = 0;
170
171 # Miscellaneous variables
172 my $verbose = 0;
173 my $debug = 0;
174 my $perlScript = "seqConverterG.pl";
175 my $version = "1.2";
176
177 # Read in user input
178 if (not @ARGV or join(' ', @ARGV) =~ /\s-u/ or $ARGV[0] =~ /^-u/) # Enter interactive user-input mode
179 {
180 print "Entering interactive user-input mode. Type \"q\" at any prompt to exit program.\n";
181
182 # Get print format
183 my $defaultAcc = ($accPrint) ? "y" : "n";
184 undef $accPrint;
185 until (defined $accPrint)
186 {
187 print "\tPrint out accession numbers only to nexus- and/or phylip output (y|n) [$defaultAcc]: ";
188 $accPrint = <stdin>;
189 chomp ($accPrint);
190 exit(0) if ($accPrint eq "q");
191 if (substr($accPrint, 0, 1) =~ /^n/i or $accPrint eq "")
192 {
193 $accPrint = 0;
194 }
195 elsif (substr($accPrint, 0, 1) =~ /^y/i)
196 {
197 $accPrint = 1;
198 }
199 else
200 {
201 print "\t\tInvalid input ($accPrint)\n";
202 undef $accPrint;
203 }
204 }
205
206 # Get datafile
207 until (defined $readFile)
208 {
209 print "\tEnter name of data file (* = batch convert): ";
210 $readFile = <stdin>;
211 chomp ($readFile);
212 exit(0) if ($readFile eq "q");
213 unless (-e $readFile or $readFile eq "*")
214 {
215 print "\t\tFile '$readFile' does not exist\n";
216 undef $readFile;
217 }
218 }
219 push @seqFiles, $readFile;
220
221 # Get format of datafile
222 my $defaultInput = "autodetect";
223 undef $inputType;
224 until (defined $inputType)
225 {
226 print "\tEnter format of file $readFile (fasta|GenBank|nexus|phylip|Se-Al) [$defaultInput]: ";
227 $inputType = <stdin>;
228 chomp ($inputType);
229 exit(0) if ($inputType =~ /^q/i);
230 if (substr($inputType, 0, 1) =~ /^a/i or $inputType eq "")
231 {
232 $inputType = "autodetect";
233 }
234 elsif (substr($inputType, 0, 1) =~ /^f/i)
235 {
236 $inputType = "fasta";
237 }
238 elsif (substr($inputType, 0, 1) =~ /^g/i)
239 {
240 $inputType = "GenBank";
241 }
242 elsif (substr($inputType, 0, 1) =~ /^n/i)
243 {
244 $inputType = "nexus";
245 }
246 elsif (substr($inputType, 0, 1) =~ /^p/i)
247 {
248 $inputType = "phylip";
249 }
250 elsif (substr($inputType, 0, 1) =~ /^s/i)
251 {
252 $inputType = "Se-Al";
253 }
254 else
255 {
256 print "\t\tInvalid input ($inputType)\n";
257 undef $inputType;
258 }
259 }
260 $inputType = "" if ($inputType eq "autodetect");
261
262 # Get whether or not to clean sequence labels
263 my $defaultClean = ($nameClean) ? "y" : "n";
264 undef $nameClean;
265 until (defined $nameClean)
266 {
267 print "\tClean sequence labels by changing non-alphanumeric characters to underscores (y|n)? [$defaultClean]: ";
268 $nameClean = <stdin>;
269 chomp ($nameClean);
270 exit(0) if ($nameClean =~ /^q/i);
271 if (substr($nameClean, 0, 1) =~ /^y/i or $nameClean eq "")
272 {
273 $nameClean = 1;
274 }
275 elsif (substr($nameClean, 0, 1) =~ /^n/i)
276 {
277 $nameClean = 0;
278 }
279 else
280 {
281 print "\t\tInvalid input ($seqSplit)\n";
282 undef $nameClean;
283 }
284 }
285
286 # Get whether or not to split sequences
287 my $defaultSplit = ($seqSplit) ? "y" : "n";
288 undef $seqSplit;
289 until (defined $seqSplit)
290 {
291 print "\tSplit into individual partitions following charset statements (y|n)? [$defaultSplit]: ";
292 $seqSplit = <stdin>;
293 chomp ($seqSplit);
294 exit(0) if ($seqSplit =~ /^q/i);
295 if (substr($seqSplit, 0, 1) =~ /^n/i or $seqSplit eq "")
296 {
297 $seqSplit = 0;
298 }
299 elsif (substr($seqSplit, 0, 1) =~ /^y/i)
300 {
301 $seqSplit = 1;
302 }
303 else
304 {
305 print "\t\tInvalid input ($seqSplit)\n";
306 undef $seqSplit;
307 }
308 }
309
310 # Get whether or not to jackknife sequences
311 my $defaultJack = ($jackknife) ? "y" : "n";
312 undef $jackknife;
313 until (defined $jackknife)
314 {
315 print "\tProduce replicate, jackknifed data sets (y|n)? [$defaultJack]: ";
316 $jackknife = <stdin>;
317 chomp ($jackknife);
318 exit(0) if ($jackknife =~ /^q/i);
319 if (substr($jackknife, 0, 1) =~ /^n/i or $jackknife eq "")
320 {
321 $jackknife = 0;
322 }
323 elsif (substr($jackknife, 0, 1) =~ /^y/i)
324 {
325 $jackknife = 1;
326 }
327 else
328 {
329 print "\t\tInvalid input ($jackknife)\n";
330 undef $jackknife;
331 }
332 }
333
334 # Get whether or not to convert gaps
335 my $defaultGaps = ($gapChange) ? "y" : "n";
336 undef $gapChange;
337 until (defined $gapChange)
338 {
339 print "\tConvert ~ gap characters to - (y|n)? [$defaultGaps]: ";
340 $gapChange = <stdin>;
341 chomp ($gapChange);
342 exit(0) if ($gapChange =~ /^q/i);
343 if (substr($gapChange, 0, 1) =~ /^n/i)
344 {
345 $gapChange = 0;
346 }
347 elsif (substr($gapChange, 0, 1) =~ /^y/i or $gapChange eq "")
348 {
349 $gapChange = 1;
350 }
351 else
352 {
353 print "\t\tInvalid input ($gapChange)\n";
354 undef $gapChange;
355 }
356 }
357
358 # Get whether or not to convert flanking gaps
359 my $defaultFlank = ($flankGap) ? "y" : "n";
360 undef $flankGap;
361 until (defined $flankGap)
362 {
363 print "\tConvert flanking gap characters to Ns (y|n)? [$defaultFlank]: ";
364 $flankGap = <stdin>;
365 chomp ($flankGap);
366 exit(0) if ($flankGap =~ /^q/i);
367 if (substr($flankGap, 0, 1) =~ /^n/i)
368 {
369 $flankGap = 0;
370 }
371 elsif (substr($flankGap, 0, 1) =~ /^y/i or $flankGap eq "")
372 {
373 $flankGap = 1;
374 }
375 else
376 {
377 print "\t\tInvalid input ($flankGap)\n";
378 undef $flankGap;
379 }
380 }
381
382 # Get whether or not to convert ambiguous nucleotides
383 my $defaultAmbig = ($ambigChange) ? "y" : "n";
384 undef $ambigChange;
385 until (defined $ambigChange)
386 {
387 print "\tConvert ambiguous nucleotides to Ns (y|n)? [$defaultAmbig]: ";
388 $ambigChange = <stdin>;
389 chomp ($ambigChange);
390 exit(0) if ($ambigChange =~ /^q/i);
391 if (substr($ambigChange, 0, 1) =~ /^n/i or $ambigChange eq "")
392 {
393 $ambigChange = 0;
394 }
395 elsif (substr($ambigChange, 0, 1) =~ /^y/i)
396 {
397 $ambigChange = 1;
398 }
399 else
400 {
401 print "\t\tInvalid input ($ambigChange)\n";
402 undef $ambigChange;
403 }
404 }
405
406 # Get output order of sequences
407 my $defaultOrder = $seqOrder;
408 undef $seqOrder;
409 until (defined $seqOrder)
410 {
411 print "\tEnter output order for sequences (alphabetical|clustal|input file) [$defaultOrder]: ";
412 $seqOrder = <stdin>;
413 chomp ($seqOrder);
414 exit(0) if ($seqOrder =~ /^q/i);
415 if (substr($seqOrder, 0, 1) =~ /^i/i)
416 {
417 $seqOrder = "input";
418 }
419 elsif (substr($seqOrder, 0, 1) =~ /^a/i or $seqOrder eq "")
420 {
421 $seqOrder = "alphabetical";
422 }
423 else
424 {
425 print "\t\tInvalid input ($seqOrder)\n";
426 undef $seqOrder;
427 }
428 }
429
430 # Get whether or not to convert to haplotypes
431 my $defaultHaplo = ($haploTyping) ? "y" : "n";
432 undef $haploTyping;
433 until (defined $haploTyping)
434 {
435 print "\tConvert sequence data to haplotypes (y|n)? [$defaultHaplo]: ";
436 $haploTyping = <stdin>;
437 chomp ($haploTyping);
438 exit(0) if ($haploTyping =~ /^q/i);
439 if (substr($haploTyping, 0, 1) =~ /^n/i or $haploTyping eq "")
440 {
441 $haploTyping = 0;
442 }
443 elsif (substr($haploTyping, 0, 1) =~ /^y/i)
444 {
445 $haploTyping = 1;
446 }
447 else
448 {
449 print "\t\tInvalid input ($haploTyping)\n";
450 undef $haploTyping;
451 }
452 }
453
454 # Get whether or not to convert to amino acids
455 my $defaultTranslate = ($translateSeqs) ? "y" : "n";
456 undef $translateSeqs;
457 until (defined $translateSeqs)
458 {
459 print "\tConvert sequence data to amino acids (y|n)? [$defaultTranslate]: ";
460 $translateSeqs = <stdin>;
461 chomp ($translateSeqs);
462 exit(0) if ($translateSeqs =~ /^q/i);
463 if (substr($translateSeqs, 0, 1) =~ /^n/i or $translateSeqs eq "")
464 {
465 $translateSeqs = 0;
466 }
467 elsif (substr($translateSeqs, 0, 1) =~ /^y/i)
468 {
469 $translateSeqs = 1;
470 }
471 else
472 {
473 print "\t\tInvalid input ($translateSeqs)\n";
474 undef $translateSeqs;
475 }
476 }
477
478 # Get genetic code for translation
479 my $defaultCode = $globalGenCode;
480 $globalGenCode = 99;
481 print "\tGenetic codes available for protein translation:\n";
482 foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23))
483 {
484 print "\t\t$code: $transTable{$code}";
485 print "\n";
486 }
487 until (defined $genCodes{$globalGenCode})
488 {
489 $globalOverride = 0;
490 print "\tEnter global genetic code to be applied (0 = no translation; follow with an * to override local code) [$defaultCode]: ";
491 $globalGenCode = <stdin>;
492 chomp ($globalGenCode);
493 exit(0) if ($globalGenCode =~ /^q/i);
494 $globalGenCode = $defaultCode if ($globalGenCode eq "");
495 if ($globalGenCode =~ /\*$/)
496 {
497 $globalOverride = 1;
498 $globalGenCode =~ s/\*$//;
499 }
500 print "\t\tInvalid input ($globalGenCode)\n" unless (defined $genCodes{$globalGenCode} or $globalGenCode == 0);
501 }
502
503 # Get output formats
504 my $defaultFasta = ($fastaPrint) ? "y" : "n";
505 undef $fastaPrint;
506 until (defined $fastaPrint)
507 {
508 print "\tOutput results in fasta format (y|n) [$defaultFasta]: ";
509 $fastaPrint = <stdin>;
510 chomp ($fastaPrint);
511 exit(0) if ($fastaPrint =~ /^q/i);
512 if (substr($fastaPrint, 0, 1) =~ /^y/i)
513 {
514 $fastaPrint = 1;
515 }
516 elsif (substr($fastaPrint, 0, 1) =~ /^n/i or $fastaPrint eq "")
517 {
518 $fastaPrint = 0;
519 }
520 else
521 {
522 print "\t\tInvalid input ($fastaPrint)\n";
523 undef $fastaPrint;
524 }
525 }
526
527 my $defaultNexus = ($nexusPrint) ? "y" : "n";
528 undef $nexusPrint;
529 until (defined $nexusPrint)
530 {
531 print "\tOutput results in nexus format (y|n) [$defaultNexus]: ";
532 $nexusPrint = <stdin>;
533 chomp ($nexusPrint);
534 exit(0) if ($nexusPrint =~ /^q/i);
535 if (substr($nexusPrint, 0, 1) =~ /^y/i)
536 {
537 $nexusPrint = 1;
538 }
539 elsif (substr($nexusPrint, 0, 1) =~ /^n/i or $nexusPrint eq "")
540 {
541 $nexusPrint = 0;
542 }
543 else
544 {
545 print "\t\tInvalid input ($nexusPrint)\n";
546 undef $nexusPrint;
547 }
548 }
549
550 my $defaultPhylip = ($phylipTradPrint) ? "y" : "n";
551 undef $phylipTradPrint;
552 until (defined $phylipTradPrint or $phylipExtPrint)
553 {
554 print "\tOutput results in traditional phylip format (y|n) [$defaultPhylip]: ";
555 $phylipTradPrint = <stdin>;
556 chomp ($phylipTradPrint);
557 exit(0) if ($phylipTradPrint =~ /^q/i);
558 if (substr($phylipTradPrint, 0, 1) =~ /^y/i)
559 {
560 $phylipTradPrint = 1;
561 }
562 elsif (substr($phylipTradPrint, 0, 1) =~ /^n/i or $phylipTradPrint eq "")
563 {
564 $phylipTradPrint = 0;
565 }
566 else
567 {
568 print "\t\tInvalid input ($phylipTradPrint)\n";
569 undef $phylipTradPrint;
570 }
571 }
572
573 if ($phylipTradPrint == 0) # Check for extended format
574 {
575 my $defaultPhylip = ($phylipExtPrint) ? "y" : "n";
576 undef $phylipExtPrint;
577 until (defined $phylipExtPrint or $phylipExtPrint)
578 {
579 print "\tOutput results in extended phylip format (y|n) [$defaultPhylip]: ";
580 $phylipExtPrint = <stdin>;
581 chomp ($phylipExtPrint);
582 exit(0) if ($phylipExtPrint =~ /^q/i);
583 if (substr($phylipExtPrint, 0, 1) =~ /^y/i)
584 {
585 $phylipExtPrint = 1;
586 }
587 elsif (substr($phylipExtPrint, 0, 1) =~ /^n/i or $phylipExtPrint eq "")
588 {
589 $phylipExtPrint = 0;
590 }
591 else
592 {
593 print "\t\tInvalid input ($phylipExtPrint)\n";
594 undef $phylipExtPrint;
595 }
596 }
597 }
598
599 my $defaultSeal = ($sealPrint) ? "y" : "n";
600 undef $sealPrint;
601 until (defined $sealPrint)
602 {
603 print "\tOutput results in Se-Al format (y|n) [$defaultSeal]: ";
604 $sealPrint = <stdin>;
605 chomp ($sealPrint);
606 exit(0) if ($sealPrint =~ /^q/i);
607 if (substr($sealPrint, 0, 1) =~ /^y/i)
608 {
609 $sealPrint = 1;
610 }
611 elsif (substr($sealPrint, 0, 1) =~ /^n/i or $sealPrint eq "")
612 {
613 $sealPrint = 0;
614 }
615 else
616 {
617 print "\t\tInvalid input ($sealPrint)\n";
618 undef $sealPrint;
619 }
620 }
621
622 if ($nexusPrint or $phylipTradPrint or $phylipExtPrint)
623 {
624 my $defaultInterleave = ($outFormat) ? "y" : "n";
625 undef $outFormat;
626 until (defined $outFormat)
627 {
628 print "\tInterleave output for nexus- and/or phylip-formatted output with length (n|10-100) [$defaultInterleave]: ";
629 $outFormat = <stdin>;
630 chomp ($outFormat);
631 exit(0) if ($outFormat =~ /^q/i);
632 if ($outFormat =~ /(\d+)/)
633 {
634 if ($outFormat =~ /\D/ or $outFormat < 10 or $outFormat > 100)
635 {
636 print "\t\tInvalid input ($outFormat)\n";
637 undef $outFormat;
638 }
639 else
640 {
641 $outFormat = $1;
642 }
643 }
644 elsif (substr($outFormat, 0, 1) =~ /^n/i or $outFormat eq "")
645 {
646 $outFormat = "sequential";
647 }
648 else
649 {
650 print "\t\tInvalid input ($outFormat)\n";
651 undef $outFormat;
652 }
653 }
654 if ($outFormat ne "sequential")
655 {
656 $interleaveLength = $outFormat;
657 $outFormat = "interleave";
658 }
659 }
660
661 # Get verbose output mode
662 my $defaultVerbose = ($verbose) ? "y" : "n";
663 undef $verbose;
664 until (defined $verbose)
665 {
666 print "\tOutput verbose results to screen (y|n) [$defaultVerbose]: ";
667 $verbose = <stdin>;
668 chomp ($verbose);
669 exit(0) if ($verbose =~ /^q/i);
670 if (substr($verbose, 0, 1) =~ /^y/i)
671 {
672 $verbose = 1;
673 print "\n";
674 }
675 elsif (substr($verbose, 0, 1) =~ /^n/i or $verbose eq "")
676 {
677 $verbose = 0;
678 }
679 elsif (substr($verbose, 0, 1) =~ /^x/i or $verbose eq "")
680 {
681 $verbose = $debug = 1;
682 }
683 else
684 {
685 print "\t\tInvalid input ($verbose)\n";
686 undef $verbose;
687 }
688 }
689 }
690 elsif (join(' ', @ARGV) =~ /\s-h/ or $ARGV[0] =~ /^-h/) # Print help screen
691 {
692 print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n";
693 print " [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]\n";
694 print " [-u] [-v] [-h]\n";
695 print "Version: $version\n";
696 print "Options: -a = print out accession numbers instead of taxon labels for nexus\n";
697 print " and phylip output\n";
698 print " -c<number> = global genetic code (to be used for translation unless\n";
699 print " otherwise specified) (default = standard code; end\n";
700 print " with * to override local settings)\n";
701 foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23))
702 {
703 print " $code: $transTable{$code}";
704 print "\n";
705 }
706 print " -d<filename> = file containing raw sequence information; * = batch\n";
707 print " convert all specified file types in working\n";
708 print " directory (suffixes must be .fasta / .fas, .gb\n";
709 print " .nexus / .nex, .phylip / .ph, or .seal as\n";
710 print " appropriate)\n";
711 print " -g = do not convert ~ gap characters (e.g., from BioEdit) to -\n";
712 print " -G = convert flanking gaps to Ns\n";
713 print " -H = convert sequence data to haplotype data\n";
714 print " -i<f|g|n|p|s> = format of sequence file (fasta (f), GenBank (g),\n";
715 print " nexus (n), phylip (p), or Se-Al (s)); default =\n";
716 print " autodetect\n";
717 print " -j = produce jackknifed data sets each with a single sequence\n";
718 print " deleted\n";
719 print " -l<number> = interleave nexus- and/or phylip-formatted output\n";
720 print " (default = sequential) with specified sequence length\n";
721 print " (between 10 and 100 inclusive; default = 80);\n";
722 print " NOTE: also sets default interleave length for fasta\n";
723 print " output\n";
724 print " -n = convert ambiguous nucleotides to Ns\n";
725 print " -o<f|n|pc|pe|s> = output results in any or all of fasta (f), nexus\n";
726 print " (n), classic or extended phylip (pc or pe),\n";
727 print " and/or Se-Al (s) formats (NOTE: flag can be\n";
728 print " invoked multiple times)\n";
729 print " -r<a|i> = order sequences in final output alphabetically by name\n";
730 print " (a; default) or in input order from file (i)\n";
731 print " -s = split data set into individual partitions according to nexus\n";
732 print " charset statements\n";
733 print " -t = translate sequences into amino acids\n";
734 print " -u = interactive user-input mode\n";
735 print " -h = print this message and quit\n";
736 print " -v = verbose output\n";
737 exit(0);
738 }
739 else # Process switches
740 {
741 for (my $i = 0; $i <= $#ARGV; $i++)
742 {
743 if ($ARGV[$i] eq "-a")
744 {
745 $accPrint = 1;
746 }
747 elsif ($ARGV[$i] =~ /-c(\d+)/)
748 {
749 $globalGenCode = $1;
750 if ($globalGenCode != 0 and not defined $genCodes{$globalGenCode})
751 {
752 print "Don't understand argument: $ARGV[$i]\n";
753 print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n";
754 print " [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t] [-u]\n";
755 print " [-v] [-h]\n";
756 print "Version: $version\n";
757 exit(1);
758 }
759 $globalOverride = 1 if ($ARGV[$i] =~ /\*$/);
760 }
761 elsif ($ARGV[$i] =~ /^-d(.*)/)
762 {
763 $readFile = $1;
764 unless ($readFile eq "*")
765 {
766 die "ERROR: Data file $readFile does not exist.\n" unless (-e $readFile);
767 }
768 push @seqFiles, $readFile;
769 }
770 elsif ($ARGV[$i] eq "-g")
771 {
772 $gapChange = 0;
773 }
774 elsif ($ARGV[$i] eq "-G")
775 {
776 $flankGap = 1;
777 }
778 elsif ($ARGV[$i] eq "-H")
779 {
780 $haploTyping = 1;
781 }
782 elsif ($ARGV[$i] eq "-if")
783 {
784 $inputType = "fasta";
785 }
786 elsif ($ARGV[$i] eq "-ig")
787 {
788 $inputType = "GenBank";
789 }
790 elsif ($ARGV[$i] eq "-in")
791 {
792 $inputType = "nexus";
793 }
794 elsif ($ARGV[$i] eq "-ip")
795 {
796 $inputType = "phylip";
797 }
798 elsif ($ARGV[$i] eq "-is")
799 {
800 $inputType = "Se-Al";
801 }
802 elsif ($ARGV[$i] eq "-j")
803 {
804 $jackknife = 1;
805 }
806 elsif ($ARGV[$i] eq "-k")
807 {
808 $nameClean = 0;
809 }
810 elsif ($ARGV[$i] =~ /-l(\d+)?/)
811 {
812 $interleaveLength = $1 if (defined $1);
813 $outFormat = "interleave";
814 }
815 elsif ($ARGV[$i] eq "-n")
816 {
817 $ambigChange = 1;
818 }
819 elsif ($ARGV[$i] eq "-of")
820 {
821 $fastaPrint = 1;
822 }
823 elsif ($ARGV[$i] eq "-on")
824 {
825 $nexusPrint = 1;
826 }
827 elsif ($ARGV[$i] eq "-opc")
828 {
829 $phylipTradPrint = 1;
830 }
831 elsif ($ARGV[$i] eq "-ope")
832 {
833 $phylipExtPrint = 1;
834 }
835 elsif ($ARGV[$i] eq "-os")
836 {
837 $sealPrint = 1;
838 }
839 elsif ($ARGV[$i] eq "-ra")
840 {
841 $seqOrder = "alphabetical";
842 }
843 elsif ($ARGV[$i] eq "-ri")
844 {
845 $seqOrder = "input";
846 }
847 elsif ($ARGV[$i] eq "-s")
848 {
849 $seqSplit = 1;
850 }
851 elsif ($ARGV[$i] eq "-t")
852 {
853 $translateSeqs = 1;
854 $globalGenCode = 1 unless ($globalGenCode);
855 }
856 elsif ($ARGV[$i] eq "-v")
857 {
858 $verbose = 1;
859 }
860 elsif ($ARGV[$i] eq "-x")
861 {
862 $debug = 1;
863 $verbose = 1;
864 }
865 elsif ($ARGV[$i] =~ /^-O(.*)/)
866 {
867 $OutFile = $1;
868 }
869 else
870 {
871 print "Don't understand argument: $ARGV[$i]\n";
872 print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n";
873 print " [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]\n";
874 print " [-u] [-v] [-h]\n";
875 print "Version: $version\n";
876 exit(1);
877 }
878 }
879 }
880 $accPrint = 0 if ($accPrint eq "n");
881 geneticCoder() if ($globalGenCode);
882 $fastaLength = $interleaveLength;
883
884 # Check for I/O errors
885 die "ERROR: Must supply name of file containing sequence data.\n" if (not @seqFiles);
886 die "ERROR: Must supply at least one output format.\n" unless ($fastaPrint or $nexusPrint or $phylipTradPrint or $phylipExtPrint or $sealPrint);
887 die "ERROR: Sequence length for interleaved format must be between 10 and 100 inclusive.\n" if (($fastaPrint or (($nexusPrint or $phylipTradPrint or $phylipExtPrint) and $outFormat eq "interleave")) and ($interleaveLength < 10 or $interleaveLength > 100));
888
889 # Read in sequence data
890 if ($seqFiles[0] eq "*") # Batch convert all appropriate files in working directory
891 {
892 if ($inputType)
893 {
894 undef @seqFiles;
895
896 system("ls -d * > convertList.txt");
897 setLineBreak("convertList.txt");
898 open (LIST, "<convertList.txt") or die "Cannot open file containing names of all sequence files, convertList.txt\n";
899 while (<LIST>)
900 {
901 chomp;
902 next unless ($_);
903 push @seqFiles, $_ if ($inputType eq "Se-Al" and $_ =~ /\.seal$/);
904 push @seqFiles, $_ if ($inputType eq "phylip" and ($_ =~ /\.phylip$/ or $_ =~ /\.ph$/));
905 push @seqFiles, $_ if ($inputType eq "nexus" and ($_ =~ /\.nexus$/ or $_ =~ /\.nex$/));
906 push @seqFiles, $_ if ($inputType eq "GenBank" and ($_ =~ /\.gb$/));
907 push @seqFiles, $_ if ($inputType eq "fasta" and ($_ =~ /\.fasta$/ or $_ =~ /\.fas$/));
908 }
909 close LIST;
910 unlink ("convertList.txt") unless ($debug);
911
912 die "ERROR: No files of type $inputType found for batch conversion.\n" if (not @seqFiles);
913 }
914 else
915 {
916 die "ERROR: Must specify input file type for batch conversion\n";
917 }
918 }
919
920 foreach my $seqFile (@seqFiles)
921 {
922 print "\nConverting file $seqFile ...\n";
923
924 # Set output file names
925 $dataSource = $seqFile;
926 $dataSource =~ s/\.\w+$//;
927
928 if ($fastaPrint)
929 {
930 $fastaOut = $dataSource . ".fasta";
931 $fastaOut =~ s/\.fasta$/_new.fasta/ if ($fastaOut eq $seqFile);
932 $fastaOut =~ s/\.fasta$/_haplo.fasta/ if ($haploTyping);
933 }
934 if ($nexusPrint)
935 {
936 # $nexusOut = $dataSource . ".nex";
937 # Fixed Output File name much easier for Galaxy
938 $nexusOut = $OutFile;
939 $nexusOut =~ s/\.nex$/_new.nex/ if ($nexusOut eq $seqFile);
940 $nexusOut =~ s/\.nex$/_haplo.nex/ if ($haploTyping);
941 }
942 if ($phylipTradPrint or $phylipExtPrint)
943 {
944 # $phylipOut = $dataSource . ".phylip";
945 # Fixed Output File name much easier for Galaxy
946 $phylipOut = $OutFile;
947 $phylipOut =~ s/\.phylip$/_new.phylip/ if ($phylipOut eq $seqFile);
948 $phylipOut =~ s/\.phylip$/_haplo.phylip/ if ($haploTyping);
949 }
950 if ($sealPrint)
951 {
952 $sealOut = $dataSource . ".seal";
953 $sealOut =~ s/\.seal$/_new.seal/ if ($sealOut eq $seqFile);
954 $sealOut =~ s/\.seal$/_haplo.seal/ if ($haploTyping);
955 }
956
957 $haploFile = $dataSource . "_haplotypeSeqs.txt";
958
959 $phylipExtPrint = 0 if ($phylipTradPrint);
960
961 # Read in sequence data
962 # Clear variables
963 undef @accNum;
964 undef %nameLabel;
965 undef %sequence;
966 undef %geneticCode;
967 undef %accPresent;
968 undef %deletedSeq;
969 undef %finalSeq;
970 undef $seqCount;
971 undef $ntax;
972 undef @charsetList;
973 undef %charsetStart;
974 undef %charsetEnd;
975 my (@haploList, %haploID, %haploSeqs);
976 $maxLength = 0;
977
978 seqRead($seqFile);
979 if (not @accNum)
980 {
981 print "\tERROR: Could not read in sequences from file $seqFile; skipping to next file\n";
982 next;
983 }
984
985 # Process for printing
986 my $stopCodonCount;
987 $seqType = "protein" if ($globalGenCode and $translateSeqs);
988 foreach my $seq (@accNum)
989 {
990 $sequence{$seq} =~ s/\~/-/g if ($gapChange);
991 $sequence{$seq} =~ s/R|Y|M|K|S|W|H|B|V|D/N/ig if ($ambigChange and $seqType eq "nucleotide");
992
993 $finalSeq{$seq} = $sequence{$seq};
994 $geneticCode{$seq} = $globalGenCode if ($globalOverride);
995 if ($globalGenCode and $translateSeqs)
996 {
997 $finalSeq{$seq} = translate($sequence{$seq}, $geneticCode{$seq});
998 $stopCodonCount++ if (($finalSeq{$seq} =~ tr/\*//) > 2); # Check for stop codons
999 }
1000 $maxLength = length($finalSeq{$seq}) if (length($finalSeq{$seq}) > $maxLength);
1001 $nameLabel{$seq} =~ s/\W+/_/g if ($nameClean); # Clean sequence labels of non-alphanumerics
1002 }
1003 printf "\n\tWARNING: $stopCodonCount of %s sequences had more than two stop codons; check for\n\t\t1) proper reading frame,\n\t\t2) proper genetic code, or\n\t\t3) that sequences are coding DNA\n", scalar(@accNum) if ($globalGenCode and $stopCodonCount);
1004
1005 # Add gaps to end of any sequence less than maximum length
1006 foreach my $seq (@accNum)
1007 {
1008 $finalSeq{$seq} .= "-" x ($maxLength - length($sequence{$seq}));
1009 $sequence{$seq} = $finalSeq{$seq};
1010 }
1011
1012 # Convert flanking gaps to Ns if desired
1013 if ($flankGap)
1014 {
1015 foreach my $seq (@accNum)
1016 {
1017 if ($finalSeq{$seq} =~ /^(\-+)/)
1018 {
1019 my $startGap = $1;
1020 my $startN = "N" x length($startGap);
1021 $finalSeq{$seq} =~s/^$startGap/$startN/;
1022 }
1023 if ($finalSeq{$seq} =~ /(\-+)$/)
1024 {
1025 my $endGap = $1;
1026 my $endN = "N" x length($endGap);
1027 $finalSeq{$seq} =~s/$endGap$/$endN/;
1028 }
1029 }
1030 }
1031
1032 # Determine haplotypes as needed
1033 if ($haploTyping)
1034 {
1035 foreach my $entry (@accNum)
1036 {
1037 $haploID{$entry} = 0;
1038
1039 foreach my $haploType (@haploList)
1040 {
1041 if ($finalSeq{$entry} eq $finalSeq{$haploType}) # Matches existing haplotype; add to list
1042 {
1043 $haploID{$entry} = $haploType;
1044 push @{$haploSeqs{$haploType}}, $entry;
1045 last;
1046 }
1047 }
1048
1049 if (not $haploID{$entry}) # No match to existing haplotype; define new haplotype
1050 {
1051 $haploID{$entry} = "haplo" . (scalar(@haploList) + 1);
1052 push @haploList, $haploID{$entry};
1053 $nameLabel{$haploID{$entry}} = $haploID{$entry};
1054 $finalSeq{$haploID{$entry}} = $finalSeq{$entry};
1055 push @{$haploSeqs{$haploID{$entry}}}, $entry;
1056 }
1057 }
1058
1059 undef @accNum;
1060 @accNum = @haploList;
1061
1062 open (HAPLO, ">$haploFile") or die "Cannot print to file $haploFile\n";
1063 foreach my $haploType (@haploList)
1064 {
1065 print HAPLO "$haploType:";
1066 print HAPLO "\t$nameLabel{$_}" foreach @{$haploSeqs{$haploType}};
1067 print HAPLO "\n";
1068 }
1069 close HAPLO;
1070 }
1071
1072 # Print results!
1073 print "\nPrinting results ...\n";
1074 @accNum = sort { $nameLabel{$a} cmp $nameLabel{$b} } keys %nameLabel if ($seqOrder eq "alphabetical" and not $haploTyping);
1075
1076 # Print full data set
1077 $ntax = scalar(@accNum);
1078 seqPrint($seqFile);
1079
1080 # Print jackknifed data sets
1081 if ($jackknife)
1082 {
1083 my $delseqCount = 0;
1084 foreach my $seq (@accNum)
1085 {
1086 $deletedSeq{$seq} = 1;
1087 $delseqCount++;
1088
1089 # Change output file names
1090 $fastaOut = $dataSource . "_jack$delseqCount.fasta";
1091 $nexusOut = $dataSource . "_jack$delseqCount.nex";
1092 $phylipOut = $dataSource . "_jack$delseqCount.phylip";
1093 $sealOut = $dataSource . "_jack$delseqCount.seal";
1094
1095 $ntax = scalar(@accNum) - 1;
1096 seqPrint($seqFile);
1097
1098 $deletedSeq{$seq} = 0; # Reset deleted sequence
1099 }
1100 }
1101
1102 # Print data set partitions
1103 if (@charsetList and $seqSplit)
1104 {
1105 my $delCount = 0;
1106 foreach my $partition (@charsetList)
1107 {
1108 $delCount = 0;
1109 # Change output file names
1110 $fastaOut = $dataSource . "_$partition.fasta";
1111 $nexusOut = $dataSource . "_$partition.nex";
1112 $phylipOut = $dataSource . "_$partition.phylip";
1113 $sealOut = $dataSource . "_$partition.seal";
1114
1115 # Restrict sequence to partition limits
1116 foreach my $seq (@accNum)
1117 {
1118 $deletedSeq{$seq} = 0; # Reset all deleted sequences
1119 $finalSeq{$seq} = substr($sequence{$seq}, $charsetStart{$partition} - 1, $charsetEnd{$partition} - $charsetStart{$partition} + 1);
1120 # Check that sequence remains informative
1121 unless ($finalSeq{$seq} =~ /a/i or $finalSeq{$seq} =~ /c/i or $finalSeq{$seq} =~ /g/i or $finalSeq{$seq} =~ /t/i)
1122 {
1123 $delCount++;
1124 $deletedSeq{$seq} = 1;
1125 }
1126 }
1127 $ntax = scalar(@accNum) - $delCount;
1128 $maxLength = $charsetEnd{$partition} - $charsetStart{$partition} + 1;
1129 seqPrint($seqFile);
1130 }
1131 }
1132 }
1133
1134 exit(0);
1135
1136 ### Subroutines used in the program
1137
1138 sub setLineBreak # Check line breaks of input files and set input record separator accordingly
1139 {
1140 my $inFile = shift;
1141 $/ ="\n";
1142 open (IN, "<$inFile") or die "Cannot open $inFile to check form of line breaks.\n";
1143 while (<IN>)
1144 {
1145 if ($_ =~ /\r\n/)
1146 {
1147 print "\tDOS line breaks detected ...\n" if ($verbose);
1148 $/ ="\r\n";
1149 last;
1150 }
1151 elsif ($_ =~ /\r/)
1152 {
1153 print "\tMac line breaks detected ...\n" if ($verbose);
1154 $/ ="\r";
1155 last;
1156 }
1157 else
1158 {
1159 print "\tUnix line breaks detected ...\n" if ($verbose);
1160 $/ ="\n";
1161 last;
1162 }
1163 }
1164 close IN;
1165 }
1166
1167 sub seqRead
1168 {
1169 my $seqFile = shift;
1170 undef %sequence;
1171
1172 print "\nReading in sequence data from file $seqFile (type is $inputType) ...\n" if ($inputType);
1173 setLineBreak($seqFile);
1174 open (SEQ, "<$seqFile") or die "Cannot open file containing sequences, $seqFile\n";
1175 my ($header, $tempAcc, $tempName, $tempSeq);
1176 my $fastaAcc;
1177 my (%nexusSpecies, %nexusAcc, $nexusRead, $commentFlag);
1178 my ($phylipLineCount, $phylipTaxa, $phylipChars, %phylipSeq);
1179 my $sealCode;
1180 my ($sealDelFlag, $owner) = (0, 0);
1181 my ($gbAcc, $gbRead);
1182 my $macBlock = 0;
1183 my %accCount;
1184
1185 while (<SEQ>)
1186 {
1187 chomp;
1188 my $lineRead = $_;
1189 next unless ($lineRead);
1190
1191 # Autodetect sequence format
1192 if (not $inputType)
1193 {
1194 $inputType = "fasta" if ($lineRead =~ /^>/);
1195 $inputType = "nexus" if ($lineRead =~ /\#nexus/i);
1196 $inputType = "phylip" if ($lineRead =~ /^\s*\d+\s+\d+/);
1197 $inputType = "Se-Al" if ($lineRead =~ /^\s*Database=\{/i);
1198 $inputType = "GenBank" if ($lineRead =~ /^\s*LOCUS/);
1199 print "\nReading in sequence data from file $seqFile (type determined to be $inputType) ...\n" if ($inputType);
1200 }
1201
1202 if ($inputType eq "nexus")
1203 {
1204 # Check if charset statement present
1205 if ($lineRead =~ /charset/i)
1206 {
1207 $lineRead =~ s/\s+//g;
1208 $lineRead =~ s/;$//;
1209
1210 my ($charsetName, $charsetBounds) = split('=', $lineRead);
1211 $charsetName =~ s/charset//i;
1212 push @charsetList, $charsetName;
1213 if ($charsetBounds =~ /(\d+)-*(\d*)/)
1214 {
1215 $charsetStart{$charsetName} = $1;
1216 if ($2)
1217 {
1218 $charsetEnd{$charsetName} = $2;
1219 }
1220 else
1221 {
1222 $charsetEnd{$charsetName} = $1;
1223 }
1224 }
1225 }
1226
1227 # Otherwise block out MacClade / PAUP blocks
1228 $macBlock = 1 if ($lineRead =~ /begin macclade;/i or $lineRead =~ /begin paup;/i);
1229 $macBlock = 0 if ($macBlock and $lineRead =~ /end;/i);
1230 next if ($macBlock);
1231
1232 # Otherwise only read in data lines or charset statements
1233 if ($lineRead =~ /^\s*matrix/i)
1234 {
1235 $nexusRead = 1;
1236 next;
1237 }
1238 $nexusRead = 0 if ($lineRead =~ /;\s*$/);
1239 next unless ($nexusRead);
1240
1241 # Remove MacClade sequence lengths
1242 $lineRead =~ s/\[\d+\]$//;
1243
1244 $commentFlag = 1 if ($lineRead =~ /\[/);
1245 if ($lineRead =~ /\]/)
1246 {
1247 $commentFlag = 0;
1248 next;
1249 }
1250 next if ($commentFlag);
1251
1252 next unless ($lineRead =~ /a/i or $lineRead =~ /c/i or $lineRead =~ /g/i or $lineRead =~ /t/i or $lineRead =~ /n/i or $lineRead =~ /\?/ or $lineRead =~ /\-/);
1253
1254 # Clean up input line
1255 $lineRead =~ s/^\s+//;
1256 $lineRead =~ s/\'//g;
1257
1258 my @nexusLine = split(/\s+/, $lineRead);
1259 my $species = shift(@nexusLine);
1260 $species =~ s/\s+/_/g;
1261 $species =~ s/\_+/_/g;
1262 my $seq = join('', @nexusLine);
1263 $seq =~ s/\s+//g;
1264 $seqType = "protein" if ($seq =~ /E/i or $seq =~ /Q/i or $seq =~ /I/i or $seq =~ /L/i or $seq =~ /F/i or $seq =~ /P/i);
1265 if (not defined $nexusSpecies{$species})
1266 {
1267 $nexusSpecies{$species} = 1;
1268 $seqCount++;
1269 $nexusAcc{$species} = "tAlign_".$seqCount;
1270 push @accNum, $nexusAcc{$species};
1271 $nameLabel{$nexusAcc{$species}} = $species;
1272 $sequence{$nexusAcc{$species}} = uc($seq);
1273 $geneticCode{$nexusAcc{$species}} = $globalGenCode;
1274 }
1275 else # Sequences are in interleaved format; append sequence
1276 {
1277 $sequence{$nexusAcc{$species}} .= uc($seq);
1278 }
1279 }
1280
1281 if ($inputType eq "fasta")
1282 {
1283 if ($lineRead =~/^\s*>/)
1284 {
1285 my $species;
1286 $seqCount++;
1287 (my $tempSpecies = $lineRead) =~ s/^\s*>//;
1288
1289 if ($tempSpecies =~ /^Mit\.\s+/) # Entry comes from European RNA project
1290 {
1291 $tempSpecies =~ s/^Mit\.\s+//i; # To fix entries from European RNA project
1292 my @speciesInfo = split(/\s+/, $tempSpecies);
1293 $species = join('_', $speciesInfo[0], $speciesInfo[1]);
1294 if (defined $speciesInfo[2])
1295 {
1296 $fastaAcc = $speciesInfo[2];
1297 $accCount{$fastaAcc}++;
1298 if ($accCount{$fastaAcc} > 1)
1299 {
1300 print "\nWARNING: Accession number $fastaAcc used more than once";
1301 print "; skipping this and subsequent entries" if ($skipDuplicates);
1302 print "\n";
1303 }
1304 }
1305 else
1306 {
1307 $fastaAcc = "tAlign_".$seqCount;
1308 $accCount{$fastaAcc}++;
1309 }
1310 }
1311 else
1312 {
1313 my @speciesLine = split(/\s+/, $tempSpecies);
1314 if ($speciesLine[$#speciesLine] =~ /^\(?\w+\d+\)?$/ and scalar(@speciesLine) > 1) # Check whether last entry is an accession number
1315 {
1316 $fastaAcc = pop (@speciesLine);
1317 $fastaAcc =~ s/^\(//g;
1318 $fastaAcc =~ s/\)$//g;
1319 $accCount{$fastaAcc}++;
1320 if ($accCount{$fastaAcc} > 1)
1321 {
1322 print "\nWARNING: Accession number $fastaAcc used more than once";
1323 if ($skipDuplicates)
1324 {
1325 print "; skipping this and subsequent entries\n";
1326 }
1327 else
1328 {
1329 print "; assigning temporary accession number\n";
1330 $fastaAcc = "tAlign_" . $seqCount;
1331 }
1332 }
1333 }
1334 else
1335 {
1336 $fastaAcc = "tAlign_".$seqCount;
1337 $accCount{$fastaAcc}++;
1338 }
1339 $species = join('_', @speciesLine);
1340 $species = "Sequence_".$seqCount if ($species eq "");
1341 }
1342 push @accNum, $fastaAcc unless ($accCount{$fastaAcc} > 1 and $skipDuplicates);
1343 $geneticCode{$fastaAcc} = $globalGenCode;
1344 ($nameLabel{$fastaAcc} = $species) =~ s/\s+/_/;
1345 $nameLabel{$fastaAcc} =~ s/\_+/_/;
1346 }
1347 else
1348 {
1349 next if ($accCount{$fastaAcc} > 1 and $skipDuplicates);
1350 $lineRead =~ s/\s+//g;
1351 $seqType = "protein" if ($lineRead =~ /E/i or $lineRead =~ /Q/i or $lineRead =~ /I/i or $lineRead =~ /L/i or $lineRead =~ /F/i or $lineRead =~ /P/i);
1352 $sequence{$fastaAcc} .= uc($lineRead);
1353 }
1354 }
1355
1356 if ($inputType eq "Se-Al")
1357 {
1358 my $header;
1359 $sealDelFlag = 1 if ($lineRead =~/MCoL/); # Se-Al sometimes places deleted species at end of file; do not read in remainder of file
1360 next if ($sealDelFlag == 1);
1361 next unless ($lineRead =~/NumSites/i or $lineRead =~/Owner/i or $lineRead =~/Name/i or $lineRead =~/Accession/i or $lineRead =~/Sequence/i or $lineRead =~/GeneticCode/i or $lineRead =~/Frame/i);
1362 if ($lineRead =~/Owner\s*\=\s*(\d+)/i)
1363 {
1364 $owner = $1;
1365 }
1366 if ($lineRead =~/Accession/i and $owner == 2)
1367 {
1368 $seqCount++;
1369 if ($lineRead =~ /null/ or $lineRead =~ /\"\"/)
1370 {
1371 $tempAcc = "tAlign_" . $seqCount;
1372 $accCount{$tempAcc}++;
1373 }
1374 else
1375 {
1376 ($header, $tempAcc) = split (/=/, $lineRead);
1377 $tempAcc =~ s/\"//g;
1378 $tempAcc =~ s/;//g;
1379 $accCount{$tempAcc}++;
1380 if ($accCount{$tempAcc} > 1)
1381 {
1382 print "\nWARNING: Accession number $fastaAcc used more than once";
1383 if ($skipDuplicates)
1384 {
1385 print "; skipping this and subsequent entries\n";
1386 }
1387 else
1388 {
1389 print "; assigning temporary accession number\n";
1390 $tempAcc = "tAlign_" . $seqCount;
1391 }
1392 print "\n";
1393 }
1394 }
1395 push @accNum, $tempAcc unless ($accCount{$tempAcc} > 1 and $skipDuplicates);
1396 }
1397 if ($lineRead =~/Name/i and $owner == 2)
1398 {
1399 ($header, $tempName) = split (/=/, $lineRead);
1400 $tempName =~ s/\"//g;
1401 $tempName =~ s/\s*;//g;
1402 $tempName =~ s/\s+/_/g;
1403 $tempName =~ s/\_+/_/g;
1404 }
1405 if ($lineRead =~/GeneticCode=(\d)/i and $owner == 2)
1406 {
1407 $sealCode = $1;
1408 $geneticCode{$tempAcc} = $seal2gb{$sealCode};
1409 }
1410 if ($lineRead =~/Sequence/i and $owner == 2)
1411 {
1412 next if ($accCount{$tempAcc} > 1 and $skipDuplicates);
1413 ($header, $tempSeq) = split (/=/, $lineRead);
1414 $tempSeq =~ s/\"//g;
1415 $tempSeq =~ s/;//g;
1416 $tempSeq =~ s/\s+//g;
1417 $nameLabel{$tempAcc} = $tempName;
1418 $sequence{$tempAcc} = uc($tempSeq);
1419 $seqType = "protein" if ($tempSeq =~ /E/i or $tempSeq =~ /Q/i or $tempSeq =~ /I/i or $tempSeq =~ /L/i or $tempSeq =~ /F/i or $tempSeq =~ /P/i);
1420 }
1421 if ($lineRead =~/Frame=(\d)/i) # Correct for reading frame
1422 {
1423 my $readingFrame = $1;
1424 $sequence{$tempAcc} = "--" . $sequence{$tempAcc} if ($readingFrame == 2);
1425 $sequence{$tempAcc} = "-" . $sequence{$tempAcc} if ($readingFrame == 3);
1426 }
1427 }
1428
1429 if ($inputType eq "phylip")
1430 {
1431 if ($lineRead =~ /^\s*(\d+)\s+(\d+)/)
1432 {
1433 $phylipTaxa = $1;
1434 $phylipChars = $2;
1435 $phylipLineCount = 0;
1436 }
1437 else
1438 {
1439 $phylipLineCount++;
1440
1441 $lineRead =~ s/\s//g;
1442
1443 $phylipSeq{$phylipLineCount} .= $lineRead;
1444
1445 $phylipLineCount = 0 if ($phylipLineCount == $phylipTaxa);
1446 }
1447 }
1448
1449 if ($inputType eq "GenBank")
1450 {
1451 # Get species name and accession number
1452 # Pure GenBank format
1453 $gbAcc = $1 if ($lineRead =~ /^\s*ACCESSION\s+(\w+\d+)/);
1454 if ($lineRead =~ /^\s*ORGANISM\s+/)
1455 {
1456 $seqCount++;
1457 $gbAcc = "tAlign_" . $seqCount if (not defined $gbAcc);
1458 $lineRead =~ s/^\s+//;
1459 my @orgLine = split(/\s+/, $lineRead);
1460 my $header = shift (@orgLine);
1461 $nameLabel{$gbAcc} = join('_', @orgLine);
1462 $accCount{$gbAcc}++;
1463 }
1464 # BioEdit format
1465 if ($lineRead =~ /^\s*TITLE/ and not defined ($gbAcc))
1466 {
1467 $seqCount++;
1468 $gbAcc = "tAlign_" . $seqCount;
1469 my @accLine = split (/\s+/, $lineRead);
1470 $gbAcc = $1 if ($accLine[2] =~ /^\((\w+\d+)\)/);
1471 $nameLabel{$gbAcc} = $accLine[1];
1472 $accCount{$gbAcc}++;
1473 }
1474
1475 if ($lineRead =~ /^\s*ORIGIN/)
1476 {
1477 if ($accCount{$gbAcc} > 1)
1478 {
1479 print "\nWARNING: Accession number $gbAcc used more than once";
1480 if ($skipDuplicates)
1481 {
1482 print "; skipping this and subsequent entries\n";
1483 next;
1484 }
1485 else
1486 {
1487 print "; assigning temporary accession number\n";
1488 $gbAcc = "tAlign_" . $seqCount;
1489 $gbRead = 1;
1490 }
1491 }
1492 else
1493 {
1494 $gbRead = 1;
1495 $seqCount++;
1496 }
1497 next;
1498 }
1499
1500 if ($lineRead =~ /^\s*\/\//) # End of accession; process
1501 {
1502 $gbRead = 0;
1503 next unless ($gbAcc);
1504
1505 push @accNum, $gbAcc unless ($accCount{$gbAcc} > 1);
1506 $geneticCode{$gbAcc} = $globalGenCode;
1507 $nameLabel{$gbAcc} =~ s/\s+/_/g;
1508 $nameLabel{$gbAcc} =~ s/\_+/_/g;
1509 $sequence{$gbAcc} =~ s/\d//g;
1510 $sequence{$gbAcc} =~ s/\s+//g;
1511 $sequence{$gbAcc} =~ s/\~/-/g if ($gapChange);
1512 undef $gbAcc;
1513 }
1514
1515 next unless ($gbRead);
1516
1517 if ($gbRead and $lineRead =~ /^\s+\d+/)
1518 {
1519 $sequence{$gbAcc} .= uc($lineRead);
1520 }
1521 }
1522 }
1523 close SEQ;
1524
1525 if ($inputType eq "phylip") # Postprocess input to derive taxon names and sequence; accounts for both sequential and extended formatting
1526 {
1527 for (my $i = 1; $i <= $phylipTaxa; $i++)
1528 {
1529 my $phylipAcc = "tAlign_" . $i;
1530
1531 push @accNum, $phylipAcc;
1532 $geneticCode{$phylipAcc} = $globalGenCode;
1533
1534 # Derive taxon name and sequence
1535 $sequence{$phylipAcc} = uc(substr($phylipSeq{$i}, 0 - $phylipChars));
1536 $seqType = "protein" if ($sequence{$phylipAcc} =~ /E/i or $sequence{$phylipAcc} =~ /Q/i or $sequence{$phylipAcc} =~ /I/i or $sequence{$phylipAcc} =~ /L/i or $sequence{$phylipAcc} =~ /F/i or $sequence{$phylipAcc} =~ /P/i);
1537
1538 $nameLabel{$phylipAcc} = substr($phylipSeq{$i}, 0, length($phylipSeq{$i}) - $phylipChars);
1539 $nameLabel{$phylipAcc} =~ s/\s+/_/g;
1540 $nameLabel{$phylipAcc} =~ s/\_+/_/g;
1541 }
1542 }
1543 }
1544
1545 sub seqPrint
1546 {
1547 my $seqFile = shift;
1548
1549 $interleaveLength = $maxLength if ($outFormat eq "sequential");
1550
1551 # Print fasta-formatted file
1552 if ($fastaPrint)
1553 {
1554 print "\tWriting to fasta-formatted file $fastaOut ...\n";
1555 open (FASTA, ">$fastaOut") or die "Cannot open fasta file for aligned DNA sequences, $fastaOut";
1556 foreach my $entry (@accNum)
1557 {
1558 # next if ($deletedSeq{$entry} or not defined $sequence{$entry});
1559 print FASTA ">$nameLabel{$entry}";
1560 print FASTA "\t($entry)" unless ($entry =~ /^tAlign/);
1561 print FASTA "\n";
1562
1563 # Print sequence
1564 my $fastaSeq = $finalSeq{$entry};
1565 for (my $breakpoint = 0; $breakpoint <= length($fastaSeq); $breakpoint += $fastaLength)
1566 {
1567 print FASTA substr($fastaSeq, $breakpoint, $fastaLength) . "\n";
1568 }
1569 # my $fastaSeq = $finalSeq{$entry};
1570 # my $breakPoint = 80;
1571 # my $breakCount = 0;
1572 # until ($breakPoint > length($fastaSeq))
1573 # {
1574 # $breakCount++;
1575 # my $replaceString = "\n" . substr($fastaSeq, $breakPoint, 1);
1576 # substr($fastaSeq, $breakPoint, 1) = $replaceString;
1577 # $breakPoint += 81; # Latter accounts for addition of \n to string
1578 # }
1579 # print FASTA "\n$fastaSeq\n";
1580 }
1581 close FASTA;
1582 }
1583
1584 # Print nexus-formatted file
1585 if ($nexusPrint)
1586 {
1587 print "\tWriting to nexus file $nexusOut";
1588 print " in interleaved format" if ($outFormat eq "interleave");
1589 print " ...\n";
1590 open (NEX, ">$nexusOut") or die "Cannot open nexus file for aligned DNA sequences, $nexusOut";
1591 print NEX "#NEXUS\n\n";
1592 print NEX "[File created from $seqFile using $perlScript v$version on ".localtime()."]\n\n";
1593 print NEX "begin data;\n";
1594 print NEX "\tdimensions ntax = $ntax nchar = $maxLength;\n";
1595 print NEX "\tformat datatype = $seqType gap = - missing = ?";
1596 print NEX " interleave" if ($outFormat eq "interleave");
1597 print NEX ";\n\n";
1598 print NEX "\tmatrix\n\n";
1599 for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++)
1600 {
1601 my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1;
1602 my $seqEnd = $interleaveCount * $interleaveLength;
1603 $seqEnd = $maxLength if ($maxLength <= $seqEnd);
1604 my $printLength = $seqEnd - $seqStart + 1;
1605
1606 foreach my $entry (@accNum)
1607 {
1608 next if ($deletedSeq{$entry});
1609 my $nexusName = $nameLabel{$entry};
1610 $nexusName = $entry if ($accPrint);
1611 if ($nexusName =~ /\W/)
1612 {
1613 print NEX "'$nexusName'";
1614 }
1615 else
1616 {
1617 print NEX "$nexusName";
1618 }
1619
1620 my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength);
1621 if ($outFormat eq "interleave") # Add gaps every 10 elements in sequence
1622 {
1623 for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10)
1624 {
1625 next if $gapAdd > $printLength;
1626 my $bp = substr($printSeq, $gapAdd - 1, 1) . " ";
1627 substr($printSeq, $gapAdd - 1, 1, $bp);
1628 }
1629 }
1630 print NEX "\t$printSeq\n";
1631 }
1632 print NEX "\n" if ($interleaveCount * $interleaveLength <= $maxLength);
1633 }
1634 # foreach my $entry (@accNum)
1635 # {
1636 # next if ($deletedSeq{$entry});
1637 # if ($nameLabel{$entry} =~ /\W/)
1638 # {
1639 # print NEX "'$nameLabel{$entry}'";
1640 # }
1641 # else
1642 # {
1643 # print NEX "$nameLabel{$entry}";
1644 # }
1645 # print NEX "\t$finalSeq{$entry}\n";
1646 # }
1647 print NEX "\t;\nend;\n";
1648 close NEX;
1649
1650 if (-e "/Developer/Tools/SetFile")
1651 {
1652 system ("/Developer/Tools/SetFile -t 'TEXT' $nexusOut");
1653 system ("/Developer/Tools/SetFile -c 'PAUP' $nexusOut");
1654 }
1655 }
1656
1657 # Print phylip-formatted file (on demand)
1658 if ($phylipTradPrint or $phylipExtPrint)
1659 {
1660 my $maxTaxLength = 50;
1661 $maxTaxLength = 10 if ($phylipTradPrint);
1662 my $blankName = " " x $maxTaxLength;
1663 my %shortNameCount;
1664
1665 print "\tWriting to phylip file $phylipOut ...\n";
1666 open (PHYLIP, ">$phylipOut") or die "Cannot open phylip file for aligned DNA sequences, $phylipOut";
1667 print PHYLIP " $ntax $maxLength\n";
1668 for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++)
1669 {
1670 my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1;
1671 my $seqEnd = $interleaveCount * $interleaveLength;
1672 $seqEnd = $maxLength if ($maxLength <= $seqEnd);
1673 my $printLength = $seqEnd - $seqStart + 1;
1674
1675 # foreach my $entry (@accNum)
1676 # {
1677 # my $trimmedName = substr($nameLabel{$entry}, 0, $maxTaxLength);
1678 # $shortNameCount{trimmedName} = 0;
1679 # }
1680
1681 foreach my $entry (@accNum)
1682 {
1683 next if ($deletedSeq{$entry});
1684
1685 my $phylipName = $nameLabel{$entry};
1686 $phylipName = $entry if ($accPrint);
1687
1688 # Print name label as appropriate; also check label and adjust to proper length if needed
1689 if ($interleaveCount == 1)
1690 {
1691 if (length($phylipName) < $maxTaxLength)
1692 {
1693 $shortNameCount{$phylipName}++;
1694 $phylipName .= " " x ($maxTaxLength - length($phylipName)) if ($phylipTradPrint); # Pad end of name with spaces as needed
1695 }
1696 else
1697 {
1698 my $trimmedName = substr($phylipName, 0, $maxTaxLength);
1699 $shortNameCount{$trimmedName}++;
1700 if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers
1701 {
1702 $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName}));
1703 $phylipName .= $shortNameCount{$trimmedName};
1704 $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed
1705 }
1706 else
1707 {
1708 $phylipName = $trimmedName;
1709 }
1710 }
1711 print PHYLIP "$phylipName";
1712 print PHYLIP " " if ($phylipExtPrint);
1713 }
1714 else
1715 {
1716 print PHYLIP "$blankName";
1717 }
1718
1719 # Print sequence
1720 my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength);
1721 if ($outFormat eq "interleave") # Add gaps every 10 elements in sequence
1722 {
1723 for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10)
1724 {
1725 next if $gapAdd > $printLength;
1726 my $bp = substr($printSeq, $gapAdd - 1, 1) . " ";
1727 substr($printSeq, $gapAdd - 1, 1, $bp);
1728 }
1729 }
1730 print PHYLIP "$printSeq\n";
1731 }
1732 print PHYLIP "\n" if ($interleaveCount * $interleaveLength <= $maxLength);
1733
1734 # foreach my $entry (@accNum)
1735 # {
1736 # next if ($deletedSeq{$entry});
1737 #
1738 # my $phylipName = $nameLabel{$entry};
1739 #
1740 # # Check name label and adjust to proper length if needed
1741 # if (length($phylipName) < $maxTaxLength)
1742 # {
1743 # $shortNameCount{$phylipName}++;
1744 # $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed
1745 # }
1746 # else
1747 # {
1748 # my $trimmedName = substr($phylipName, 0 , $maxTaxLength);
1749 # $shortNameCount{$trimmedName}++;
1750 # if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers
1751 # {
1752 # $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName}));
1753 # $phylipName .= $shortNameCount{$trimmedName};
1754 # $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed
1755 # }
1756 # else
1757 # {
1758 # $phylipName = $trimmedName;
1759 # }
1760 # }
1761 #
1762 # print PHYLIP "$phylipName";
1763 # print PHYLIP " " if ($phylipExtPrint);
1764 # print PHYLIP "$finalSeq{$entry}\n";
1765 }
1766 close PHYLIP;
1767 }
1768
1769 # Print Se-Al-formatted file (on demand)
1770 if ($sealPrint)
1771 {
1772 print "\tWriting to Se_Al file $sealOut ...\n";
1773 open (SEAL, ">$sealOut") or die "Cannot open Se-Al file for aligned DNA sequences, $sealOut\n";
1774 print SEAL "Database={\n";
1775 print SEAL "\tID='MLst';\n";
1776 print SEAL "\tOwner=null;\n";
1777 print SEAL "\tName=null;\n";
1778 print SEAL "\tDescription=null;\n";
1779 print SEAL "\tFlags=0;\n";
1780 print SEAL "\tCount=2;\n";
1781 print SEAL "\t{\n\t\t{\n";
1782
1783 print SEAL "\t\t\tID='PAli';\n";
1784 print SEAL "\t\t\tOwner=1;\n";
1785 print SEAL "\t\t\tName=\"$seqFile\";\n";
1786 print SEAL "\t\t\tDescription=null;\n";
1787 print SEAL "\t\t\tFlags=0;\n";
1788 print SEAL "\t\t\tNumSites=$maxLength;\n";
1789 print SEAL "\t\t\tType=";
1790 if ($seqType eq "nucleotide")
1791 {
1792 print SEAL "\"Nucleotide\"";
1793 }
1794 else
1795 {
1796 print SEAL "\"Amino Acid\"";
1797 }
1798 print SEAL ";\n";
1799 print SEAL "\t\t\tFeatures=null;\n";
1800 print SEAL "\t\t\tColourMode=";
1801 if ($seqType eq "nucleotide")
1802 {
1803 print SEAL "1";
1804 }
1805 else
1806 {
1807 print SEAL "2";
1808 }
1809 print SEAL ";\n";
1810 print SEAL "\t\t\tLabelMode=0;\n";
1811 print SEAL "\t\t\ttriplets=false;\n";
1812 print SEAL "\t\t\tinverse=true;\n";
1813 print SEAL "\t\t\tCount=$ntax;\n";
1814 print SEAL "\t\t\t{\n";
1815
1816 my $i = 0;
1817 foreach my $sequence (@accNum)
1818 {
1819 next if ($deletedSeq{$sequence});
1820 $i++;
1821 print SEAL "\t\t\t\t{\n";
1822 print SEAL "\t\t\t\t\tID='PSeq';\n";
1823 print SEAL "\t\t\t\t\tOwner=2;\n";
1824 print SEAL "\t\t\t\t\tName=\"$nameLabel{$sequence}\";\n";
1825 print SEAL "\t\t\t\t\tDescription=null;\n";
1826 print SEAL "\t\t\t\t\tFlags=0;\n";
1827 print SEAL "\t\t\t\t\tAccession=";
1828 if ($sequence =~/^tAlign_/)
1829 {
1830 print SEAL "null;\n";
1831 }
1832 else
1833 {
1834 print SEAL "$sequence;\n";
1835 }
1836 if ($seqType eq "nucleotide")
1837 {
1838 print SEAL "\t\t\t\t\tType=\"DNA\";\n";
1839 }
1840 else
1841 {
1842 print SEAL "\t\t\t\t\tType=\"AA\";\n";
1843 }
1844 print SEAL "\t\t\t\t\tLength=".length($finalSeq{$sequence}).";\n";
1845 print SEAL "\t\t\t\t\tSequence=\"$finalSeq{$sequence}\";\n";
1846 if (defined $geneticCode{$sequence} and $geneticCode{$sequence} != 0)
1847 {
1848 print SEAL "\t\t\t\t\tGeneticCode=$gb2seal{$geneticCode{$sequence}};\n";
1849 }
1850 else
1851 {
1852 print SEAL "\t\t\t\t\tGeneticCode=-1;\n"; # Default for Se-Al is non-coding
1853 }
1854 print SEAL "\t\t\t\t\tCodeTable=null;\n";
1855 print SEAL "\t\t\t\t\tFrame=1;\n";
1856 print SEAL "\t\t\t\t\tFeatures=null;\n";
1857 print SEAL "\t\t\t\t\tParent=null;\n";
1858 print SEAL "\t\t\t\t\tComplemented=false;\n";
1859 print SEAL "\t\t\t\t\tReversed=false;\n";
1860 print SEAL "\t\t\t\t}";
1861 print SEAL "," unless ($i == $ntax);
1862 print SEAL "\n";
1863 }
1864
1865 print SEAL "\t\t\t};\n";
1866 print SEAL "\t\t},\n";
1867 print SEAL "\t\t{\n";
1868 print SEAL "\t\t\tID='MCoL';\n";
1869 print SEAL "\t\t\tOwner=1;\n";
1870 print SEAL "\t\t\tName=\"Genetic Codes\";\n";
1871 print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n";
1872 print SEAL "\t\t\tFlags=0;\n";
1873 print SEAL "\t\t\tCount=0;\n";
1874 print SEAL "\t\t}\n";
1875 print SEAL "\t};\n";
1876 print SEAL "};\n";
1877 close SEAL;
1878
1879 if (-e "/Developer/Tools/SetFile")
1880 {
1881 system ("/Developer/Tools/SetFile -t 'TEXT' $sealOut");
1882 system ("/Developer/Tools/SetFile -c 'SEAL' $sealOut");
1883 }
1884 }
1885 }
1886
1887 sub geneticCoder # Create translation tables for all genetic codes
1888 {
1889 my %geneticCode = ('1' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1890 '2' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG',
1891 '3' => 'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1892 '4' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1893 '5' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG',
1894 '6' => 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1895 '9' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
1896 '10' => 'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1897 '11' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1898 '12' => 'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1899 '13' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG',
1900 '14' => 'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
1901 '15' => 'FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1902 '16' => 'FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1903 '21' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
1904 '22' => 'FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1905 '23' => 'FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG');
1906
1907 foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23))
1908 {
1909 # Establish basic translation table for each genetic code
1910 # print "\nEstablishing \"$transTable{$code}\" genetic code ...\n" if ($debug);
1911 my $position = 0;
1912 foreach my $base1 (qw (T C A G))
1913 {
1914 foreach my $base2 (qw (T C A G))
1915 {
1916 foreach my $base3 (qw (T C A G))
1917 {
1918 my $codon = $base1.$base2.$base3;
1919 $DNAtoAA{$code}{$codon} = substr($geneticCode{$code}, $position, 1);
1920 # print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug);
1921 $position++;
1922 }
1923 }
1924 }
1925
1926 # Extend translation table to account for ambiguity codes (note: does not account for gaps)
1927 # print "\nExtending translation table to account for ambiguity codes ...\n" if ($debug);
1928 foreach my $firstPos (@ambigList)
1929 {
1930 foreach my $secondPos (@ambigList)
1931 {
1932 foreach my $thirdPos (@ambigList)
1933 {
1934 my $codon = $firstPos.$secondPos.$thirdPos;
1935 next if (defined $DNAtoAA{$code}{$codon});
1936 my $refAA = "";
1937 foreach my $firstNT (@ {$constitNTlist{$firstPos} })
1938 {
1939 last if (defined $DNAtoAA{$code}{$codon});
1940 foreach my $secondNT (@ {$constitNTlist{$secondPos} })
1941 {
1942 last if (defined $DNAtoAA{$code}{$codon});
1943 foreach my $thirdNT (@ {$constitNTlist{$thirdPos} })
1944 {
1945 my $testCodon = $firstNT.$secondNT.$thirdNT;
1946 if (not $refAA)
1947 {
1948 $refAA = $DNAtoAA{$code}{$testCodon};
1949 }
1950 else
1951 {
1952 if ($DNAtoAA{$code}{$testCodon} ne $refAA)
1953 {
1954 $DNAtoAA{$code}{$codon} = "?";
1955 last;
1956 }
1957 }
1958 }
1959 }
1960 }
1961 $DNAtoAA{$code}{$codon} = $refAA if (not defined $DNAtoAA{$code}{$codon});
1962 # print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug);
1963 }
1964 }
1965 }
1966 }
1967 return;
1968 }
1969
1970 sub translate # Translate a DNA sequence to an AA sequence (note: does not account for gaps)
1971 {
1972 my $DNAseq = shift;
1973 my $userCode = shift;
1974
1975 my $protSeq;
1976 for (my $codonStart = 0; $codonStart < length($DNAseq); $codonStart += 3)
1977 {
1978 if (length($DNAseq) - $codonStart >= 3) # Codon is complete; translate
1979 {
1980 my $codon = substr($DNAseq, $codonStart, 3);
1981 if ($codon =~ /-/ or $codon =~ /\./)
1982 {
1983 $protSeq .= "?";
1984 }
1985 else
1986 {
1987 $protSeq .= $DNAtoAA{$userCode}{$codon};
1988 }
1989 }
1990 else # Incomplete codon; automatically translates as ?
1991 {
1992 $protSeq .= "?";
1993 }
1994 }
1995
1996 return $protSeq;
1997 }
1998
1999 # Version history
2000 #
2001 # v1.2 (August 12, 2010)
2002 # - added switch to actually allow fasta output to be specified (or not)
2003 # - sets TYPE and CREATOR codes for nexus files on Mac systems when
2004 # SetFile is present
2005 # - can now parse GenBank formatted output (both pure GenBank and BioEdit
2006 # versions)
2007 # - error checking: warns if same accession number used more than once
2008 # and skips subsequent entires
2009 # - added ability to:
2010 # - detect sequence type of input (nucleotide vs protein) and set as
2011 # appropriate in output files
2012 # - convert nucleotide input to proteins
2013 # - convert sequence input to haplotypes
2014 # - interleave nexus- and phylip-formatted output (request by Michael
2015 # Craige) and to use inputted value to specify line lengths in fasta
2016 # output
2017 # - output individual data partitions specified according to nexus-
2018 # formatted charset statements
2019 # - output jackknifed data sets, each missing a single taxon
2020 # - clean sequence labels of non-alphanumeric characters (on by
2021 # default)
2022 # - convert all ~ gap characters (e.g., from BioEdit) to -
2023 # - convert all ambiguous nucleotides to Ns
2024 # - change flnaking gaps to Ns (indirect request by Simon Creer)
2025 # - fixed classic phylip output such that it now conforms 100% to the
2026 # phylip guidelines
2027 # - improved parsing of MacClade generating files, including blocking out
2028 # MacClade and PAUP blocks when reading in nexus-formatted files
2029 # - fixed translation between GenBank and Se-Al genetic codes
2030 # - changed all instances of #nexus to #NEXUS in output files for
2031 # compatibility with TNT (thanks to Douglas Chester for spotting this)
2032 # - minor bug fixes
2033 #
2034 # v1.1 (March 2, 2006)
2035 # - added ability to batch convert all specified file types in working
2036 # directory (use -d*)
2037 # - updated to seqRead module 1.1.1 (includes autodetection of sequence
2038 # format)
2039 # - checks that necessary input file(s) exists before proceeding
2040 # - added GNU GPL statement
2041 # - sets TYPE and CREATOR codes for Se-Al files on Mac systems when
2042 # SetFile is present
2043 # - minor bug fixes
2044 #
2045 # v1.0 (May 30, 2005)
2046 # - initial release
2047 #
2048 # This program is free software; you can redistribute it and/or
2049 # modify it under the terms of the GNU General Public License
2050 # as published by the Free Software Foundation; either version 2
2051 # of the License, or (at your option) any later version.
2052 #
2053 # This program is distributed in the hope that it will be useful,
2054 # but WITHOUT ANY WARRANTY; without even the implied warranty of
2055 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2056 # GNU General Public License for more details.
2057 #
2058 # A copy of the GNU General Public License is available at
2059 # http://www.gnu.org/copyleft/gpl.html or by writing to
2060 # the Free Software Foundation, Inc., 51 Franklin Street,
2061 # Fifth Floor, Boston, MA, 02110-1301, USA.
2062