Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison phyloconversion/seqConverterG.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # seqConverter.pl v1.2 | |
4 # Last modified August 12, 2010 21:09 | |
5 # Minor modifications for running in Galaxy by TH Oakley Jan 2012 | |
6 # (c) Olaf R.P. Bininda-Emonds | |
7 # | |
8 # Input: | |
9 # Sequence data in any of fasta, GenBank, nexus, (classic or extended) phylip, | |
10 # or Se-Al formats | |
11 # | |
12 # Output: | |
13 # Sequence data in any of fasta, nexus, (classic or extended) phylip, and/or | |
14 # Se-Al formats. | |
15 # | |
16 # Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g] | |
17 # [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t] | |
18 # [-u] [-v] [-h] | |
19 # [-O]<outfile> specify outfile | |
20 # options: -a = print out accession numbers instead of taxon labels for nexus | |
21 # and phylip output | |
22 # -c<number> = global genetic code (to be used for translation unless | |
23 # otherwise specified) (default = standard code; end | |
24 # with * to override local settings) | |
25 # -d<filename> = file containing raw sequence information; * = batch | |
26 # convert all specified file types in working | |
27 # directory (suffixes must be .fasta / .fas, .gb, | |
28 # .nexus / .nex, .phylip / .ph, or .seal as | |
29 # appropriate) | |
30 # -g = do not convert ~ gap characters (e.g., from BioEdit) to - | |
31 # -G = convert flanking gaps to Ns | |
32 # -H = convert sequence data to haplotype data | |
33 # -i<f|g|n|p|s> = format of sequence file (fasta (f), GenBank (g), | |
34 # nexus (n), phylip (p), or Se-Al (s)); default = | |
35 # autodetect | |
36 # -j = produce jackknifed data sets each with a single sequence | |
37 # deleted | |
38 # -l<number> = interleave nexus- and/or phylip-formatted output | |
39 # (default = sequential) with specified sequence length | |
40 # (between 10 and 100 inclusive; default = 80); | |
41 # NOTE: also sets default interleave length for fasta | |
42 # output | |
43 # -n = convert ambiguous nucleotides to Ns | |
44 # -o<f|n|pc|pe|s> = output results in any or all of fasta (f), nexus | |
45 # (n), classic or extended phylip (pc or pe), | |
46 # and/or Se-Al (s) formats (NOTE: flag can be | |
47 # invoked multiple times) | |
48 # -r<a|i> = order sequences in final output alphabetically by name | |
49 # (a; default) or in input order from file (i) | |
50 # -s = split data set into individual partitions according to nexus | |
51 # charset statements | |
52 # -t = translate sequences into amino acids | |
53 # -u = interactive user-input mode | |
54 # -h = print this message and quit | |
55 # -v = verbose output | |
56 | |
57 use strict; | |
58 use POSIX; | |
59 #THO added ability for Galaxy -G to specify outfile name | |
60 my $OutFile; | |
61 | |
62 # Set user-input defaults and associated parameters | |
63 # Data set variables | |
64 my $inputType = ""; # Options are "fasta", "GenBank", "nexus", "phylip", and "Se-Al" | |
65 my ($readFile, @seqFiles); | |
66 my $dataSource; | |
67 my $seqType = "nucleotide"; | |
68 my $globalGenCode = 0; | |
69 my $globalOverride = 0; | |
70 | |
71 my (@accNum, %nameLabel, %sequence, %geneticCode, %accPresent); | |
72 my (@charsetList, %charsetStart, %charsetEnd); | |
73 my (%deletedSeq, %finalSeq); | |
74 my $seqCount; | |
75 my $ntax; | |
76 my $nameClean = 1; | |
77 my $skipDuplicates = 1; | |
78 | |
79 # User input variables | |
80 my $seqOrder = "alphabetical"; # Options are "alphabetical" (default) and "input" | |
81 my $seqSplit = 0; | |
82 my $jackknife = 0; | |
83 my $gapChange = 1; | |
84 my $ambigChange = 0; | |
85 my $translateSeqs = 0; | |
86 my $haploTyping = 0; | |
87 my $haploFile; | |
88 my $flankGap = 0; | |
89 | |
90 # Translation variables and genetic codes | |
91 my %transTable = ('1' => 'standard', | |
92 '2' => 'vertebrate mitochondrial', | |
93 '3' => 'yeast mitochondrial', | |
94 '4' => 'mold, protozoan and colenterate mitochondrial and mycoplasam/spiroplasma', | |
95 '5' => 'invertebrate mitochondrial', | |
96 '6' => 'ciliate, dasycladacean and hexamita nuclear', | |
97 '9' => 'echinoderm mitochondrial', | |
98 '10' => 'euplotid nuclear', | |
99 '11' => 'bacterial and plant plastid', | |
100 '12' => 'alternative yeast nuclear', | |
101 '13' => 'ascidian mitochondrial', | |
102 '14' => 'alternative flatworm mitochondrial', | |
103 '15' => 'Blepharisma nuclear', | |
104 '16' => 'chlorophycean mitochondrial', | |
105 '21' => 'trematode mitochondrial', | |
106 '22' => 'Scenedesmus obliquus mitochondrial', | |
107 '23' => 'Thraustochytrium mitochondrial'); | |
108 my %gb2seal = ('1' => '0', | |
109 '2' => '1', | |
110 '3' => '2', | |
111 '4' => '3', | |
112 '4' => '4', | |
113 '5' => '5', | |
114 '6' => '6', | |
115 '9' => '7', | |
116 '10' => '8', | |
117 '11' => '9', | |
118 '12' => '10', | |
119 '13' => '11', | |
120 '14' => '12', | |
121 '15' => '13', | |
122 '16' => '1', | |
123 '21' => '1', | |
124 '22' => '1', | |
125 '23' => '1'); | |
126 my %seal2gb = ('0' => '1', | |
127 '1' => '2', | |
128 '2' => '3', | |
129 '3' => '4', | |
130 '4' => '4', | |
131 '5' => '5', | |
132 '6' => '6', | |
133 '7' => '9', | |
134 '8' => '10', | |
135 '9' => '11', | |
136 '10' => '12', | |
137 '11' => '13', | |
138 '12' => '14', | |
139 '13' => '15'); | |
140 my %genCodes; | |
141 foreach my $code (qw(0 1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) | |
142 { | |
143 $genCodes{$code} = 1; | |
144 } | |
145 my %DNAtoAA; | |
146 my @ambigList = qw(A C G T M R W S Y K V H D B N); | |
147 my %ambigCode = ('A' => 'A', 'C' => 'C', 'G' => 'G', 'T' => 'T', | |
148 'AC' => 'M', 'AG' => 'R', 'AT' => 'W', 'CG' => 'S', 'CT' => 'Y', 'GT' => 'K', | |
149 'ACG' => 'V', 'ACT' => 'H', 'AGT' => 'D', 'CGT' => 'B', 'ACGT' => 'N'); | |
150 my (%constitNTlist); | |
151 while (my ($nt, $code) = each %ambigCode) # Where $nt = key and $code = value | |
152 { | |
153 push @{$constitNTlist{$code}}, $_ foreach (split("",$nt)); | |
154 } | |
155 | |
156 # Output variables | |
157 my $maxLength; | |
158 my $fastaPrint = 0; | |
159 my $fastaOut; | |
160 my $nexusPrint = 0; | |
161 my $nexusOut; | |
162 my ($phylipTradPrint, $phylipExtPrint) = (0, 0); | |
163 my $phylipOut; | |
164 my $sealPrint = 0; | |
165 my $sealOut; | |
166 my $outFormat = "sequential"; | |
167 my $interleaveLength = 80; | |
168 my $fastaLength = 80; | |
169 my $accPrint = 0; | |
170 | |
171 # Miscellaneous variables | |
172 my $verbose = 0; | |
173 my $debug = 0; | |
174 my $perlScript = "seqConverterG.pl"; | |
175 my $version = "1.2"; | |
176 | |
177 # Read in user input | |
178 if (not @ARGV or join(' ', @ARGV) =~ /\s-u/ or $ARGV[0] =~ /^-u/) # Enter interactive user-input mode | |
179 { | |
180 print "Entering interactive user-input mode. Type \"q\" at any prompt to exit program.\n"; | |
181 | |
182 # Get print format | |
183 my $defaultAcc = ($accPrint) ? "y" : "n"; | |
184 undef $accPrint; | |
185 until (defined $accPrint) | |
186 { | |
187 print "\tPrint out accession numbers only to nexus- and/or phylip output (y|n) [$defaultAcc]: "; | |
188 $accPrint = <stdin>; | |
189 chomp ($accPrint); | |
190 exit(0) if ($accPrint eq "q"); | |
191 if (substr($accPrint, 0, 1) =~ /^n/i or $accPrint eq "") | |
192 { | |
193 $accPrint = 0; | |
194 } | |
195 elsif (substr($accPrint, 0, 1) =~ /^y/i) | |
196 { | |
197 $accPrint = 1; | |
198 } | |
199 else | |
200 { | |
201 print "\t\tInvalid input ($accPrint)\n"; | |
202 undef $accPrint; | |
203 } | |
204 } | |
205 | |
206 # Get datafile | |
207 until (defined $readFile) | |
208 { | |
209 print "\tEnter name of data file (* = batch convert): "; | |
210 $readFile = <stdin>; | |
211 chomp ($readFile); | |
212 exit(0) if ($readFile eq "q"); | |
213 unless (-e $readFile or $readFile eq "*") | |
214 { | |
215 print "\t\tFile '$readFile' does not exist\n"; | |
216 undef $readFile; | |
217 } | |
218 } | |
219 push @seqFiles, $readFile; | |
220 | |
221 # Get format of datafile | |
222 my $defaultInput = "autodetect"; | |
223 undef $inputType; | |
224 until (defined $inputType) | |
225 { | |
226 print "\tEnter format of file $readFile (fasta|GenBank|nexus|phylip|Se-Al) [$defaultInput]: "; | |
227 $inputType = <stdin>; | |
228 chomp ($inputType); | |
229 exit(0) if ($inputType =~ /^q/i); | |
230 if (substr($inputType, 0, 1) =~ /^a/i or $inputType eq "") | |
231 { | |
232 $inputType = "autodetect"; | |
233 } | |
234 elsif (substr($inputType, 0, 1) =~ /^f/i) | |
235 { | |
236 $inputType = "fasta"; | |
237 } | |
238 elsif (substr($inputType, 0, 1) =~ /^g/i) | |
239 { | |
240 $inputType = "GenBank"; | |
241 } | |
242 elsif (substr($inputType, 0, 1) =~ /^n/i) | |
243 { | |
244 $inputType = "nexus"; | |
245 } | |
246 elsif (substr($inputType, 0, 1) =~ /^p/i) | |
247 { | |
248 $inputType = "phylip"; | |
249 } | |
250 elsif (substr($inputType, 0, 1) =~ /^s/i) | |
251 { | |
252 $inputType = "Se-Al"; | |
253 } | |
254 else | |
255 { | |
256 print "\t\tInvalid input ($inputType)\n"; | |
257 undef $inputType; | |
258 } | |
259 } | |
260 $inputType = "" if ($inputType eq "autodetect"); | |
261 | |
262 # Get whether or not to clean sequence labels | |
263 my $defaultClean = ($nameClean) ? "y" : "n"; | |
264 undef $nameClean; | |
265 until (defined $nameClean) | |
266 { | |
267 print "\tClean sequence labels by changing non-alphanumeric characters to underscores (y|n)? [$defaultClean]: "; | |
268 $nameClean = <stdin>; | |
269 chomp ($nameClean); | |
270 exit(0) if ($nameClean =~ /^q/i); | |
271 if (substr($nameClean, 0, 1) =~ /^y/i or $nameClean eq "") | |
272 { | |
273 $nameClean = 1; | |
274 } | |
275 elsif (substr($nameClean, 0, 1) =~ /^n/i) | |
276 { | |
277 $nameClean = 0; | |
278 } | |
279 else | |
280 { | |
281 print "\t\tInvalid input ($seqSplit)\n"; | |
282 undef $nameClean; | |
283 } | |
284 } | |
285 | |
286 # Get whether or not to split sequences | |
287 my $defaultSplit = ($seqSplit) ? "y" : "n"; | |
288 undef $seqSplit; | |
289 until (defined $seqSplit) | |
290 { | |
291 print "\tSplit into individual partitions following charset statements (y|n)? [$defaultSplit]: "; | |
292 $seqSplit = <stdin>; | |
293 chomp ($seqSplit); | |
294 exit(0) if ($seqSplit =~ /^q/i); | |
295 if (substr($seqSplit, 0, 1) =~ /^n/i or $seqSplit eq "") | |
296 { | |
297 $seqSplit = 0; | |
298 } | |
299 elsif (substr($seqSplit, 0, 1) =~ /^y/i) | |
300 { | |
301 $seqSplit = 1; | |
302 } | |
303 else | |
304 { | |
305 print "\t\tInvalid input ($seqSplit)\n"; | |
306 undef $seqSplit; | |
307 } | |
308 } | |
309 | |
310 # Get whether or not to jackknife sequences | |
311 my $defaultJack = ($jackknife) ? "y" : "n"; | |
312 undef $jackknife; | |
313 until (defined $jackknife) | |
314 { | |
315 print "\tProduce replicate, jackknifed data sets (y|n)? [$defaultJack]: "; | |
316 $jackknife = <stdin>; | |
317 chomp ($jackknife); | |
318 exit(0) if ($jackknife =~ /^q/i); | |
319 if (substr($jackknife, 0, 1) =~ /^n/i or $jackknife eq "") | |
320 { | |
321 $jackknife = 0; | |
322 } | |
323 elsif (substr($jackknife, 0, 1) =~ /^y/i) | |
324 { | |
325 $jackknife = 1; | |
326 } | |
327 else | |
328 { | |
329 print "\t\tInvalid input ($jackknife)\n"; | |
330 undef $jackknife; | |
331 } | |
332 } | |
333 | |
334 # Get whether or not to convert gaps | |
335 my $defaultGaps = ($gapChange) ? "y" : "n"; | |
336 undef $gapChange; | |
337 until (defined $gapChange) | |
338 { | |
339 print "\tConvert ~ gap characters to - (y|n)? [$defaultGaps]: "; | |
340 $gapChange = <stdin>; | |
341 chomp ($gapChange); | |
342 exit(0) if ($gapChange =~ /^q/i); | |
343 if (substr($gapChange, 0, 1) =~ /^n/i) | |
344 { | |
345 $gapChange = 0; | |
346 } | |
347 elsif (substr($gapChange, 0, 1) =~ /^y/i or $gapChange eq "") | |
348 { | |
349 $gapChange = 1; | |
350 } | |
351 else | |
352 { | |
353 print "\t\tInvalid input ($gapChange)\n"; | |
354 undef $gapChange; | |
355 } | |
356 } | |
357 | |
358 # Get whether or not to convert flanking gaps | |
359 my $defaultFlank = ($flankGap) ? "y" : "n"; | |
360 undef $flankGap; | |
361 until (defined $flankGap) | |
362 { | |
363 print "\tConvert flanking gap characters to Ns (y|n)? [$defaultFlank]: "; | |
364 $flankGap = <stdin>; | |
365 chomp ($flankGap); | |
366 exit(0) if ($flankGap =~ /^q/i); | |
367 if (substr($flankGap, 0, 1) =~ /^n/i) | |
368 { | |
369 $flankGap = 0; | |
370 } | |
371 elsif (substr($flankGap, 0, 1) =~ /^y/i or $flankGap eq "") | |
372 { | |
373 $flankGap = 1; | |
374 } | |
375 else | |
376 { | |
377 print "\t\tInvalid input ($flankGap)\n"; | |
378 undef $flankGap; | |
379 } | |
380 } | |
381 | |
382 # Get whether or not to convert ambiguous nucleotides | |
383 my $defaultAmbig = ($ambigChange) ? "y" : "n"; | |
384 undef $ambigChange; | |
385 until (defined $ambigChange) | |
386 { | |
387 print "\tConvert ambiguous nucleotides to Ns (y|n)? [$defaultAmbig]: "; | |
388 $ambigChange = <stdin>; | |
389 chomp ($ambigChange); | |
390 exit(0) if ($ambigChange =~ /^q/i); | |
391 if (substr($ambigChange, 0, 1) =~ /^n/i or $ambigChange eq "") | |
392 { | |
393 $ambigChange = 0; | |
394 } | |
395 elsif (substr($ambigChange, 0, 1) =~ /^y/i) | |
396 { | |
397 $ambigChange = 1; | |
398 } | |
399 else | |
400 { | |
401 print "\t\tInvalid input ($ambigChange)\n"; | |
402 undef $ambigChange; | |
403 } | |
404 } | |
405 | |
406 # Get output order of sequences | |
407 my $defaultOrder = $seqOrder; | |
408 undef $seqOrder; | |
409 until (defined $seqOrder) | |
410 { | |
411 print "\tEnter output order for sequences (alphabetical|clustal|input file) [$defaultOrder]: "; | |
412 $seqOrder = <stdin>; | |
413 chomp ($seqOrder); | |
414 exit(0) if ($seqOrder =~ /^q/i); | |
415 if (substr($seqOrder, 0, 1) =~ /^i/i) | |
416 { | |
417 $seqOrder = "input"; | |
418 } | |
419 elsif (substr($seqOrder, 0, 1) =~ /^a/i or $seqOrder eq "") | |
420 { | |
421 $seqOrder = "alphabetical"; | |
422 } | |
423 else | |
424 { | |
425 print "\t\tInvalid input ($seqOrder)\n"; | |
426 undef $seqOrder; | |
427 } | |
428 } | |
429 | |
430 # Get whether or not to convert to haplotypes | |
431 my $defaultHaplo = ($haploTyping) ? "y" : "n"; | |
432 undef $haploTyping; | |
433 until (defined $haploTyping) | |
434 { | |
435 print "\tConvert sequence data to haplotypes (y|n)? [$defaultHaplo]: "; | |
436 $haploTyping = <stdin>; | |
437 chomp ($haploTyping); | |
438 exit(0) if ($haploTyping =~ /^q/i); | |
439 if (substr($haploTyping, 0, 1) =~ /^n/i or $haploTyping eq "") | |
440 { | |
441 $haploTyping = 0; | |
442 } | |
443 elsif (substr($haploTyping, 0, 1) =~ /^y/i) | |
444 { | |
445 $haploTyping = 1; | |
446 } | |
447 else | |
448 { | |
449 print "\t\tInvalid input ($haploTyping)\n"; | |
450 undef $haploTyping; | |
451 } | |
452 } | |
453 | |
454 # Get whether or not to convert to amino acids | |
455 my $defaultTranslate = ($translateSeqs) ? "y" : "n"; | |
456 undef $translateSeqs; | |
457 until (defined $translateSeqs) | |
458 { | |
459 print "\tConvert sequence data to amino acids (y|n)? [$defaultTranslate]: "; | |
460 $translateSeqs = <stdin>; | |
461 chomp ($translateSeqs); | |
462 exit(0) if ($translateSeqs =~ /^q/i); | |
463 if (substr($translateSeqs, 0, 1) =~ /^n/i or $translateSeqs eq "") | |
464 { | |
465 $translateSeqs = 0; | |
466 } | |
467 elsif (substr($translateSeqs, 0, 1) =~ /^y/i) | |
468 { | |
469 $translateSeqs = 1; | |
470 } | |
471 else | |
472 { | |
473 print "\t\tInvalid input ($translateSeqs)\n"; | |
474 undef $translateSeqs; | |
475 } | |
476 } | |
477 | |
478 # Get genetic code for translation | |
479 my $defaultCode = $globalGenCode; | |
480 $globalGenCode = 99; | |
481 print "\tGenetic codes available for protein translation:\n"; | |
482 foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) | |
483 { | |
484 print "\t\t$code: $transTable{$code}"; | |
485 print "\n"; | |
486 } | |
487 until (defined $genCodes{$globalGenCode}) | |
488 { | |
489 $globalOverride = 0; | |
490 print "\tEnter global genetic code to be applied (0 = no translation; follow with an * to override local code) [$defaultCode]: "; | |
491 $globalGenCode = <stdin>; | |
492 chomp ($globalGenCode); | |
493 exit(0) if ($globalGenCode =~ /^q/i); | |
494 $globalGenCode = $defaultCode if ($globalGenCode eq ""); | |
495 if ($globalGenCode =~ /\*$/) | |
496 { | |
497 $globalOverride = 1; | |
498 $globalGenCode =~ s/\*$//; | |
499 } | |
500 print "\t\tInvalid input ($globalGenCode)\n" unless (defined $genCodes{$globalGenCode} or $globalGenCode == 0); | |
501 } | |
502 | |
503 # Get output formats | |
504 my $defaultFasta = ($fastaPrint) ? "y" : "n"; | |
505 undef $fastaPrint; | |
506 until (defined $fastaPrint) | |
507 { | |
508 print "\tOutput results in fasta format (y|n) [$defaultFasta]: "; | |
509 $fastaPrint = <stdin>; | |
510 chomp ($fastaPrint); | |
511 exit(0) if ($fastaPrint =~ /^q/i); | |
512 if (substr($fastaPrint, 0, 1) =~ /^y/i) | |
513 { | |
514 $fastaPrint = 1; | |
515 } | |
516 elsif (substr($fastaPrint, 0, 1) =~ /^n/i or $fastaPrint eq "") | |
517 { | |
518 $fastaPrint = 0; | |
519 } | |
520 else | |
521 { | |
522 print "\t\tInvalid input ($fastaPrint)\n"; | |
523 undef $fastaPrint; | |
524 } | |
525 } | |
526 | |
527 my $defaultNexus = ($nexusPrint) ? "y" : "n"; | |
528 undef $nexusPrint; | |
529 until (defined $nexusPrint) | |
530 { | |
531 print "\tOutput results in nexus format (y|n) [$defaultNexus]: "; | |
532 $nexusPrint = <stdin>; | |
533 chomp ($nexusPrint); | |
534 exit(0) if ($nexusPrint =~ /^q/i); | |
535 if (substr($nexusPrint, 0, 1) =~ /^y/i) | |
536 { | |
537 $nexusPrint = 1; | |
538 } | |
539 elsif (substr($nexusPrint, 0, 1) =~ /^n/i or $nexusPrint eq "") | |
540 { | |
541 $nexusPrint = 0; | |
542 } | |
543 else | |
544 { | |
545 print "\t\tInvalid input ($nexusPrint)\n"; | |
546 undef $nexusPrint; | |
547 } | |
548 } | |
549 | |
550 my $defaultPhylip = ($phylipTradPrint) ? "y" : "n"; | |
551 undef $phylipTradPrint; | |
552 until (defined $phylipTradPrint or $phylipExtPrint) | |
553 { | |
554 print "\tOutput results in traditional phylip format (y|n) [$defaultPhylip]: "; | |
555 $phylipTradPrint = <stdin>; | |
556 chomp ($phylipTradPrint); | |
557 exit(0) if ($phylipTradPrint =~ /^q/i); | |
558 if (substr($phylipTradPrint, 0, 1) =~ /^y/i) | |
559 { | |
560 $phylipTradPrint = 1; | |
561 } | |
562 elsif (substr($phylipTradPrint, 0, 1) =~ /^n/i or $phylipTradPrint eq "") | |
563 { | |
564 $phylipTradPrint = 0; | |
565 } | |
566 else | |
567 { | |
568 print "\t\tInvalid input ($phylipTradPrint)\n"; | |
569 undef $phylipTradPrint; | |
570 } | |
571 } | |
572 | |
573 if ($phylipTradPrint == 0) # Check for extended format | |
574 { | |
575 my $defaultPhylip = ($phylipExtPrint) ? "y" : "n"; | |
576 undef $phylipExtPrint; | |
577 until (defined $phylipExtPrint or $phylipExtPrint) | |
578 { | |
579 print "\tOutput results in extended phylip format (y|n) [$defaultPhylip]: "; | |
580 $phylipExtPrint = <stdin>; | |
581 chomp ($phylipExtPrint); | |
582 exit(0) if ($phylipExtPrint =~ /^q/i); | |
583 if (substr($phylipExtPrint, 0, 1) =~ /^y/i) | |
584 { | |
585 $phylipExtPrint = 1; | |
586 } | |
587 elsif (substr($phylipExtPrint, 0, 1) =~ /^n/i or $phylipExtPrint eq "") | |
588 { | |
589 $phylipExtPrint = 0; | |
590 } | |
591 else | |
592 { | |
593 print "\t\tInvalid input ($phylipExtPrint)\n"; | |
594 undef $phylipExtPrint; | |
595 } | |
596 } | |
597 } | |
598 | |
599 my $defaultSeal = ($sealPrint) ? "y" : "n"; | |
600 undef $sealPrint; | |
601 until (defined $sealPrint) | |
602 { | |
603 print "\tOutput results in Se-Al format (y|n) [$defaultSeal]: "; | |
604 $sealPrint = <stdin>; | |
605 chomp ($sealPrint); | |
606 exit(0) if ($sealPrint =~ /^q/i); | |
607 if (substr($sealPrint, 0, 1) =~ /^y/i) | |
608 { | |
609 $sealPrint = 1; | |
610 } | |
611 elsif (substr($sealPrint, 0, 1) =~ /^n/i or $sealPrint eq "") | |
612 { | |
613 $sealPrint = 0; | |
614 } | |
615 else | |
616 { | |
617 print "\t\tInvalid input ($sealPrint)\n"; | |
618 undef $sealPrint; | |
619 } | |
620 } | |
621 | |
622 if ($nexusPrint or $phylipTradPrint or $phylipExtPrint) | |
623 { | |
624 my $defaultInterleave = ($outFormat) ? "y" : "n"; | |
625 undef $outFormat; | |
626 until (defined $outFormat) | |
627 { | |
628 print "\tInterleave output for nexus- and/or phylip-formatted output with length (n|10-100) [$defaultInterleave]: "; | |
629 $outFormat = <stdin>; | |
630 chomp ($outFormat); | |
631 exit(0) if ($outFormat =~ /^q/i); | |
632 if ($outFormat =~ /(\d+)/) | |
633 { | |
634 if ($outFormat =~ /\D/ or $outFormat < 10 or $outFormat > 100) | |
635 { | |
636 print "\t\tInvalid input ($outFormat)\n"; | |
637 undef $outFormat; | |
638 } | |
639 else | |
640 { | |
641 $outFormat = $1; | |
642 } | |
643 } | |
644 elsif (substr($outFormat, 0, 1) =~ /^n/i or $outFormat eq "") | |
645 { | |
646 $outFormat = "sequential"; | |
647 } | |
648 else | |
649 { | |
650 print "\t\tInvalid input ($outFormat)\n"; | |
651 undef $outFormat; | |
652 } | |
653 } | |
654 if ($outFormat ne "sequential") | |
655 { | |
656 $interleaveLength = $outFormat; | |
657 $outFormat = "interleave"; | |
658 } | |
659 } | |
660 | |
661 # Get verbose output mode | |
662 my $defaultVerbose = ($verbose) ? "y" : "n"; | |
663 undef $verbose; | |
664 until (defined $verbose) | |
665 { | |
666 print "\tOutput verbose results to screen (y|n) [$defaultVerbose]: "; | |
667 $verbose = <stdin>; | |
668 chomp ($verbose); | |
669 exit(0) if ($verbose =~ /^q/i); | |
670 if (substr($verbose, 0, 1) =~ /^y/i) | |
671 { | |
672 $verbose = 1; | |
673 print "\n"; | |
674 } | |
675 elsif (substr($verbose, 0, 1) =~ /^n/i or $verbose eq "") | |
676 { | |
677 $verbose = 0; | |
678 } | |
679 elsif (substr($verbose, 0, 1) =~ /^x/i or $verbose eq "") | |
680 { | |
681 $verbose = $debug = 1; | |
682 } | |
683 else | |
684 { | |
685 print "\t\tInvalid input ($verbose)\n"; | |
686 undef $verbose; | |
687 } | |
688 } | |
689 } | |
690 elsif (join(' ', @ARGV) =~ /\s-h/ or $ARGV[0] =~ /^-h/) # Print help screen | |
691 { | |
692 print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n"; | |
693 print " [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]\n"; | |
694 print " [-u] [-v] [-h]\n"; | |
695 print "Version: $version\n"; | |
696 print "Options: -a = print out accession numbers instead of taxon labels for nexus\n"; | |
697 print " and phylip output\n"; | |
698 print " -c<number> = global genetic code (to be used for translation unless\n"; | |
699 print " otherwise specified) (default = standard code; end\n"; | |
700 print " with * to override local settings)\n"; | |
701 foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) | |
702 { | |
703 print " $code: $transTable{$code}"; | |
704 print "\n"; | |
705 } | |
706 print " -d<filename> = file containing raw sequence information; * = batch\n"; | |
707 print " convert all specified file types in working\n"; | |
708 print " directory (suffixes must be .fasta / .fas, .gb\n"; | |
709 print " .nexus / .nex, .phylip / .ph, or .seal as\n"; | |
710 print " appropriate)\n"; | |
711 print " -g = do not convert ~ gap characters (e.g., from BioEdit) to -\n"; | |
712 print " -G = convert flanking gaps to Ns\n"; | |
713 print " -H = convert sequence data to haplotype data\n"; | |
714 print " -i<f|g|n|p|s> = format of sequence file (fasta (f), GenBank (g),\n"; | |
715 print " nexus (n), phylip (p), or Se-Al (s)); default =\n"; | |
716 print " autodetect\n"; | |
717 print " -j = produce jackknifed data sets each with a single sequence\n"; | |
718 print " deleted\n"; | |
719 print " -l<number> = interleave nexus- and/or phylip-formatted output\n"; | |
720 print " (default = sequential) with specified sequence length\n"; | |
721 print " (between 10 and 100 inclusive; default = 80);\n"; | |
722 print " NOTE: also sets default interleave length for fasta\n"; | |
723 print " output\n"; | |
724 print " -n = convert ambiguous nucleotides to Ns\n"; | |
725 print " -o<f|n|pc|pe|s> = output results in any or all of fasta (f), nexus\n"; | |
726 print " (n), classic or extended phylip (pc or pe),\n"; | |
727 print " and/or Se-Al (s) formats (NOTE: flag can be\n"; | |
728 print " invoked multiple times)\n"; | |
729 print " -r<a|i> = order sequences in final output alphabetically by name\n"; | |
730 print " (a; default) or in input order from file (i)\n"; | |
731 print " -s = split data set into individual partitions according to nexus\n"; | |
732 print " charset statements\n"; | |
733 print " -t = translate sequences into amino acids\n"; | |
734 print " -u = interactive user-input mode\n"; | |
735 print " -h = print this message and quit\n"; | |
736 print " -v = verbose output\n"; | |
737 exit(0); | |
738 } | |
739 else # Process switches | |
740 { | |
741 for (my $i = 0; $i <= $#ARGV; $i++) | |
742 { | |
743 if ($ARGV[$i] eq "-a") | |
744 { | |
745 $accPrint = 1; | |
746 } | |
747 elsif ($ARGV[$i] =~ /-c(\d+)/) | |
748 { | |
749 $globalGenCode = $1; | |
750 if ($globalGenCode != 0 and not defined $genCodes{$globalGenCode}) | |
751 { | |
752 print "Don't understand argument: $ARGV[$i]\n"; | |
753 print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n"; | |
754 print " [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t] [-u]\n"; | |
755 print " [-v] [-h]\n"; | |
756 print "Version: $version\n"; | |
757 exit(1); | |
758 } | |
759 $globalOverride = 1 if ($ARGV[$i] =~ /\*$/); | |
760 } | |
761 elsif ($ARGV[$i] =~ /^-d(.*)/) | |
762 { | |
763 $readFile = $1; | |
764 unless ($readFile eq "*") | |
765 { | |
766 die "ERROR: Data file $readFile does not exist.\n" unless (-e $readFile); | |
767 } | |
768 push @seqFiles, $readFile; | |
769 } | |
770 elsif ($ARGV[$i] eq "-g") | |
771 { | |
772 $gapChange = 0; | |
773 } | |
774 elsif ($ARGV[$i] eq "-G") | |
775 { | |
776 $flankGap = 1; | |
777 } | |
778 elsif ($ARGV[$i] eq "-H") | |
779 { | |
780 $haploTyping = 1; | |
781 } | |
782 elsif ($ARGV[$i] eq "-if") | |
783 { | |
784 $inputType = "fasta"; | |
785 } | |
786 elsif ($ARGV[$i] eq "-ig") | |
787 { | |
788 $inputType = "GenBank"; | |
789 } | |
790 elsif ($ARGV[$i] eq "-in") | |
791 { | |
792 $inputType = "nexus"; | |
793 } | |
794 elsif ($ARGV[$i] eq "-ip") | |
795 { | |
796 $inputType = "phylip"; | |
797 } | |
798 elsif ($ARGV[$i] eq "-is") | |
799 { | |
800 $inputType = "Se-Al"; | |
801 } | |
802 elsif ($ARGV[$i] eq "-j") | |
803 { | |
804 $jackknife = 1; | |
805 } | |
806 elsif ($ARGV[$i] eq "-k") | |
807 { | |
808 $nameClean = 0; | |
809 } | |
810 elsif ($ARGV[$i] =~ /-l(\d+)?/) | |
811 { | |
812 $interleaveLength = $1 if (defined $1); | |
813 $outFormat = "interleave"; | |
814 } | |
815 elsif ($ARGV[$i] eq "-n") | |
816 { | |
817 $ambigChange = 1; | |
818 } | |
819 elsif ($ARGV[$i] eq "-of") | |
820 { | |
821 $fastaPrint = 1; | |
822 } | |
823 elsif ($ARGV[$i] eq "-on") | |
824 { | |
825 $nexusPrint = 1; | |
826 } | |
827 elsif ($ARGV[$i] eq "-opc") | |
828 { | |
829 $phylipTradPrint = 1; | |
830 } | |
831 elsif ($ARGV[$i] eq "-ope") | |
832 { | |
833 $phylipExtPrint = 1; | |
834 } | |
835 elsif ($ARGV[$i] eq "-os") | |
836 { | |
837 $sealPrint = 1; | |
838 } | |
839 elsif ($ARGV[$i] eq "-ra") | |
840 { | |
841 $seqOrder = "alphabetical"; | |
842 } | |
843 elsif ($ARGV[$i] eq "-ri") | |
844 { | |
845 $seqOrder = "input"; | |
846 } | |
847 elsif ($ARGV[$i] eq "-s") | |
848 { | |
849 $seqSplit = 1; | |
850 } | |
851 elsif ($ARGV[$i] eq "-t") | |
852 { | |
853 $translateSeqs = 1; | |
854 $globalGenCode = 1 unless ($globalGenCode); | |
855 } | |
856 elsif ($ARGV[$i] eq "-v") | |
857 { | |
858 $verbose = 1; | |
859 } | |
860 elsif ($ARGV[$i] eq "-x") | |
861 { | |
862 $debug = 1; | |
863 $verbose = 1; | |
864 } | |
865 elsif ($ARGV[$i] =~ /^-O(.*)/) | |
866 { | |
867 $OutFile = $1; | |
868 } | |
869 else | |
870 { | |
871 print "Don't understand argument: $ARGV[$i]\n"; | |
872 print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n"; | |
873 print " [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]\n"; | |
874 print " [-u] [-v] [-h]\n"; | |
875 print "Version: $version\n"; | |
876 exit(1); | |
877 } | |
878 } | |
879 } | |
880 $accPrint = 0 if ($accPrint eq "n"); | |
881 geneticCoder() if ($globalGenCode); | |
882 $fastaLength = $interleaveLength; | |
883 | |
884 # Check for I/O errors | |
885 die "ERROR: Must supply name of file containing sequence data.\n" if (not @seqFiles); | |
886 die "ERROR: Must supply at least one output format.\n" unless ($fastaPrint or $nexusPrint or $phylipTradPrint or $phylipExtPrint or $sealPrint); | |
887 die "ERROR: Sequence length for interleaved format must be between 10 and 100 inclusive.\n" if (($fastaPrint or (($nexusPrint or $phylipTradPrint or $phylipExtPrint) and $outFormat eq "interleave")) and ($interleaveLength < 10 or $interleaveLength > 100)); | |
888 | |
889 # Read in sequence data | |
890 if ($seqFiles[0] eq "*") # Batch convert all appropriate files in working directory | |
891 { | |
892 if ($inputType) | |
893 { | |
894 undef @seqFiles; | |
895 | |
896 system("ls -d * > convertList.txt"); | |
897 setLineBreak("convertList.txt"); | |
898 open (LIST, "<convertList.txt") or die "Cannot open file containing names of all sequence files, convertList.txt\n"; | |
899 while (<LIST>) | |
900 { | |
901 chomp; | |
902 next unless ($_); | |
903 push @seqFiles, $_ if ($inputType eq "Se-Al" and $_ =~ /\.seal$/); | |
904 push @seqFiles, $_ if ($inputType eq "phylip" and ($_ =~ /\.phylip$/ or $_ =~ /\.ph$/)); | |
905 push @seqFiles, $_ if ($inputType eq "nexus" and ($_ =~ /\.nexus$/ or $_ =~ /\.nex$/)); | |
906 push @seqFiles, $_ if ($inputType eq "GenBank" and ($_ =~ /\.gb$/)); | |
907 push @seqFiles, $_ if ($inputType eq "fasta" and ($_ =~ /\.fasta$/ or $_ =~ /\.fas$/)); | |
908 } | |
909 close LIST; | |
910 unlink ("convertList.txt") unless ($debug); | |
911 | |
912 die "ERROR: No files of type $inputType found for batch conversion.\n" if (not @seqFiles); | |
913 } | |
914 else | |
915 { | |
916 die "ERROR: Must specify input file type for batch conversion\n"; | |
917 } | |
918 } | |
919 | |
920 foreach my $seqFile (@seqFiles) | |
921 { | |
922 print "\nConverting file $seqFile ...\n"; | |
923 | |
924 # Set output file names | |
925 $dataSource = $seqFile; | |
926 $dataSource =~ s/\.\w+$//; | |
927 | |
928 if ($fastaPrint) | |
929 { | |
930 $fastaOut = $dataSource . ".fasta"; | |
931 $fastaOut =~ s/\.fasta$/_new.fasta/ if ($fastaOut eq $seqFile); | |
932 $fastaOut =~ s/\.fasta$/_haplo.fasta/ if ($haploTyping); | |
933 } | |
934 if ($nexusPrint) | |
935 { | |
936 # $nexusOut = $dataSource . ".nex"; | |
937 # Fixed Output File name much easier for Galaxy | |
938 $nexusOut = $OutFile; | |
939 $nexusOut =~ s/\.nex$/_new.nex/ if ($nexusOut eq $seqFile); | |
940 $nexusOut =~ s/\.nex$/_haplo.nex/ if ($haploTyping); | |
941 } | |
942 if ($phylipTradPrint or $phylipExtPrint) | |
943 { | |
944 # $phylipOut = $dataSource . ".phylip"; | |
945 # Fixed Output File name much easier for Galaxy | |
946 $phylipOut = $OutFile; | |
947 $phylipOut =~ s/\.phylip$/_new.phylip/ if ($phylipOut eq $seqFile); | |
948 $phylipOut =~ s/\.phylip$/_haplo.phylip/ if ($haploTyping); | |
949 } | |
950 if ($sealPrint) | |
951 { | |
952 $sealOut = $dataSource . ".seal"; | |
953 $sealOut =~ s/\.seal$/_new.seal/ if ($sealOut eq $seqFile); | |
954 $sealOut =~ s/\.seal$/_haplo.seal/ if ($haploTyping); | |
955 } | |
956 | |
957 $haploFile = $dataSource . "_haplotypeSeqs.txt"; | |
958 | |
959 $phylipExtPrint = 0 if ($phylipTradPrint); | |
960 | |
961 # Read in sequence data | |
962 # Clear variables | |
963 undef @accNum; | |
964 undef %nameLabel; | |
965 undef %sequence; | |
966 undef %geneticCode; | |
967 undef %accPresent; | |
968 undef %deletedSeq; | |
969 undef %finalSeq; | |
970 undef $seqCount; | |
971 undef $ntax; | |
972 undef @charsetList; | |
973 undef %charsetStart; | |
974 undef %charsetEnd; | |
975 my (@haploList, %haploID, %haploSeqs); | |
976 $maxLength = 0; | |
977 | |
978 seqRead($seqFile); | |
979 if (not @accNum) | |
980 { | |
981 print "\tERROR: Could not read in sequences from file $seqFile; skipping to next file\n"; | |
982 next; | |
983 } | |
984 | |
985 # Process for printing | |
986 my $stopCodonCount; | |
987 $seqType = "protein" if ($globalGenCode and $translateSeqs); | |
988 foreach my $seq (@accNum) | |
989 { | |
990 $sequence{$seq} =~ s/\~/-/g if ($gapChange); | |
991 $sequence{$seq} =~ s/R|Y|M|K|S|W|H|B|V|D/N/ig if ($ambigChange and $seqType eq "nucleotide"); | |
992 | |
993 $finalSeq{$seq} = $sequence{$seq}; | |
994 $geneticCode{$seq} = $globalGenCode if ($globalOverride); | |
995 if ($globalGenCode and $translateSeqs) | |
996 { | |
997 $finalSeq{$seq} = translate($sequence{$seq}, $geneticCode{$seq}); | |
998 $stopCodonCount++ if (($finalSeq{$seq} =~ tr/\*//) > 2); # Check for stop codons | |
999 } | |
1000 $maxLength = length($finalSeq{$seq}) if (length($finalSeq{$seq}) > $maxLength); | |
1001 $nameLabel{$seq} =~ s/\W+/_/g if ($nameClean); # Clean sequence labels of non-alphanumerics | |
1002 } | |
1003 printf "\n\tWARNING: $stopCodonCount of %s sequences had more than two stop codons; check for\n\t\t1) proper reading frame,\n\t\t2) proper genetic code, or\n\t\t3) that sequences are coding DNA\n", scalar(@accNum) if ($globalGenCode and $stopCodonCount); | |
1004 | |
1005 # Add gaps to end of any sequence less than maximum length | |
1006 foreach my $seq (@accNum) | |
1007 { | |
1008 $finalSeq{$seq} .= "-" x ($maxLength - length($sequence{$seq})); | |
1009 $sequence{$seq} = $finalSeq{$seq}; | |
1010 } | |
1011 | |
1012 # Convert flanking gaps to Ns if desired | |
1013 if ($flankGap) | |
1014 { | |
1015 foreach my $seq (@accNum) | |
1016 { | |
1017 if ($finalSeq{$seq} =~ /^(\-+)/) | |
1018 { | |
1019 my $startGap = $1; | |
1020 my $startN = "N" x length($startGap); | |
1021 $finalSeq{$seq} =~s/^$startGap/$startN/; | |
1022 } | |
1023 if ($finalSeq{$seq} =~ /(\-+)$/) | |
1024 { | |
1025 my $endGap = $1; | |
1026 my $endN = "N" x length($endGap); | |
1027 $finalSeq{$seq} =~s/$endGap$/$endN/; | |
1028 } | |
1029 } | |
1030 } | |
1031 | |
1032 # Determine haplotypes as needed | |
1033 if ($haploTyping) | |
1034 { | |
1035 foreach my $entry (@accNum) | |
1036 { | |
1037 $haploID{$entry} = 0; | |
1038 | |
1039 foreach my $haploType (@haploList) | |
1040 { | |
1041 if ($finalSeq{$entry} eq $finalSeq{$haploType}) # Matches existing haplotype; add to list | |
1042 { | |
1043 $haploID{$entry} = $haploType; | |
1044 push @{$haploSeqs{$haploType}}, $entry; | |
1045 last; | |
1046 } | |
1047 } | |
1048 | |
1049 if (not $haploID{$entry}) # No match to existing haplotype; define new haplotype | |
1050 { | |
1051 $haploID{$entry} = "haplo" . (scalar(@haploList) + 1); | |
1052 push @haploList, $haploID{$entry}; | |
1053 $nameLabel{$haploID{$entry}} = $haploID{$entry}; | |
1054 $finalSeq{$haploID{$entry}} = $finalSeq{$entry}; | |
1055 push @{$haploSeqs{$haploID{$entry}}}, $entry; | |
1056 } | |
1057 } | |
1058 | |
1059 undef @accNum; | |
1060 @accNum = @haploList; | |
1061 | |
1062 open (HAPLO, ">$haploFile") or die "Cannot print to file $haploFile\n"; | |
1063 foreach my $haploType (@haploList) | |
1064 { | |
1065 print HAPLO "$haploType:"; | |
1066 print HAPLO "\t$nameLabel{$_}" foreach @{$haploSeqs{$haploType}}; | |
1067 print HAPLO "\n"; | |
1068 } | |
1069 close HAPLO; | |
1070 } | |
1071 | |
1072 # Print results! | |
1073 print "\nPrinting results ...\n"; | |
1074 @accNum = sort { $nameLabel{$a} cmp $nameLabel{$b} } keys %nameLabel if ($seqOrder eq "alphabetical" and not $haploTyping); | |
1075 | |
1076 # Print full data set | |
1077 $ntax = scalar(@accNum); | |
1078 seqPrint($seqFile); | |
1079 | |
1080 # Print jackknifed data sets | |
1081 if ($jackknife) | |
1082 { | |
1083 my $delseqCount = 0; | |
1084 foreach my $seq (@accNum) | |
1085 { | |
1086 $deletedSeq{$seq} = 1; | |
1087 $delseqCount++; | |
1088 | |
1089 # Change output file names | |
1090 $fastaOut = $dataSource . "_jack$delseqCount.fasta"; | |
1091 $nexusOut = $dataSource . "_jack$delseqCount.nex"; | |
1092 $phylipOut = $dataSource . "_jack$delseqCount.phylip"; | |
1093 $sealOut = $dataSource . "_jack$delseqCount.seal"; | |
1094 | |
1095 $ntax = scalar(@accNum) - 1; | |
1096 seqPrint($seqFile); | |
1097 | |
1098 $deletedSeq{$seq} = 0; # Reset deleted sequence | |
1099 } | |
1100 } | |
1101 | |
1102 # Print data set partitions | |
1103 if (@charsetList and $seqSplit) | |
1104 { | |
1105 my $delCount = 0; | |
1106 foreach my $partition (@charsetList) | |
1107 { | |
1108 $delCount = 0; | |
1109 # Change output file names | |
1110 $fastaOut = $dataSource . "_$partition.fasta"; | |
1111 $nexusOut = $dataSource . "_$partition.nex"; | |
1112 $phylipOut = $dataSource . "_$partition.phylip"; | |
1113 $sealOut = $dataSource . "_$partition.seal"; | |
1114 | |
1115 # Restrict sequence to partition limits | |
1116 foreach my $seq (@accNum) | |
1117 { | |
1118 $deletedSeq{$seq} = 0; # Reset all deleted sequences | |
1119 $finalSeq{$seq} = substr($sequence{$seq}, $charsetStart{$partition} - 1, $charsetEnd{$partition} - $charsetStart{$partition} + 1); | |
1120 # Check that sequence remains informative | |
1121 unless ($finalSeq{$seq} =~ /a/i or $finalSeq{$seq} =~ /c/i or $finalSeq{$seq} =~ /g/i or $finalSeq{$seq} =~ /t/i) | |
1122 { | |
1123 $delCount++; | |
1124 $deletedSeq{$seq} = 1; | |
1125 } | |
1126 } | |
1127 $ntax = scalar(@accNum) - $delCount; | |
1128 $maxLength = $charsetEnd{$partition} - $charsetStart{$partition} + 1; | |
1129 seqPrint($seqFile); | |
1130 } | |
1131 } | |
1132 } | |
1133 | |
1134 exit(0); | |
1135 | |
1136 ### Subroutines used in the program | |
1137 | |
1138 sub setLineBreak # Check line breaks of input files and set input record separator accordingly | |
1139 { | |
1140 my $inFile = shift; | |
1141 $/ ="\n"; | |
1142 open (IN, "<$inFile") or die "Cannot open $inFile to check form of line breaks.\n"; | |
1143 while (<IN>) | |
1144 { | |
1145 if ($_ =~ /\r\n/) | |
1146 { | |
1147 print "\tDOS line breaks detected ...\n" if ($verbose); | |
1148 $/ ="\r\n"; | |
1149 last; | |
1150 } | |
1151 elsif ($_ =~ /\r/) | |
1152 { | |
1153 print "\tMac line breaks detected ...\n" if ($verbose); | |
1154 $/ ="\r"; | |
1155 last; | |
1156 } | |
1157 else | |
1158 { | |
1159 print "\tUnix line breaks detected ...\n" if ($verbose); | |
1160 $/ ="\n"; | |
1161 last; | |
1162 } | |
1163 } | |
1164 close IN; | |
1165 } | |
1166 | |
1167 sub seqRead | |
1168 { | |
1169 my $seqFile = shift; | |
1170 undef %sequence; | |
1171 | |
1172 print "\nReading in sequence data from file $seqFile (type is $inputType) ...\n" if ($inputType); | |
1173 setLineBreak($seqFile); | |
1174 open (SEQ, "<$seqFile") or die "Cannot open file containing sequences, $seqFile\n"; | |
1175 my ($header, $tempAcc, $tempName, $tempSeq); | |
1176 my $fastaAcc; | |
1177 my (%nexusSpecies, %nexusAcc, $nexusRead, $commentFlag); | |
1178 my ($phylipLineCount, $phylipTaxa, $phylipChars, %phylipSeq); | |
1179 my $sealCode; | |
1180 my ($sealDelFlag, $owner) = (0, 0); | |
1181 my ($gbAcc, $gbRead); | |
1182 my $macBlock = 0; | |
1183 my %accCount; | |
1184 | |
1185 while (<SEQ>) | |
1186 { | |
1187 chomp; | |
1188 my $lineRead = $_; | |
1189 next unless ($lineRead); | |
1190 | |
1191 # Autodetect sequence format | |
1192 if (not $inputType) | |
1193 { | |
1194 $inputType = "fasta" if ($lineRead =~ /^>/); | |
1195 $inputType = "nexus" if ($lineRead =~ /\#nexus/i); | |
1196 $inputType = "phylip" if ($lineRead =~ /^\s*\d+\s+\d+/); | |
1197 $inputType = "Se-Al" if ($lineRead =~ /^\s*Database=\{/i); | |
1198 $inputType = "GenBank" if ($lineRead =~ /^\s*LOCUS/); | |
1199 print "\nReading in sequence data from file $seqFile (type determined to be $inputType) ...\n" if ($inputType); | |
1200 } | |
1201 | |
1202 if ($inputType eq "nexus") | |
1203 { | |
1204 # Check if charset statement present | |
1205 if ($lineRead =~ /charset/i) | |
1206 { | |
1207 $lineRead =~ s/\s+//g; | |
1208 $lineRead =~ s/;$//; | |
1209 | |
1210 my ($charsetName, $charsetBounds) = split('=', $lineRead); | |
1211 $charsetName =~ s/charset//i; | |
1212 push @charsetList, $charsetName; | |
1213 if ($charsetBounds =~ /(\d+)-*(\d*)/) | |
1214 { | |
1215 $charsetStart{$charsetName} = $1; | |
1216 if ($2) | |
1217 { | |
1218 $charsetEnd{$charsetName} = $2; | |
1219 } | |
1220 else | |
1221 { | |
1222 $charsetEnd{$charsetName} = $1; | |
1223 } | |
1224 } | |
1225 } | |
1226 | |
1227 # Otherwise block out MacClade / PAUP blocks | |
1228 $macBlock = 1 if ($lineRead =~ /begin macclade;/i or $lineRead =~ /begin paup;/i); | |
1229 $macBlock = 0 if ($macBlock and $lineRead =~ /end;/i); | |
1230 next if ($macBlock); | |
1231 | |
1232 # Otherwise only read in data lines or charset statements | |
1233 if ($lineRead =~ /^\s*matrix/i) | |
1234 { | |
1235 $nexusRead = 1; | |
1236 next; | |
1237 } | |
1238 $nexusRead = 0 if ($lineRead =~ /;\s*$/); | |
1239 next unless ($nexusRead); | |
1240 | |
1241 # Remove MacClade sequence lengths | |
1242 $lineRead =~ s/\[\d+\]$//; | |
1243 | |
1244 $commentFlag = 1 if ($lineRead =~ /\[/); | |
1245 if ($lineRead =~ /\]/) | |
1246 { | |
1247 $commentFlag = 0; | |
1248 next; | |
1249 } | |
1250 next if ($commentFlag); | |
1251 | |
1252 next unless ($lineRead =~ /a/i or $lineRead =~ /c/i or $lineRead =~ /g/i or $lineRead =~ /t/i or $lineRead =~ /n/i or $lineRead =~ /\?/ or $lineRead =~ /\-/); | |
1253 | |
1254 # Clean up input line | |
1255 $lineRead =~ s/^\s+//; | |
1256 $lineRead =~ s/\'//g; | |
1257 | |
1258 my @nexusLine = split(/\s+/, $lineRead); | |
1259 my $species = shift(@nexusLine); | |
1260 $species =~ s/\s+/_/g; | |
1261 $species =~ s/\_+/_/g; | |
1262 my $seq = join('', @nexusLine); | |
1263 $seq =~ s/\s+//g; | |
1264 $seqType = "protein" if ($seq =~ /E/i or $seq =~ /Q/i or $seq =~ /I/i or $seq =~ /L/i or $seq =~ /F/i or $seq =~ /P/i); | |
1265 if (not defined $nexusSpecies{$species}) | |
1266 { | |
1267 $nexusSpecies{$species} = 1; | |
1268 $seqCount++; | |
1269 $nexusAcc{$species} = "tAlign_".$seqCount; | |
1270 push @accNum, $nexusAcc{$species}; | |
1271 $nameLabel{$nexusAcc{$species}} = $species; | |
1272 $sequence{$nexusAcc{$species}} = uc($seq); | |
1273 $geneticCode{$nexusAcc{$species}} = $globalGenCode; | |
1274 } | |
1275 else # Sequences are in interleaved format; append sequence | |
1276 { | |
1277 $sequence{$nexusAcc{$species}} .= uc($seq); | |
1278 } | |
1279 } | |
1280 | |
1281 if ($inputType eq "fasta") | |
1282 { | |
1283 if ($lineRead =~/^\s*>/) | |
1284 { | |
1285 my $species; | |
1286 $seqCount++; | |
1287 (my $tempSpecies = $lineRead) =~ s/^\s*>//; | |
1288 | |
1289 if ($tempSpecies =~ /^Mit\.\s+/) # Entry comes from European RNA project | |
1290 { | |
1291 $tempSpecies =~ s/^Mit\.\s+//i; # To fix entries from European RNA project | |
1292 my @speciesInfo = split(/\s+/, $tempSpecies); | |
1293 $species = join('_', $speciesInfo[0], $speciesInfo[1]); | |
1294 if (defined $speciesInfo[2]) | |
1295 { | |
1296 $fastaAcc = $speciesInfo[2]; | |
1297 $accCount{$fastaAcc}++; | |
1298 if ($accCount{$fastaAcc} > 1) | |
1299 { | |
1300 print "\nWARNING: Accession number $fastaAcc used more than once"; | |
1301 print "; skipping this and subsequent entries" if ($skipDuplicates); | |
1302 print "\n"; | |
1303 } | |
1304 } | |
1305 else | |
1306 { | |
1307 $fastaAcc = "tAlign_".$seqCount; | |
1308 $accCount{$fastaAcc}++; | |
1309 } | |
1310 } | |
1311 else | |
1312 { | |
1313 my @speciesLine = split(/\s+/, $tempSpecies); | |
1314 if ($speciesLine[$#speciesLine] =~ /^\(?\w+\d+\)?$/ and scalar(@speciesLine) > 1) # Check whether last entry is an accession number | |
1315 { | |
1316 $fastaAcc = pop (@speciesLine); | |
1317 $fastaAcc =~ s/^\(//g; | |
1318 $fastaAcc =~ s/\)$//g; | |
1319 $accCount{$fastaAcc}++; | |
1320 if ($accCount{$fastaAcc} > 1) | |
1321 { | |
1322 print "\nWARNING: Accession number $fastaAcc used more than once"; | |
1323 if ($skipDuplicates) | |
1324 { | |
1325 print "; skipping this and subsequent entries\n"; | |
1326 } | |
1327 else | |
1328 { | |
1329 print "; assigning temporary accession number\n"; | |
1330 $fastaAcc = "tAlign_" . $seqCount; | |
1331 } | |
1332 } | |
1333 } | |
1334 else | |
1335 { | |
1336 $fastaAcc = "tAlign_".$seqCount; | |
1337 $accCount{$fastaAcc}++; | |
1338 } | |
1339 $species = join('_', @speciesLine); | |
1340 $species = "Sequence_".$seqCount if ($species eq ""); | |
1341 } | |
1342 push @accNum, $fastaAcc unless ($accCount{$fastaAcc} > 1 and $skipDuplicates); | |
1343 $geneticCode{$fastaAcc} = $globalGenCode; | |
1344 ($nameLabel{$fastaAcc} = $species) =~ s/\s+/_/; | |
1345 $nameLabel{$fastaAcc} =~ s/\_+/_/; | |
1346 } | |
1347 else | |
1348 { | |
1349 next if ($accCount{$fastaAcc} > 1 and $skipDuplicates); | |
1350 $lineRead =~ s/\s+//g; | |
1351 $seqType = "protein" if ($lineRead =~ /E/i or $lineRead =~ /Q/i or $lineRead =~ /I/i or $lineRead =~ /L/i or $lineRead =~ /F/i or $lineRead =~ /P/i); | |
1352 $sequence{$fastaAcc} .= uc($lineRead); | |
1353 } | |
1354 } | |
1355 | |
1356 if ($inputType eq "Se-Al") | |
1357 { | |
1358 my $header; | |
1359 $sealDelFlag = 1 if ($lineRead =~/MCoL/); # Se-Al sometimes places deleted species at end of file; do not read in remainder of file | |
1360 next if ($sealDelFlag == 1); | |
1361 next unless ($lineRead =~/NumSites/i or $lineRead =~/Owner/i or $lineRead =~/Name/i or $lineRead =~/Accession/i or $lineRead =~/Sequence/i or $lineRead =~/GeneticCode/i or $lineRead =~/Frame/i); | |
1362 if ($lineRead =~/Owner\s*\=\s*(\d+)/i) | |
1363 { | |
1364 $owner = $1; | |
1365 } | |
1366 if ($lineRead =~/Accession/i and $owner == 2) | |
1367 { | |
1368 $seqCount++; | |
1369 if ($lineRead =~ /null/ or $lineRead =~ /\"\"/) | |
1370 { | |
1371 $tempAcc = "tAlign_" . $seqCount; | |
1372 $accCount{$tempAcc}++; | |
1373 } | |
1374 else | |
1375 { | |
1376 ($header, $tempAcc) = split (/=/, $lineRead); | |
1377 $tempAcc =~ s/\"//g; | |
1378 $tempAcc =~ s/;//g; | |
1379 $accCount{$tempAcc}++; | |
1380 if ($accCount{$tempAcc} > 1) | |
1381 { | |
1382 print "\nWARNING: Accession number $fastaAcc used more than once"; | |
1383 if ($skipDuplicates) | |
1384 { | |
1385 print "; skipping this and subsequent entries\n"; | |
1386 } | |
1387 else | |
1388 { | |
1389 print "; assigning temporary accession number\n"; | |
1390 $tempAcc = "tAlign_" . $seqCount; | |
1391 } | |
1392 print "\n"; | |
1393 } | |
1394 } | |
1395 push @accNum, $tempAcc unless ($accCount{$tempAcc} > 1 and $skipDuplicates); | |
1396 } | |
1397 if ($lineRead =~/Name/i and $owner == 2) | |
1398 { | |
1399 ($header, $tempName) = split (/=/, $lineRead); | |
1400 $tempName =~ s/\"//g; | |
1401 $tempName =~ s/\s*;//g; | |
1402 $tempName =~ s/\s+/_/g; | |
1403 $tempName =~ s/\_+/_/g; | |
1404 } | |
1405 if ($lineRead =~/GeneticCode=(\d)/i and $owner == 2) | |
1406 { | |
1407 $sealCode = $1; | |
1408 $geneticCode{$tempAcc} = $seal2gb{$sealCode}; | |
1409 } | |
1410 if ($lineRead =~/Sequence/i and $owner == 2) | |
1411 { | |
1412 next if ($accCount{$tempAcc} > 1 and $skipDuplicates); | |
1413 ($header, $tempSeq) = split (/=/, $lineRead); | |
1414 $tempSeq =~ s/\"//g; | |
1415 $tempSeq =~ s/;//g; | |
1416 $tempSeq =~ s/\s+//g; | |
1417 $nameLabel{$tempAcc} = $tempName; | |
1418 $sequence{$tempAcc} = uc($tempSeq); | |
1419 $seqType = "protein" if ($tempSeq =~ /E/i or $tempSeq =~ /Q/i or $tempSeq =~ /I/i or $tempSeq =~ /L/i or $tempSeq =~ /F/i or $tempSeq =~ /P/i); | |
1420 } | |
1421 if ($lineRead =~/Frame=(\d)/i) # Correct for reading frame | |
1422 { | |
1423 my $readingFrame = $1; | |
1424 $sequence{$tempAcc} = "--" . $sequence{$tempAcc} if ($readingFrame == 2); | |
1425 $sequence{$tempAcc} = "-" . $sequence{$tempAcc} if ($readingFrame == 3); | |
1426 } | |
1427 } | |
1428 | |
1429 if ($inputType eq "phylip") | |
1430 { | |
1431 if ($lineRead =~ /^\s*(\d+)\s+(\d+)/) | |
1432 { | |
1433 $phylipTaxa = $1; | |
1434 $phylipChars = $2; | |
1435 $phylipLineCount = 0; | |
1436 } | |
1437 else | |
1438 { | |
1439 $phylipLineCount++; | |
1440 | |
1441 $lineRead =~ s/\s//g; | |
1442 | |
1443 $phylipSeq{$phylipLineCount} .= $lineRead; | |
1444 | |
1445 $phylipLineCount = 0 if ($phylipLineCount == $phylipTaxa); | |
1446 } | |
1447 } | |
1448 | |
1449 if ($inputType eq "GenBank") | |
1450 { | |
1451 # Get species name and accession number | |
1452 # Pure GenBank format | |
1453 $gbAcc = $1 if ($lineRead =~ /^\s*ACCESSION\s+(\w+\d+)/); | |
1454 if ($lineRead =~ /^\s*ORGANISM\s+/) | |
1455 { | |
1456 $seqCount++; | |
1457 $gbAcc = "tAlign_" . $seqCount if (not defined $gbAcc); | |
1458 $lineRead =~ s/^\s+//; | |
1459 my @orgLine = split(/\s+/, $lineRead); | |
1460 my $header = shift (@orgLine); | |
1461 $nameLabel{$gbAcc} = join('_', @orgLine); | |
1462 $accCount{$gbAcc}++; | |
1463 } | |
1464 # BioEdit format | |
1465 if ($lineRead =~ /^\s*TITLE/ and not defined ($gbAcc)) | |
1466 { | |
1467 $seqCount++; | |
1468 $gbAcc = "tAlign_" . $seqCount; | |
1469 my @accLine = split (/\s+/, $lineRead); | |
1470 $gbAcc = $1 if ($accLine[2] =~ /^\((\w+\d+)\)/); | |
1471 $nameLabel{$gbAcc} = $accLine[1]; | |
1472 $accCount{$gbAcc}++; | |
1473 } | |
1474 | |
1475 if ($lineRead =~ /^\s*ORIGIN/) | |
1476 { | |
1477 if ($accCount{$gbAcc} > 1) | |
1478 { | |
1479 print "\nWARNING: Accession number $gbAcc used more than once"; | |
1480 if ($skipDuplicates) | |
1481 { | |
1482 print "; skipping this and subsequent entries\n"; | |
1483 next; | |
1484 } | |
1485 else | |
1486 { | |
1487 print "; assigning temporary accession number\n"; | |
1488 $gbAcc = "tAlign_" . $seqCount; | |
1489 $gbRead = 1; | |
1490 } | |
1491 } | |
1492 else | |
1493 { | |
1494 $gbRead = 1; | |
1495 $seqCount++; | |
1496 } | |
1497 next; | |
1498 } | |
1499 | |
1500 if ($lineRead =~ /^\s*\/\//) # End of accession; process | |
1501 { | |
1502 $gbRead = 0; | |
1503 next unless ($gbAcc); | |
1504 | |
1505 push @accNum, $gbAcc unless ($accCount{$gbAcc} > 1); | |
1506 $geneticCode{$gbAcc} = $globalGenCode; | |
1507 $nameLabel{$gbAcc} =~ s/\s+/_/g; | |
1508 $nameLabel{$gbAcc} =~ s/\_+/_/g; | |
1509 $sequence{$gbAcc} =~ s/\d//g; | |
1510 $sequence{$gbAcc} =~ s/\s+//g; | |
1511 $sequence{$gbAcc} =~ s/\~/-/g if ($gapChange); | |
1512 undef $gbAcc; | |
1513 } | |
1514 | |
1515 next unless ($gbRead); | |
1516 | |
1517 if ($gbRead and $lineRead =~ /^\s+\d+/) | |
1518 { | |
1519 $sequence{$gbAcc} .= uc($lineRead); | |
1520 } | |
1521 } | |
1522 } | |
1523 close SEQ; | |
1524 | |
1525 if ($inputType eq "phylip") # Postprocess input to derive taxon names and sequence; accounts for both sequential and extended formatting | |
1526 { | |
1527 for (my $i = 1; $i <= $phylipTaxa; $i++) | |
1528 { | |
1529 my $phylipAcc = "tAlign_" . $i; | |
1530 | |
1531 push @accNum, $phylipAcc; | |
1532 $geneticCode{$phylipAcc} = $globalGenCode; | |
1533 | |
1534 # Derive taxon name and sequence | |
1535 $sequence{$phylipAcc} = uc(substr($phylipSeq{$i}, 0 - $phylipChars)); | |
1536 $seqType = "protein" if ($sequence{$phylipAcc} =~ /E/i or $sequence{$phylipAcc} =~ /Q/i or $sequence{$phylipAcc} =~ /I/i or $sequence{$phylipAcc} =~ /L/i or $sequence{$phylipAcc} =~ /F/i or $sequence{$phylipAcc} =~ /P/i); | |
1537 | |
1538 $nameLabel{$phylipAcc} = substr($phylipSeq{$i}, 0, length($phylipSeq{$i}) - $phylipChars); | |
1539 $nameLabel{$phylipAcc} =~ s/\s+/_/g; | |
1540 $nameLabel{$phylipAcc} =~ s/\_+/_/g; | |
1541 } | |
1542 } | |
1543 } | |
1544 | |
1545 sub seqPrint | |
1546 { | |
1547 my $seqFile = shift; | |
1548 | |
1549 $interleaveLength = $maxLength if ($outFormat eq "sequential"); | |
1550 | |
1551 # Print fasta-formatted file | |
1552 if ($fastaPrint) | |
1553 { | |
1554 print "\tWriting to fasta-formatted file $fastaOut ...\n"; | |
1555 open (FASTA, ">$fastaOut") or die "Cannot open fasta file for aligned DNA sequences, $fastaOut"; | |
1556 foreach my $entry (@accNum) | |
1557 { | |
1558 # next if ($deletedSeq{$entry} or not defined $sequence{$entry}); | |
1559 print FASTA ">$nameLabel{$entry}"; | |
1560 print FASTA "\t($entry)" unless ($entry =~ /^tAlign/); | |
1561 print FASTA "\n"; | |
1562 | |
1563 # Print sequence | |
1564 my $fastaSeq = $finalSeq{$entry}; | |
1565 for (my $breakpoint = 0; $breakpoint <= length($fastaSeq); $breakpoint += $fastaLength) | |
1566 { | |
1567 print FASTA substr($fastaSeq, $breakpoint, $fastaLength) . "\n"; | |
1568 } | |
1569 # my $fastaSeq = $finalSeq{$entry}; | |
1570 # my $breakPoint = 80; | |
1571 # my $breakCount = 0; | |
1572 # until ($breakPoint > length($fastaSeq)) | |
1573 # { | |
1574 # $breakCount++; | |
1575 # my $replaceString = "\n" . substr($fastaSeq, $breakPoint, 1); | |
1576 # substr($fastaSeq, $breakPoint, 1) = $replaceString; | |
1577 # $breakPoint += 81; # Latter accounts for addition of \n to string | |
1578 # } | |
1579 # print FASTA "\n$fastaSeq\n"; | |
1580 } | |
1581 close FASTA; | |
1582 } | |
1583 | |
1584 # Print nexus-formatted file | |
1585 if ($nexusPrint) | |
1586 { | |
1587 print "\tWriting to nexus file $nexusOut"; | |
1588 print " in interleaved format" if ($outFormat eq "interleave"); | |
1589 print " ...\n"; | |
1590 open (NEX, ">$nexusOut") or die "Cannot open nexus file for aligned DNA sequences, $nexusOut"; | |
1591 print NEX "#NEXUS\n\n"; | |
1592 print NEX "[File created from $seqFile using $perlScript v$version on ".localtime()."]\n\n"; | |
1593 print NEX "begin data;\n"; | |
1594 print NEX "\tdimensions ntax = $ntax nchar = $maxLength;\n"; | |
1595 print NEX "\tformat datatype = $seqType gap = - missing = ?"; | |
1596 print NEX " interleave" if ($outFormat eq "interleave"); | |
1597 print NEX ";\n\n"; | |
1598 print NEX "\tmatrix\n\n"; | |
1599 for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++) | |
1600 { | |
1601 my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1; | |
1602 my $seqEnd = $interleaveCount * $interleaveLength; | |
1603 $seqEnd = $maxLength if ($maxLength <= $seqEnd); | |
1604 my $printLength = $seqEnd - $seqStart + 1; | |
1605 | |
1606 foreach my $entry (@accNum) | |
1607 { | |
1608 next if ($deletedSeq{$entry}); | |
1609 my $nexusName = $nameLabel{$entry}; | |
1610 $nexusName = $entry if ($accPrint); | |
1611 if ($nexusName =~ /\W/) | |
1612 { | |
1613 print NEX "'$nexusName'"; | |
1614 } | |
1615 else | |
1616 { | |
1617 print NEX "$nexusName"; | |
1618 } | |
1619 | |
1620 my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength); | |
1621 if ($outFormat eq "interleave") # Add gaps every 10 elements in sequence | |
1622 { | |
1623 for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10) | |
1624 { | |
1625 next if $gapAdd > $printLength; | |
1626 my $bp = substr($printSeq, $gapAdd - 1, 1) . " "; | |
1627 substr($printSeq, $gapAdd - 1, 1, $bp); | |
1628 } | |
1629 } | |
1630 print NEX "\t$printSeq\n"; | |
1631 } | |
1632 print NEX "\n" if ($interleaveCount * $interleaveLength <= $maxLength); | |
1633 } | |
1634 # foreach my $entry (@accNum) | |
1635 # { | |
1636 # next if ($deletedSeq{$entry}); | |
1637 # if ($nameLabel{$entry} =~ /\W/) | |
1638 # { | |
1639 # print NEX "'$nameLabel{$entry}'"; | |
1640 # } | |
1641 # else | |
1642 # { | |
1643 # print NEX "$nameLabel{$entry}"; | |
1644 # } | |
1645 # print NEX "\t$finalSeq{$entry}\n"; | |
1646 # } | |
1647 print NEX "\t;\nend;\n"; | |
1648 close NEX; | |
1649 | |
1650 if (-e "/Developer/Tools/SetFile") | |
1651 { | |
1652 system ("/Developer/Tools/SetFile -t 'TEXT' $nexusOut"); | |
1653 system ("/Developer/Tools/SetFile -c 'PAUP' $nexusOut"); | |
1654 } | |
1655 } | |
1656 | |
1657 # Print phylip-formatted file (on demand) | |
1658 if ($phylipTradPrint or $phylipExtPrint) | |
1659 { | |
1660 my $maxTaxLength = 50; | |
1661 $maxTaxLength = 10 if ($phylipTradPrint); | |
1662 my $blankName = " " x $maxTaxLength; | |
1663 my %shortNameCount; | |
1664 | |
1665 print "\tWriting to phylip file $phylipOut ...\n"; | |
1666 open (PHYLIP, ">$phylipOut") or die "Cannot open phylip file for aligned DNA sequences, $phylipOut"; | |
1667 print PHYLIP " $ntax $maxLength\n"; | |
1668 for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++) | |
1669 { | |
1670 my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1; | |
1671 my $seqEnd = $interleaveCount * $interleaveLength; | |
1672 $seqEnd = $maxLength if ($maxLength <= $seqEnd); | |
1673 my $printLength = $seqEnd - $seqStart + 1; | |
1674 | |
1675 # foreach my $entry (@accNum) | |
1676 # { | |
1677 # my $trimmedName = substr($nameLabel{$entry}, 0, $maxTaxLength); | |
1678 # $shortNameCount{trimmedName} = 0; | |
1679 # } | |
1680 | |
1681 foreach my $entry (@accNum) | |
1682 { | |
1683 next if ($deletedSeq{$entry}); | |
1684 | |
1685 my $phylipName = $nameLabel{$entry}; | |
1686 $phylipName = $entry if ($accPrint); | |
1687 | |
1688 # Print name label as appropriate; also check label and adjust to proper length if needed | |
1689 if ($interleaveCount == 1) | |
1690 { | |
1691 if (length($phylipName) < $maxTaxLength) | |
1692 { | |
1693 $shortNameCount{$phylipName}++; | |
1694 $phylipName .= " " x ($maxTaxLength - length($phylipName)) if ($phylipTradPrint); # Pad end of name with spaces as needed | |
1695 } | |
1696 else | |
1697 { | |
1698 my $trimmedName = substr($phylipName, 0, $maxTaxLength); | |
1699 $shortNameCount{$trimmedName}++; | |
1700 if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers | |
1701 { | |
1702 $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName})); | |
1703 $phylipName .= $shortNameCount{$trimmedName}; | |
1704 $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed | |
1705 } | |
1706 else | |
1707 { | |
1708 $phylipName = $trimmedName; | |
1709 } | |
1710 } | |
1711 print PHYLIP "$phylipName"; | |
1712 print PHYLIP " " if ($phylipExtPrint); | |
1713 } | |
1714 else | |
1715 { | |
1716 print PHYLIP "$blankName"; | |
1717 } | |
1718 | |
1719 # Print sequence | |
1720 my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength); | |
1721 if ($outFormat eq "interleave") # Add gaps every 10 elements in sequence | |
1722 { | |
1723 for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10) | |
1724 { | |
1725 next if $gapAdd > $printLength; | |
1726 my $bp = substr($printSeq, $gapAdd - 1, 1) . " "; | |
1727 substr($printSeq, $gapAdd - 1, 1, $bp); | |
1728 } | |
1729 } | |
1730 print PHYLIP "$printSeq\n"; | |
1731 } | |
1732 print PHYLIP "\n" if ($interleaveCount * $interleaveLength <= $maxLength); | |
1733 | |
1734 # foreach my $entry (@accNum) | |
1735 # { | |
1736 # next if ($deletedSeq{$entry}); | |
1737 # | |
1738 # my $phylipName = $nameLabel{$entry}; | |
1739 # | |
1740 # # Check name label and adjust to proper length if needed | |
1741 # if (length($phylipName) < $maxTaxLength) | |
1742 # { | |
1743 # $shortNameCount{$phylipName}++; | |
1744 # $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed | |
1745 # } | |
1746 # else | |
1747 # { | |
1748 # my $trimmedName = substr($phylipName, 0 , $maxTaxLength); | |
1749 # $shortNameCount{$trimmedName}++; | |
1750 # if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers | |
1751 # { | |
1752 # $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName})); | |
1753 # $phylipName .= $shortNameCount{$trimmedName}; | |
1754 # $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed | |
1755 # } | |
1756 # else | |
1757 # { | |
1758 # $phylipName = $trimmedName; | |
1759 # } | |
1760 # } | |
1761 # | |
1762 # print PHYLIP "$phylipName"; | |
1763 # print PHYLIP " " if ($phylipExtPrint); | |
1764 # print PHYLIP "$finalSeq{$entry}\n"; | |
1765 } | |
1766 close PHYLIP; | |
1767 } | |
1768 | |
1769 # Print Se-Al-formatted file (on demand) | |
1770 if ($sealPrint) | |
1771 { | |
1772 print "\tWriting to Se_Al file $sealOut ...\n"; | |
1773 open (SEAL, ">$sealOut") or die "Cannot open Se-Al file for aligned DNA sequences, $sealOut\n"; | |
1774 print SEAL "Database={\n"; | |
1775 print SEAL "\tID='MLst';\n"; | |
1776 print SEAL "\tOwner=null;\n"; | |
1777 print SEAL "\tName=null;\n"; | |
1778 print SEAL "\tDescription=null;\n"; | |
1779 print SEAL "\tFlags=0;\n"; | |
1780 print SEAL "\tCount=2;\n"; | |
1781 print SEAL "\t{\n\t\t{\n"; | |
1782 | |
1783 print SEAL "\t\t\tID='PAli';\n"; | |
1784 print SEAL "\t\t\tOwner=1;\n"; | |
1785 print SEAL "\t\t\tName=\"$seqFile\";\n"; | |
1786 print SEAL "\t\t\tDescription=null;\n"; | |
1787 print SEAL "\t\t\tFlags=0;\n"; | |
1788 print SEAL "\t\t\tNumSites=$maxLength;\n"; | |
1789 print SEAL "\t\t\tType="; | |
1790 if ($seqType eq "nucleotide") | |
1791 { | |
1792 print SEAL "\"Nucleotide\""; | |
1793 } | |
1794 else | |
1795 { | |
1796 print SEAL "\"Amino Acid\""; | |
1797 } | |
1798 print SEAL ";\n"; | |
1799 print SEAL "\t\t\tFeatures=null;\n"; | |
1800 print SEAL "\t\t\tColourMode="; | |
1801 if ($seqType eq "nucleotide") | |
1802 { | |
1803 print SEAL "1"; | |
1804 } | |
1805 else | |
1806 { | |
1807 print SEAL "2"; | |
1808 } | |
1809 print SEAL ";\n"; | |
1810 print SEAL "\t\t\tLabelMode=0;\n"; | |
1811 print SEAL "\t\t\ttriplets=false;\n"; | |
1812 print SEAL "\t\t\tinverse=true;\n"; | |
1813 print SEAL "\t\t\tCount=$ntax;\n"; | |
1814 print SEAL "\t\t\t{\n"; | |
1815 | |
1816 my $i = 0; | |
1817 foreach my $sequence (@accNum) | |
1818 { | |
1819 next if ($deletedSeq{$sequence}); | |
1820 $i++; | |
1821 print SEAL "\t\t\t\t{\n"; | |
1822 print SEAL "\t\t\t\t\tID='PSeq';\n"; | |
1823 print SEAL "\t\t\t\t\tOwner=2;\n"; | |
1824 print SEAL "\t\t\t\t\tName=\"$nameLabel{$sequence}\";\n"; | |
1825 print SEAL "\t\t\t\t\tDescription=null;\n"; | |
1826 print SEAL "\t\t\t\t\tFlags=0;\n"; | |
1827 print SEAL "\t\t\t\t\tAccession="; | |
1828 if ($sequence =~/^tAlign_/) | |
1829 { | |
1830 print SEAL "null;\n"; | |
1831 } | |
1832 else | |
1833 { | |
1834 print SEAL "$sequence;\n"; | |
1835 } | |
1836 if ($seqType eq "nucleotide") | |
1837 { | |
1838 print SEAL "\t\t\t\t\tType=\"DNA\";\n"; | |
1839 } | |
1840 else | |
1841 { | |
1842 print SEAL "\t\t\t\t\tType=\"AA\";\n"; | |
1843 } | |
1844 print SEAL "\t\t\t\t\tLength=".length($finalSeq{$sequence}).";\n"; | |
1845 print SEAL "\t\t\t\t\tSequence=\"$finalSeq{$sequence}\";\n"; | |
1846 if (defined $geneticCode{$sequence} and $geneticCode{$sequence} != 0) | |
1847 { | |
1848 print SEAL "\t\t\t\t\tGeneticCode=$gb2seal{$geneticCode{$sequence}};\n"; | |
1849 } | |
1850 else | |
1851 { | |
1852 print SEAL "\t\t\t\t\tGeneticCode=-1;\n"; # Default for Se-Al is non-coding | |
1853 } | |
1854 print SEAL "\t\t\t\t\tCodeTable=null;\n"; | |
1855 print SEAL "\t\t\t\t\tFrame=1;\n"; | |
1856 print SEAL "\t\t\t\t\tFeatures=null;\n"; | |
1857 print SEAL "\t\t\t\t\tParent=null;\n"; | |
1858 print SEAL "\t\t\t\t\tComplemented=false;\n"; | |
1859 print SEAL "\t\t\t\t\tReversed=false;\n"; | |
1860 print SEAL "\t\t\t\t}"; | |
1861 print SEAL "," unless ($i == $ntax); | |
1862 print SEAL "\n"; | |
1863 } | |
1864 | |
1865 print SEAL "\t\t\t};\n"; | |
1866 print SEAL "\t\t},\n"; | |
1867 print SEAL "\t\t{\n"; | |
1868 print SEAL "\t\t\tID='MCoL';\n"; | |
1869 print SEAL "\t\t\tOwner=1;\n"; | |
1870 print SEAL "\t\t\tName=\"Genetic Codes\";\n"; | |
1871 print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n"; | |
1872 print SEAL "\t\t\tFlags=0;\n"; | |
1873 print SEAL "\t\t\tCount=0;\n"; | |
1874 print SEAL "\t\t}\n"; | |
1875 print SEAL "\t};\n"; | |
1876 print SEAL "};\n"; | |
1877 close SEAL; | |
1878 | |
1879 if (-e "/Developer/Tools/SetFile") | |
1880 { | |
1881 system ("/Developer/Tools/SetFile -t 'TEXT' $sealOut"); | |
1882 system ("/Developer/Tools/SetFile -c 'SEAL' $sealOut"); | |
1883 } | |
1884 } | |
1885 } | |
1886 | |
1887 sub geneticCoder # Create translation tables for all genetic codes | |
1888 { | |
1889 my %geneticCode = ('1' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1890 '2' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG', | |
1891 '3' => 'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1892 '4' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1893 '5' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG', | |
1894 '6' => 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1895 '9' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG', | |
1896 '10' => 'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1897 '11' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1898 '12' => 'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1899 '13' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG', | |
1900 '14' => 'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG', | |
1901 '15' => 'FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1902 '16' => 'FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1903 '21' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG', | |
1904 '22' => 'FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', | |
1905 '23' => 'FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'); | |
1906 | |
1907 foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) | |
1908 { | |
1909 # Establish basic translation table for each genetic code | |
1910 # print "\nEstablishing \"$transTable{$code}\" genetic code ...\n" if ($debug); | |
1911 my $position = 0; | |
1912 foreach my $base1 (qw (T C A G)) | |
1913 { | |
1914 foreach my $base2 (qw (T C A G)) | |
1915 { | |
1916 foreach my $base3 (qw (T C A G)) | |
1917 { | |
1918 my $codon = $base1.$base2.$base3; | |
1919 $DNAtoAA{$code}{$codon} = substr($geneticCode{$code}, $position, 1); | |
1920 # print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug); | |
1921 $position++; | |
1922 } | |
1923 } | |
1924 } | |
1925 | |
1926 # Extend translation table to account for ambiguity codes (note: does not account for gaps) | |
1927 # print "\nExtending translation table to account for ambiguity codes ...\n" if ($debug); | |
1928 foreach my $firstPos (@ambigList) | |
1929 { | |
1930 foreach my $secondPos (@ambigList) | |
1931 { | |
1932 foreach my $thirdPos (@ambigList) | |
1933 { | |
1934 my $codon = $firstPos.$secondPos.$thirdPos; | |
1935 next if (defined $DNAtoAA{$code}{$codon}); | |
1936 my $refAA = ""; | |
1937 foreach my $firstNT (@ {$constitNTlist{$firstPos} }) | |
1938 { | |
1939 last if (defined $DNAtoAA{$code}{$codon}); | |
1940 foreach my $secondNT (@ {$constitNTlist{$secondPos} }) | |
1941 { | |
1942 last if (defined $DNAtoAA{$code}{$codon}); | |
1943 foreach my $thirdNT (@ {$constitNTlist{$thirdPos} }) | |
1944 { | |
1945 my $testCodon = $firstNT.$secondNT.$thirdNT; | |
1946 if (not $refAA) | |
1947 { | |
1948 $refAA = $DNAtoAA{$code}{$testCodon}; | |
1949 } | |
1950 else | |
1951 { | |
1952 if ($DNAtoAA{$code}{$testCodon} ne $refAA) | |
1953 { | |
1954 $DNAtoAA{$code}{$codon} = "?"; | |
1955 last; | |
1956 } | |
1957 } | |
1958 } | |
1959 } | |
1960 } | |
1961 $DNAtoAA{$code}{$codon} = $refAA if (not defined $DNAtoAA{$code}{$codon}); | |
1962 # print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug); | |
1963 } | |
1964 } | |
1965 } | |
1966 } | |
1967 return; | |
1968 } | |
1969 | |
1970 sub translate # Translate a DNA sequence to an AA sequence (note: does not account for gaps) | |
1971 { | |
1972 my $DNAseq = shift; | |
1973 my $userCode = shift; | |
1974 | |
1975 my $protSeq; | |
1976 for (my $codonStart = 0; $codonStart < length($DNAseq); $codonStart += 3) | |
1977 { | |
1978 if (length($DNAseq) - $codonStart >= 3) # Codon is complete; translate | |
1979 { | |
1980 my $codon = substr($DNAseq, $codonStart, 3); | |
1981 if ($codon =~ /-/ or $codon =~ /\./) | |
1982 { | |
1983 $protSeq .= "?"; | |
1984 } | |
1985 else | |
1986 { | |
1987 $protSeq .= $DNAtoAA{$userCode}{$codon}; | |
1988 } | |
1989 } | |
1990 else # Incomplete codon; automatically translates as ? | |
1991 { | |
1992 $protSeq .= "?"; | |
1993 } | |
1994 } | |
1995 | |
1996 return $protSeq; | |
1997 } | |
1998 | |
1999 # Version history | |
2000 # | |
2001 # v1.2 (August 12, 2010) | |
2002 # - added switch to actually allow fasta output to be specified (or not) | |
2003 # - sets TYPE and CREATOR codes for nexus files on Mac systems when | |
2004 # SetFile is present | |
2005 # - can now parse GenBank formatted output (both pure GenBank and BioEdit | |
2006 # versions) | |
2007 # - error checking: warns if same accession number used more than once | |
2008 # and skips subsequent entires | |
2009 # - added ability to: | |
2010 # - detect sequence type of input (nucleotide vs protein) and set as | |
2011 # appropriate in output files | |
2012 # - convert nucleotide input to proteins | |
2013 # - convert sequence input to haplotypes | |
2014 # - interleave nexus- and phylip-formatted output (request by Michael | |
2015 # Craige) and to use inputted value to specify line lengths in fasta | |
2016 # output | |
2017 # - output individual data partitions specified according to nexus- | |
2018 # formatted charset statements | |
2019 # - output jackknifed data sets, each missing a single taxon | |
2020 # - clean sequence labels of non-alphanumeric characters (on by | |
2021 # default) | |
2022 # - convert all ~ gap characters (e.g., from BioEdit) to - | |
2023 # - convert all ambiguous nucleotides to Ns | |
2024 # - change flnaking gaps to Ns (indirect request by Simon Creer) | |
2025 # - fixed classic phylip output such that it now conforms 100% to the | |
2026 # phylip guidelines | |
2027 # - improved parsing of MacClade generating files, including blocking out | |
2028 # MacClade and PAUP blocks when reading in nexus-formatted files | |
2029 # - fixed translation between GenBank and Se-Al genetic codes | |
2030 # - changed all instances of #nexus to #NEXUS in output files for | |
2031 # compatibility with TNT (thanks to Douglas Chester for spotting this) | |
2032 # - minor bug fixes | |
2033 # | |
2034 # v1.1 (March 2, 2006) | |
2035 # - added ability to batch convert all specified file types in working | |
2036 # directory (use -d*) | |
2037 # - updated to seqRead module 1.1.1 (includes autodetection of sequence | |
2038 # format) | |
2039 # - checks that necessary input file(s) exists before proceeding | |
2040 # - added GNU GPL statement | |
2041 # - sets TYPE and CREATOR codes for Se-Al files on Mac systems when | |
2042 # SetFile is present | |
2043 # - minor bug fixes | |
2044 # | |
2045 # v1.0 (May 30, 2005) | |
2046 # - initial release | |
2047 # | |
2048 # This program is free software; you can redistribute it and/or | |
2049 # modify it under the terms of the GNU General Public License | |
2050 # as published by the Free Software Foundation; either version 2 | |
2051 # of the License, or (at your option) any later version. | |
2052 # | |
2053 # This program is distributed in the hope that it will be useful, | |
2054 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
2055 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
2056 # GNU General Public License for more details. | |
2057 # | |
2058 # A copy of the GNU General Public License is available at | |
2059 # http://www.gnu.org/copyleft/gpl.html or by writing to | |
2060 # the Free Software Foundation, Inc., 51 Franklin Street, | |
2061 # Fifth Floor, Boston, MA, 02110-1301, USA. | |
2062 |