osiris_phylogenetics: phyloconversion/seqConverterG.pl comparison

comparison phyloconversion/seqConverterG.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories

author	osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date	Tue, 11 Mar 2014 12:19:13 -0700
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:5b9a38ec4a39
+#!/usr/bin/perl -w
+#
+# seqConverter.pl v1.2
+# Last modified August 12, 2010 21:09
+# Minor modifications for running in Galaxy by TH Oakley Jan 2012
+# (c) Olaf R.P. Bininda-Emonds
+#
+# Input:
+#   Sequence data in any of fasta, GenBank, nexus, (classic or extended) phylip,
+#	or Se-Al formats
+#
+# Output:
+#   Sequence data in any of fasta, nexus, (classic or extended) phylip, and/or
+#	Se-Al formats.
+#
+# Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]
+#		 [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]
+#		 [-u] [-v] [-h]
+# [-O]<outfile> specify outfile
+#	options: -a = print out accession numbers instead of taxon labels for nexus
+#				  and phylip output
+#			 -c<number> = global genetic code (to be used for translation unless
+#						  otherwise specified) (default = standard code; end
+#						  with * to override local settings)
+#			 -d<filename> = file containing raw sequence information; * = batch
+#							convert all specified file types in working
+#							directory (suffixes must be .fasta / .fas, .gb,
+#							.nexus / .nex, .phylip / .ph, or .seal as
+#							appropriate)
+#			 -g = do not convert ~ gap characters (e.g., from BioEdit) to -
+#			 -G = convert flanking gaps to Ns
+#			 -H = convert sequence data to haplotype data
+#            -i<f|g|n|p|s> = format of sequence file (fasta (f), GenBank (g),
+#							 nexus (n), phylip (p), or Se-Al (s)); default =
+#							 autodetect
+#            -j = produce jackknifed data sets each with a single sequence
+#				  deleted
+#            -l<number> = interleave nexus- and/or phylip-formatted output
+#						  (default = sequential) with specified sequence length
+#						  (between 10 and 100 inclusive; default = 80);
+#						  NOTE: also sets default interleave length for fasta
+#						        output
+#			 -n = convert ambiguous nucleotides to Ns
+#            -o<f|n|pc|pe|s> = output results in any or all of fasta (f), nexus
+#							   (n), classic or extended phylip (pc or pe),
+#							   and/or Se-Al (s) formats (NOTE: flag can be
+#							   invoked multiple times)
+#			 -r<a|i> = order sequences in final output alphabetically by name
+#					   (a; default) or in input order from file (i)
+#			 -s = split data set into individual partitions according to nexus
+#				  charset statements
+#            -t = translate sequences into amino acids
+#            -u = interactive user-input mode
+#            -h = print this message and quit
+#            -v = verbose output
+use strict;
+use POSIX;
+#THO added ability for Galaxy -G to specify outfile name
+my $OutFile;
+# Set user-input defaults and associated parameters
+	# Data set variables
+		my $inputType = "";	# Options are "fasta", "GenBank", "nexus", "phylip", and "Se-Al"
+		my ($readFile, @seqFiles);
+			my $dataSource;
+		my $seqType = "nucleotide";
+		my $globalGenCode = 0;
+			my $globalOverride = 0;
+		my (@accNum, %nameLabel, %sequence, %geneticCode, %accPresent);
+		my (@charsetList, %charsetStart, %charsetEnd);
+		my (%deletedSeq, %finalSeq);
+		my $seqCount;
+		my $ntax;
+		my $nameClean = 1;
+		my $skipDuplicates = 1;
+	# User input variables
+		my $seqOrder = "alphabetical";	# Options are "alphabetical" (default) and "input"
+		my $seqSplit = 0;
+		my $jackknife = 0;
+		my $gapChange = 1;
+		my $ambigChange = 0;
+		my $translateSeqs = 0;
+		my $haploTyping = 0;
+			my $haploFile;
+		my $flankGap = 0;
+	# Translation variables and genetic codes
+		my %transTable = ('1' => 'standard',
+						  '2' => 'vertebrate mitochondrial',
+						  '3' => 'yeast mitochondrial',
+						  '4' => 'mold, protozoan and colenterate mitochondrial and  mycoplasam/spiroplasma',
+						  '5' => 'invertebrate mitochondrial',
+						  '6' => 'ciliate, dasycladacean and hexamita nuclear',
+						  '9' => 'echinoderm mitochondrial',
+						  '10' => 'euplotid nuclear',
+						  '11' => 'bacterial and plant plastid',
+						  '12' => 'alternative yeast nuclear',
+						  '13' => 'ascidian mitochondrial',
+						  '14' => 'alternative flatworm mitochondrial',
+						  '15' => 'Blepharisma nuclear',
+						  '16' => 'chlorophycean mitochondrial',
+						  '21' => 'trematode mitochondrial',
+						  '22' => 'Scenedesmus obliquus mitochondrial',
+						  '23' => 'Thraustochytrium mitochondrial');
+		my %gb2seal = ('1' => '0',
+					   '2' => '1',
+					   '3' => '2',
+					   '4' => '3',
+					   '4' => '4',
+					   '5' => '5',
+					   '6' => '6',
+					   '9' => '7',
+					   '10' => '8',
+					   '11' => '9',
+					   '12' => '10',
+					   '13' => '11',
+					   '14' => '12',
+					   '15' => '13',
+					   '16' => '1',
+					   '21' => '1',
+					   '22' => '1',
+					   '23' => '1');
+		my %seal2gb = ('0' => '1',
+					   '1' => '2',
+					   '2' => '3',
+					   '3' => '4',
+					   '4' => '4',
+					   '5' => '5',
+					   '6' => '6',
+					   '7' => '9',
+					   '8' => '10',
+					   '9' => '11',
+					   '10' => '12',
+					   '11' => '13',
+					   '12' => '14',
+					   '13' => '15');
+		my %genCodes;
+			foreach my $code (qw(0 1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23))
+				{
+				$genCodes{$code} = 1;
+				}
+		my %DNAtoAA;
+		my @ambigList = qw(A C G T M R W S Y K V H D B N);
+		my %ambigCode = ('A' => 'A', 'C' => 'C', 'G' => 'G', 'T' => 'T',
+						'AC' => 'M', 'AG' => 'R', 'AT' => 'W', 'CG' => 'S', 'CT' => 'Y', 'GT' => 'K',
+						'ACG' => 'V', 'ACT' => 'H', 'AGT' => 'D', 'CGT' => 'B', 'ACGT' => 'N');
+		my (%constitNTlist);
+			while (my ($nt, $code) = each %ambigCode)	# Where $nt = key and $code = value
+				{
+				push @{$constitNTlist{$code}}, $_ foreach (split("",$nt));
+				}
+	# Output variables
+		my $maxLength;
+		my $fastaPrint = 0;
+			my $fastaOut;
+		my $nexusPrint = 0;
+			my $nexusOut;
+		my ($phylipTradPrint, $phylipExtPrint) = (0, 0);
+			my $phylipOut;
+		my $sealPrint = 0;
+			my $sealOut;
+		my $outFormat = "sequential";
+			my $interleaveLength = 80;
+			my $fastaLength = 80;
+		my $accPrint = 0;
+	# Miscellaneous variables
+		my $verbose = 0;
+		my $debug = 0;
+		my $perlScript = "seqConverterG.pl";
+		my $version = "1.2";
+# Read in user input
+	if (not @ARGV or join(' ', @ARGV) =~ /\s-u/ or $ARGV[0] =~ /^-u/)	# Enter interactive user-input mode
+		{
+		print "Entering interactive user-input mode. Type \"q\" at any prompt to exit program.\n";
+		# Get print format
+			my $defaultAcc = ($accPrint) ? "y" : "n";
+				undef $accPrint;
+			until (defined $accPrint)
+				{
+				print "\tPrint out accession numbers only to nexus- and/or phylip output (y|n) [$defaultAcc]: ";
+				$accPrint = <stdin>;
+					chomp ($accPrint);
+					exit(0) if ($accPrint eq "q");
+				if (substr($accPrint, 0, 1) =~ /^n/i or $accPrint eq "")
+					{
+					$accPrint = 0;
+					}
+				elsif (substr($accPrint, 0, 1) =~ /^y/i)
+					{
+					$accPrint = 1;
+					}
+				else
+					{
+					print "\t\tInvalid input ($accPrint)\n";
+					undef $accPrint;
+					}
+				}
+		# Get datafile
+			until (defined $readFile)
+				{
+				print "\tEnter name of data file (* = batch convert): ";
+				$readFile = <stdin>;
+					chomp ($readFile);
+					exit(0) if ($readFile eq "q");
+				unless (-e $readFile or $readFile eq "*")
+					{
+					print "\t\tFile '$readFile' does not exist\n";
+					undef $readFile;
+					}
+				}
+			push @seqFiles, $readFile;
+		# Get format of datafile
+			my $defaultInput = "autodetect";
+				undef $inputType;
+			until (defined $inputType)
+				{
+				print "\tEnter format of file $readFile (fasta|GenBank|nexus|phylip|Se-Al) [$defaultInput]: ";
+				$inputType = <stdin>;
+					chomp ($inputType);
+					exit(0) if ($inputType =~ /^q/i);
+					if (substr($inputType, 0, 1) =~ /^a/i or $inputType eq "")
+						{
+						$inputType = "autodetect";
+						}
+					elsif (substr($inputType, 0, 1) =~ /^f/i)
+						{
+						$inputType = "fasta";
+						}
+					elsif (substr($inputType, 0, 1) =~ /^g/i)
+						{
+						$inputType = "GenBank";
+						}
+					elsif (substr($inputType, 0, 1) =~ /^n/i)
+						{
+						$inputType = "nexus";
+						}
+					elsif (substr($inputType, 0, 1) =~ /^p/i)
+						{
+						$inputType = "phylip";
+						}
+					elsif (substr($inputType, 0, 1) =~ /^s/i)
+						{
+						$inputType = "Se-Al";
+						}
+					else
+						{
+						print "\t\tInvalid input ($inputType)\n";
+						undef $inputType;
+						}
+				}
+			$inputType = "" if ($inputType eq "autodetect");
+		# Get whether or not to clean sequence labels
+			my $defaultClean = ($nameClean) ? "y" : "n";
+				undef $nameClean;
+			until (defined $nameClean)
+				{
+				print "\tClean sequence labels by changing non-alphanumeric characters to underscores (y|n)? [$defaultClean]: ";
+				$nameClean = <stdin>;
+					chomp ($nameClean);
+					exit(0) if ($nameClean =~ /^q/i);
+					if (substr($nameClean, 0, 1) =~ /^y/i or $nameClean eq "")
+						{
+						$nameClean = 1;
+						}
+					elsif (substr($nameClean, 0, 1) =~ /^n/i)
+						{
+						$nameClean = 0;
+						}
+					else
+						{
+						print "\t\tInvalid input ($seqSplit)\n";
+						undef $nameClean;
+						}
+				}
+		# Get whether or not to split sequences
+			my $defaultSplit = ($seqSplit) ? "y" : "n";
+				undef $seqSplit;
+			until (defined $seqSplit)
+				{
+				print "\tSplit into individual partitions following charset statements (y|n)? [$defaultSplit]: ";
+				$seqSplit = <stdin>;
+					chomp ($seqSplit);
+					exit(0) if ($seqSplit =~ /^q/i);
+					if (substr($seqSplit, 0, 1) =~ /^n/i or $seqSplit eq "")
+						{
+						$seqSplit = 0;
+						}
+					elsif (substr($seqSplit, 0, 1) =~ /^y/i)
+						{
+						$seqSplit = 1;
+						}
+					else
+						{
+						print "\t\tInvalid input ($seqSplit)\n";
+						undef $seqSplit;
+						}
+				}
+		# Get whether or not to jackknife sequences
+			my $defaultJack = ($jackknife) ? "y" : "n";
+				undef $jackknife;
+			until (defined $jackknife)
+				{
+				print "\tProduce replicate, jackknifed data sets (y|n)? [$defaultJack]: ";
+				$jackknife = <stdin>;
+					chomp ($jackknife);
+					exit(0) if ($jackknife =~ /^q/i);
+					if (substr($jackknife, 0, 1) =~ /^n/i or $jackknife eq "")
+						{
+						$jackknife = 0;
+						}
+					elsif (substr($jackknife, 0, 1) =~ /^y/i)
+						{
+						$jackknife = 1;
+						}
+					else
+						{
+						print "\t\tInvalid input ($jackknife)\n";
+						undef $jackknife;
+						}
+				}
+		# Get whether or not to convert gaps
+			my $defaultGaps = ($gapChange) ? "y" : "n";
+				undef $gapChange;
+			until (defined $gapChange)
+				{
+				print "\tConvert ~ gap characters to - (y|n)? [$defaultGaps]: ";
+				$gapChange = <stdin>;
+					chomp ($gapChange);
+					exit(0) if ($gapChange =~ /^q/i);
+					if (substr($gapChange, 0, 1) =~ /^n/i)
+						{
+						$gapChange = 0;
+						}
+					elsif (substr($gapChange, 0, 1) =~ /^y/i or $gapChange eq "")
+						{
+						$gapChange = 1;
+						}
+					else
+						{
+						print "\t\tInvalid input ($gapChange)\n";
+						undef $gapChange;
+						}
+				}
+		# Get whether or not to convert flanking gaps
+			my $defaultFlank = ($flankGap) ? "y" : "n";
+				undef $flankGap;
+			until (defined $flankGap)
+				{
+				print "\tConvert flanking gap characters to Ns (y|n)? [$defaultFlank]: ";
+				$flankGap = <stdin>;
+					chomp ($flankGap);
+					exit(0) if ($flankGap =~ /^q/i);
+					if (substr($flankGap, 0, 1) =~ /^n/i)
+						{
+						$flankGap = 0;
+						}
+					elsif (substr($flankGap, 0, 1) =~ /^y/i or $flankGap eq "")
+						{
+						$flankGap = 1;
+						}
+					else
+						{
+						print "\t\tInvalid input ($flankGap)\n";
+						undef $flankGap;
+						}
+				}
+		# Get whether or not to convert ambiguous nucleotides
+			my $defaultAmbig = ($ambigChange) ? "y" : "n";
+				undef $ambigChange;
+			until (defined $ambigChange)
+				{
+				print "\tConvert ambiguous nucleotides to Ns (y|n)? [$defaultAmbig]: ";
+				$ambigChange = <stdin>;
+					chomp ($ambigChange);
+					exit(0) if ($ambigChange =~ /^q/i);
+					if (substr($ambigChange, 0, 1) =~ /^n/i or $ambigChange eq "")
+						{
+						$ambigChange = 0;
+						}
+					elsif (substr($ambigChange, 0, 1) =~ /^y/i)
+						{
+						$ambigChange = 1;
+						}
+					else
+						{
+						print "\t\tInvalid input ($ambigChange)\n";
+						undef $ambigChange;
+						}
+				}
+		# Get output order of sequences
+			my $defaultOrder = $seqOrder;
+				undef $seqOrder;
+			until (defined $seqOrder)
+				{
+				print "\tEnter output order for sequences (alphabetical|clustal|input file) [$defaultOrder]: ";
+				$seqOrder = <stdin>;
+					chomp ($seqOrder);
+					exit(0) if ($seqOrder =~ /^q/i);
+					if (substr($seqOrder, 0, 1) =~ /^i/i)
+						{
+						$seqOrder = "input";
+						}
+					elsif (substr($seqOrder, 0, 1) =~ /^a/i or $seqOrder eq "")
+						{
+						$seqOrder = "alphabetical";
+						}
+					else
+						{
+						print "\t\tInvalid input ($seqOrder)\n";
+						undef $seqOrder;
+						}
+				}
+		# Get whether or not to convert to haplotypes
+			my $defaultHaplo = ($haploTyping) ? "y" : "n";
+				undef $haploTyping;
+			until (defined $haploTyping)
+				{
+				print "\tConvert sequence data to haplotypes (y|n)? [$defaultHaplo]: ";
+				$haploTyping = <stdin>;
+					chomp ($haploTyping);
+					exit(0) if ($haploTyping =~ /^q/i);
+					if (substr($haploTyping, 0, 1) =~ /^n/i or $haploTyping eq "")
+						{
+						$haploTyping = 0;
+						}
+					elsif (substr($haploTyping, 0, 1) =~ /^y/i)
+						{
+						$haploTyping = 1;
+						}
+					else
+						{
+						print "\t\tInvalid input ($haploTyping)\n";
+						undef $haploTyping;
+						}
+				}
+		# Get whether or not to convert to amino acids
+			my $defaultTranslate = ($translateSeqs) ? "y" : "n";
+				undef $translateSeqs;
+			until (defined $translateSeqs)
+				{
+				print "\tConvert sequence data to amino acids (y|n)? [$defaultTranslate]: ";
+				$translateSeqs = <stdin>;
+					chomp ($translateSeqs);
+					exit(0) if ($translateSeqs =~ /^q/i);
+					if (substr($translateSeqs, 0, 1) =~ /^n/i or $translateSeqs eq "")
+						{
+						$translateSeqs = 0;
+						}
+					elsif (substr($translateSeqs, 0, 1) =~ /^y/i)
+						{
+						$translateSeqs = 1;
+						}
+					else
+						{
+						print "\t\tInvalid input ($translateSeqs)\n";
+						undef $translateSeqs;
+						}
+				}
+		# Get genetic code for translation
+			my $defaultCode = $globalGenCode;
+				$globalGenCode = 99;
+			print "\tGenetic codes available for protein translation:\n";
+				foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23))
+					{
+					print "\t\t$code: $transTable{$code}";
+					print "\n";
+					}
+			until (defined $genCodes{$globalGenCode})
+				{
+				$globalOverride = 0;
+				print "\tEnter global genetic code to be applied (0 = no translation; follow with an * to override local code) [$defaultCode]: ";
+				$globalGenCode = <stdin>;
+					chomp ($globalGenCode);
+					exit(0) if ($globalGenCode =~ /^q/i);
+					$globalGenCode = $defaultCode if ($globalGenCode eq "");
+					if ($globalGenCode =~ /\*$/)
+						{
+						$globalOverride = 1;
+						$globalGenCode =~ s/\*$//;
+						}
+					print "\t\tInvalid input ($globalGenCode)\n" unless (defined $genCodes{$globalGenCode} or $globalGenCode == 0);
+				}
+		# Get output formats
+			my $defaultFasta = ($fastaPrint) ? "y" : "n";
+				undef $fastaPrint;
+			until (defined $fastaPrint)
+				{
+				print "\tOutput results in fasta format (y|n) [$defaultFasta]: ";
+				$fastaPrint = <stdin>;
+					chomp ($fastaPrint);
+					exit(0) if ($fastaPrint =~ /^q/i);
+					if (substr($fastaPrint, 0, 1) =~ /^y/i)
+						{
+						$fastaPrint = 1;
+						}
+					elsif (substr($fastaPrint, 0, 1) =~ /^n/i or $fastaPrint eq "")
+						{
+						$fastaPrint = 0;
+						}
+					else
+						{
+						print "\t\tInvalid input ($fastaPrint)\n";
+						undef $fastaPrint;
+						}
+				}
+			my $defaultNexus = ($nexusPrint) ? "y" : "n";
+				undef $nexusPrint;
+			until (defined $nexusPrint)
+				{
+				print "\tOutput results in nexus format (y|n) [$defaultNexus]: ";
+				$nexusPrint = <stdin>;
+					chomp ($nexusPrint);
+					exit(0) if ($nexusPrint =~ /^q/i);
+					if (substr($nexusPrint, 0, 1) =~ /^y/i)
+						{
+						$nexusPrint = 1;
+						}
+					elsif (substr($nexusPrint, 0, 1) =~ /^n/i or $nexusPrint eq "")
+						{
+						$nexusPrint = 0;
+						}
+					else
+						{
+						print "\t\tInvalid input ($nexusPrint)\n";
+						undef $nexusPrint;
+						}
+				}
+			my $defaultPhylip = ($phylipTradPrint) ? "y" : "n";
+				undef $phylipTradPrint;
+			until (defined $phylipTradPrint or $phylipExtPrint)
+				{
+				print "\tOutput results in traditional phylip format (y|n) [$defaultPhylip]: ";
+				$phylipTradPrint = <stdin>;
+					chomp ($phylipTradPrint);
+					exit(0) if ($phylipTradPrint =~ /^q/i);
+					if (substr($phylipTradPrint, 0, 1) =~ /^y/i)
+						{
+						$phylipTradPrint = 1;
+						}
+					elsif (substr($phylipTradPrint, 0, 1) =~ /^n/i or $phylipTradPrint eq "")
+						{
+						$phylipTradPrint = 0;
+						}
+					else
+						{
+						print "\t\tInvalid input ($phylipTradPrint)\n";
+						undef $phylipTradPrint;
+						}
+				}
+				if ($phylipTradPrint == 0)	# Check for extended format
+					{
+					my $defaultPhylip = ($phylipExtPrint) ? "y" : "n";
+						undef $phylipExtPrint;
+					until (defined $phylipExtPrint or $phylipExtPrint)
+						{
+						print "\tOutput results in extended phylip format (y|n) [$defaultPhylip]: ";
+						$phylipExtPrint = <stdin>;
+							chomp ($phylipExtPrint);
+							exit(0) if ($phylipExtPrint =~ /^q/i);
+							if (substr($phylipExtPrint, 0, 1) =~ /^y/i)
+								{
+								$phylipExtPrint = 1;
+								}
+							elsif (substr($phylipExtPrint, 0, 1) =~ /^n/i or $phylipExtPrint eq "")
+								{
+								$phylipExtPrint = 0;
+								}
+							else
+								{
+								print "\t\tInvalid input ($phylipExtPrint)\n";
+								undef $phylipExtPrint;
+								}
+						}
+					}
+			my $defaultSeal = ($sealPrint) ? "y" : "n";
+				undef $sealPrint;
+			until (defined $sealPrint)
+				{
+				print "\tOutput results in Se-Al format (y|n) [$defaultSeal]: ";
+				$sealPrint = <stdin>;
+					chomp ($sealPrint);
+					exit(0) if ($sealPrint =~ /^q/i);
+					if (substr($sealPrint, 0, 1) =~ /^y/i)
+						{
+						$sealPrint = 1;
+						}
+					elsif (substr($sealPrint, 0, 1) =~ /^n/i or $sealPrint eq "")
+						{
+						$sealPrint = 0;
+						}
+					else
+						{
+						print "\t\tInvalid input ($sealPrint)\n";
+						undef $sealPrint;
+						}
+				}
+			if ($nexusPrint or $phylipTradPrint or $phylipExtPrint)
+				{
+				my $defaultInterleave = ($outFormat) ? "y" : "n";
+					undef $outFormat;
+				until (defined $outFormat)
+					{
+					print "\tInterleave output for nexus- and/or phylip-formatted output with length (n|10-100) [$defaultInterleave]: ";
+					$outFormat = <stdin>;
+						chomp ($outFormat);
+						exit(0) if ($outFormat =~ /^q/i);
+						if ($outFormat =~ /(\d+)/)
+							{
+							if ($outFormat =~ /\D/ or $outFormat < 10 or $outFormat > 100)
+								{
+								print "\t\tInvalid input ($outFormat)\n";
+								undef $outFormat;
+								}
+							else
+								{
+								$outFormat = $1;
+								}
+							}
+						elsif (substr($outFormat, 0, 1) =~ /^n/i or $outFormat eq "")
+							{
+							$outFormat = "sequential";
+							}
+						else
+							{
+							print "\t\tInvalid input ($outFormat)\n";
+							undef $outFormat;
+							}
+					}
+				if ($outFormat ne "sequential")
+					{
+					$interleaveLength = $outFormat;
+					$outFormat = "interleave";
+					}
+				}
+		# Get verbose output mode
+			my $defaultVerbose = ($verbose) ? "y" : "n";
+				undef $verbose;
+			until (defined $verbose)
+				{
+				print "\tOutput verbose results to screen (y|n) [$defaultVerbose]: ";
+				$verbose = <stdin>;
+					chomp ($verbose);
+					exit(0) if ($verbose =~ /^q/i);
+					if (substr($verbose, 0, 1) =~ /^y/i)
+						{
+						$verbose = 1;
+						print "\n";
+						}
+					elsif (substr($verbose, 0, 1) =~ /^n/i or $verbose eq "")
+						{
+						$verbose = 0;
+						}
+					elsif (substr($verbose, 0, 1) =~ /^x/i or $verbose eq "")
+						{
+						$verbose = $debug = 1;
+						}
+					else
+						{
+						print "\t\tInvalid input ($verbose)\n";
+						undef $verbose;
+						}
+				}
+		}
+	elsif (join(' ', @ARGV) =~ /\s-h/ or $ARGV[0] =~ /^-h/)	# Print help screen
+		{
+		print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n";
+		print "       [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]\n";
+		print "       [-u] [-v] [-h]\n";
+		print "Version: $version\n";
+		print "Options: -a = print out accession numbers instead of taxon labels for nexus\n";
+		print "              and phylip output\n";
+		print "         -c<number> = global genetic code (to be used for translation unless\n";
+		print "                      otherwise specified) (default = standard code; end\n";
+		print "                      with * to override local settings)\n";
+			foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23))
+				{
+				print "                      $code: $transTable{$code}";
+				print "\n";
+				}
+		print "         -d<filename> = file containing raw sequence information; * = batch\n";
+		print "                        convert all specified file types in working\n";
+		print "                        directory (suffixes must be .fasta / .fas, .gb\n";
+		print "                        .nexus / .nex, .phylip / .ph, or .seal as\n";
+		print "                        appropriate)\n";
+		print "         -g = do not convert ~ gap characters (e.g., from BioEdit) to -\n";
+		print "         -G = convert flanking gaps to Ns\n";
+		print "         -H = convert sequence data to haplotype data\n";
+		print "         -i<f|g|n|p|s> = format of sequence file (fasta (f), GenBank (g),\n";
+		print "                         nexus (n), phylip (p), or Se-Al (s)); default =\n";
+		print "                         autodetect\n";
+		print "         -j = produce jackknifed data sets each with a single sequence\n";
+		print "              deleted\n";
+		print "         -l<number> = interleave nexus- and/or phylip-formatted output\n";
+		print "                      (default = sequential) with specified sequence length\n";
+		print "                      (between 10 and 100 inclusive; default = 80);\n";
+		print "                      NOTE: also sets default interleave length for fasta\n";
+		print "                            output\n";
+		print "         -n = convert ambiguous nucleotides to Ns\n";
+		print "         -o<f|n|pc|pe|s> = output results in any or all of fasta (f), nexus\n";
+		print "                           (n), classic or extended phylip (pc or pe),\n";
+		print "                           and/or Se-Al (s) formats (NOTE: flag can be\n";
+		print "                           invoked multiple times)\n";
+		print "         -r<a|i> = order sequences in final output alphabetically by name\n";
+		print "                   (a; default) or in input order from file (i)\n";
+		print "         -s = split data set into individual partitions according to nexus\n";
+		print "              charset statements\n";
+		print "         -t = translate sequences into amino acids\n";
+		print "         -u = interactive user-input mode\n";
+		print "         -h = print this message and quit\n";
+		print "         -v = verbose output\n";
+		exit(0);
+		}
+	else	# Process switches
+		{
+		for (my $i = 0; $i <= $#ARGV; $i++)
+			{
+			if ($ARGV[$i] eq "-a")
+				{
+				$accPrint = 1;
+				}
+			elsif ($ARGV[$i] =~ /-c(\d+)/)
+				{
+				$globalGenCode = $1;
+				if ($globalGenCode != 0 and not defined $genCodes{$globalGenCode})
+					{
+					print "Don't understand argument: $ARGV[$i]\n";
+					print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n";
+					print "       [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t] [-u]\n";
+					print "       [-v] [-h]\n";
+					print "Version: $version\n";
+					exit(1);
+					}
+				$globalOverride = 1 if ($ARGV[$i] =~ /\*$/);
+				}
+			elsif ($ARGV[$i] =~ /^-d(.*)/)
+				{
+				$readFile = $1;
+				unless ($readFile eq "*")
+					{
+					die "ERROR: Data file $readFile does not exist.\n" unless (-e $readFile);
+					}
+				push @seqFiles, $readFile;
+				}
+			elsif ($ARGV[$i] eq "-g")
+				{
+				$gapChange = 0;
+				}
+			elsif ($ARGV[$i] eq "-G")
+				{
+				$flankGap = 1;
+				}
+			elsif ($ARGV[$i] eq "-H")
+				{
+				$haploTyping = 1;
+				}
+			elsif ($ARGV[$i] eq "-if")
+				{
+				$inputType = "fasta";
+				}
+			elsif ($ARGV[$i] eq "-ig")
+				{
+				$inputType = "GenBank";
+				}
+			elsif ($ARGV[$i] eq "-in")
+				{
+				$inputType = "nexus";
+				}
+			elsif ($ARGV[$i] eq "-ip")
+				{
+				$inputType = "phylip";
+				}
+			elsif ($ARGV[$i] eq "-is")
+				{
+				$inputType = "Se-Al";
+				}
+			elsif ($ARGV[$i] eq "-j")
+				{
+				$jackknife = 1;
+				}
+			elsif ($ARGV[$i] eq "-k")
+				{
+				$nameClean = 0;
+				}
+			elsif ($ARGV[$i] =~ /-l(\d+)?/)
+				{
+				$interleaveLength = $1 if (defined $1);
+				$outFormat = "interleave";
+				}
+			elsif ($ARGV[$i] eq "-n")
+				{
+				$ambigChange = 1;
+				}
+			elsif ($ARGV[$i] eq "-of")
+				{
+				$fastaPrint = 1;
+				}
+			elsif ($ARGV[$i] eq "-on")
+				{
+				$nexusPrint = 1;
+				}
+			elsif ($ARGV[$i] eq "-opc")
+				{
+				$phylipTradPrint = 1;
+				}
+			elsif ($ARGV[$i] eq "-ope")
+				{
+				$phylipExtPrint = 1;
+				}
+			elsif ($ARGV[$i] eq "-os")
+				{
+				$sealPrint = 1;
+				}
+			elsif ($ARGV[$i] eq "-ra")
+				{
+				$seqOrder = "alphabetical";
+				}
+			elsif ($ARGV[$i] eq "-ri")
+				{
+				$seqOrder = "input";
+				}
+			elsif ($ARGV[$i] eq "-s")
+				{
+				$seqSplit = 1;
+				}
+			elsif ($ARGV[$i] eq "-t")
+				{
+				$translateSeqs = 1;
+				$globalGenCode = 1 unless ($globalGenCode);
+				}
+			elsif ($ARGV[$i] eq "-v")
+				{
+				$verbose = 1;
+				}
+			elsif ($ARGV[$i] eq "-x")
+				{
+				$debug = 1;
+				$verbose = 1;
+				}
+			elsif ($ARGV[$i] =~ /^-O(.*)/)
+				{
+				$OutFile = $1;
+				}
+			else
+				{
+				print "Don't understand argument: $ARGV[$i]\n";
+			print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n";
+			print "       [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]\n";
+			print "       [-u] [-v] [-h]\n";
+				print "Version: $version\n";
+				exit(1);
+				}
+			}
+		}
+	$accPrint = 0 if ($accPrint eq "n");
+	geneticCoder() if ($globalGenCode);
+	$fastaLength = $interleaveLength;
+# Check for I/O errors
+	die "ERROR: Must supply name of file containing sequence data.\n" if (not @seqFiles);
+	die "ERROR: Must supply at least one output format.\n" unless ($fastaPrint or $nexusPrint or $phylipTradPrint or $phylipExtPrint or $sealPrint);
+	die "ERROR: Sequence length for interleaved format must be between 10 and 100 inclusive.\n" if (($fastaPrint or (($nexusPrint or $phylipTradPrint or $phylipExtPrint) and $outFormat eq "interleave")) and ($interleaveLength < 10 or $interleaveLength > 100));
+# Read in sequence data
+	if ($seqFiles[0] eq "*")	# Batch convert all appropriate files in working directory
+		{
+		if ($inputType)
+			{
+			undef @seqFiles;
+			system("ls -d * > convertList.txt");
+				setLineBreak("convertList.txt");
+				open (LIST, "<convertList.txt") or die "Cannot open file containing names of all sequence files, convertList.txt\n";
+					while (<LIST>)
+						{
+						chomp;
+						next unless ($_);
+						push @seqFiles, $_ if ($inputType eq "Se-Al" and $_ =~ /\.seal$/);
+						push @seqFiles, $_ if ($inputType eq "phylip" and ($_ =~ /\.phylip$/ or $_ =~ /\.ph$/));
+						push @seqFiles, $_ if ($inputType eq "nexus" and ($_ =~ /\.nexus$/ or $_ =~ /\.nex$/));
+						push @seqFiles, $_ if ($inputType eq "GenBank" and ($_ =~ /\.gb$/));
+						push @seqFiles, $_ if ($inputType eq "fasta" and ($_ =~ /\.fasta$/ or $_ =~ /\.fas$/));
+						}
+				close LIST;
+			unlink ("convertList.txt") unless ($debug);
+			die "ERROR: No files of type $inputType found for batch conversion.\n" if (not @seqFiles);
+			}
+		else
+			{
+			die "ERROR: Must specify input file type for batch conversion\n";
+			}
+		}
+	foreach my $seqFile (@seqFiles)
+		{
+		print "\nConverting file $seqFile ...\n";
+		# Set output file names
+			$dataSource = $seqFile;
+				$dataSource =~ s/\.\w+$//;
+			if ($fastaPrint)
+				{
+				$fastaOut = $dataSource . ".fasta";
+					$fastaOut =~ s/\.fasta$/_new.fasta/ if ($fastaOut eq $seqFile);
+					$fastaOut =~ s/\.fasta$/_haplo.fasta/ if ($haploTyping);
+				}
+			if ($nexusPrint)
+				{
+#				$nexusOut = $dataSource . ".nex";
+# Fixed Output File name much easier for Galaxy
+				$nexusOut = $OutFile;
+					$nexusOut =~ s/\.nex$/_new.nex/ if ($nexusOut eq $seqFile);
+					$nexusOut =~ s/\.nex$/_haplo.nex/ if ($haploTyping);
+				}
+			if ($phylipTradPrint or $phylipExtPrint)
+				{
+#				$phylipOut = $dataSource . ".phylip";
+# Fixed Output File name much easier for Galaxy
+				$phylipOut = $OutFile;
+					$phylipOut =~ s/\.phylip$/_new.phylip/ if ($phylipOut eq $seqFile);
+					$phylipOut =~ s/\.phylip$/_haplo.phylip/ if ($haploTyping);
+				}
+			if ($sealPrint)
+				{
+				$sealOut = $dataSource . ".seal";
+					$sealOut =~ s/\.seal$/_new.seal/ if ($sealOut eq $seqFile);
+					$sealOut =~ s/\.seal$/_haplo.seal/ if ($haploTyping);
+				}
+			$haploFile = $dataSource . "_haplotypeSeqs.txt";
+			$phylipExtPrint = 0 if ($phylipTradPrint);
+		# Read in sequence data
+			# Clear variables
+				undef @accNum;
+				undef %nameLabel;
+				undef %sequence;
+				undef %geneticCode;
+				undef %accPresent;
+				undef %deletedSeq;
+				undef %finalSeq;
+				undef $seqCount;
+				undef $ntax;
+				undef @charsetList;
+					undef %charsetStart;
+					undef %charsetEnd;
+				my (@haploList, %haploID, %haploSeqs);
+				$maxLength = 0;
+			seqRead($seqFile);
+			if (not @accNum)
+				{
+				print "\tERROR: Could not read in sequences from file $seqFile; skipping to next file\n";
+				next;
+				}
+		# Process for printing
+			my $stopCodonCount;
+			$seqType = "protein" if ($globalGenCode and $translateSeqs);
+			foreach my $seq (@accNum)
+				{
+				$sequence{$seq} =~ s/\~/-/g if ($gapChange);
+				$sequence{$seq} =~ s/R|Y|M|K|S|W|H|B|V|D/N/ig if ($ambigChange and $seqType eq "nucleotide");
+				$finalSeq{$seq} = $sequence{$seq};
+					$geneticCode{$seq} = $globalGenCode if ($globalOverride);
+					if ($globalGenCode and $translateSeqs)
+						{
+						$finalSeq{$seq} = translate($sequence{$seq}, $geneticCode{$seq});
+						$stopCodonCount++ if (($finalSeq{$seq} =~ tr/\*//) > 2);	# Check for stop codons
+						}
+				$maxLength = length($finalSeq{$seq}) if (length($finalSeq{$seq}) > $maxLength);
+				$nameLabel{$seq} =~ s/\W+/_/g if ($nameClean);	# Clean sequence labels of non-alphanumerics
+				}
+			printf "\n\tWARNING: $stopCodonCount of %s sequences had more than two stop codons; check for\n\t\t1) proper reading frame,\n\t\t2) proper genetic code, or\n\t\t3) that sequences are coding DNA\n", scalar(@accNum) if ($globalGenCode and $stopCodonCount);
+		# Add gaps to end of any sequence less than maximum length
+			foreach my $seq (@accNum)
+				{
+				$finalSeq{$seq} .= "-" x ($maxLength - length($sequence{$seq}));
+					$sequence{$seq} = $finalSeq{$seq};
+				}
+		# Convert flanking gaps to Ns if desired
+			if ($flankGap)
+				{
+				foreach my $seq (@accNum)
+					{
+					if ($finalSeq{$seq} =~ /^(\-+)/)
+						{
+						my $startGap = $1;
+							my $startN = "N" x length($startGap);
+						$finalSeq{$seq} =~s/^$startGap/$startN/;
+						}
+					if ($finalSeq{$seq} =~ /(\-+)$/)
+						{
+						my $endGap = $1;
+							my $endN = "N" x length($endGap);
+						$finalSeq{$seq} =~s/$endGap$/$endN/;
+						}
+					}
+				}
+		# Determine haplotypes as needed
+			if ($haploTyping)
+				{
+				foreach my $entry (@accNum)
+					{
+					$haploID{$entry} = 0;
+					foreach my $haploType (@haploList)
+						{
+						if ($finalSeq{$entry} eq $finalSeq{$haploType})	# Matches existing haplotype; add to list
+							{
+							$haploID{$entry} = $haploType;
+							push @{$haploSeqs{$haploType}}, $entry;
+							last;
+							}
+						}
+					if (not $haploID{$entry})	# No match to existing haplotype; define new haplotype
+						{
+						$haploID{$entry} = "haplo" . (scalar(@haploList) + 1);
+						push @haploList, $haploID{$entry};
+						$nameLabel{$haploID{$entry}} = $haploID{$entry};
+						$finalSeq{$haploID{$entry}} = $finalSeq{$entry};
+						push @{$haploSeqs{$haploID{$entry}}}, $entry;
+						}
+					}
+				undef @accNum;
+					@accNum = @haploList;
+				open (HAPLO, ">$haploFile") or die "Cannot print to file $haploFile\n";
+					foreach my $haploType (@haploList)
+						{
+						print HAPLO "$haploType:";
+						print HAPLO "\t$nameLabel{$_}" foreach @{$haploSeqs{$haploType}};
+						print HAPLO "\n";
+						}
+				close HAPLO;
+				}
+		# Print results!
+			print "\nPrinting results ...\n";
+			@accNum = sort { $nameLabel{$a} cmp $nameLabel{$b} } keys %nameLabel if ($seqOrder eq "alphabetical" and not $haploTyping);
+			# Print full data set
+				$ntax = scalar(@accNum);
+					seqPrint($seqFile);
+			# Print jackknifed data sets
+				if ($jackknife)
+					{
+					my $delseqCount = 0;
+					foreach my $seq (@accNum)
+						{
+						$deletedSeq{$seq} = 1;
+							$delseqCount++;
+						# Change output file names
+							$fastaOut = $dataSource . "_jack$delseqCount.fasta";
+							$nexusOut = $dataSource . "_jack$delseqCount.nex";
+							$phylipOut = $dataSource . "_jack$delseqCount.phylip";
+							$sealOut = $dataSource . "_jack$delseqCount.seal";
+						$ntax = scalar(@accNum) - 1;
+						seqPrint($seqFile);
+						$deletedSeq{$seq} = 0;	# Reset deleted sequence
+						}
+					}
+			# Print data set partitions
+				if (@charsetList and $seqSplit)
+					{
+					my $delCount = 0;
+					foreach my $partition (@charsetList)
+						{
+						$delCount = 0;
+						# Change output file names
+							$fastaOut = $dataSource . "_$partition.fasta";
+							$nexusOut = $dataSource . "_$partition.nex";
+							$phylipOut = $dataSource . "_$partition.phylip";
+							$sealOut = $dataSource . "_$partition.seal";
+						# Restrict sequence to partition limits
+							foreach my $seq (@accNum)
+								{
+								$deletedSeq{$seq} = 0;	# Reset all deleted sequences
+								$finalSeq{$seq} = substr($sequence{$seq}, $charsetStart{$partition} - 1, $charsetEnd{$partition} - $charsetStart{$partition} + 1);
+								# Check that sequence remains informative
+									unless ($finalSeq{$seq} =~ /a/i or $finalSeq{$seq} =~ /c/i or $finalSeq{$seq} =~ /g/i or $finalSeq{$seq} =~ /t/i)
+										{
+										$delCount++;
+										$deletedSeq{$seq} = 1;
+										}
+								}
+						$ntax = scalar(@accNum) - $delCount;
+						$maxLength = $charsetEnd{$partition} - $charsetStart{$partition} + 1;
+						seqPrint($seqFile);
+						}
+					}
+		}
+exit(0);
+### Subroutines used in the program
+sub setLineBreak	# Check line breaks of input files and set input record separator accordingly
+	{
+	my $inFile = shift;
+	$/ ="\n";
+	open (IN, "<$inFile") or die "Cannot open $inFile to check form of line breaks.\n";
+		while (<IN>)
+			{
+			if ($_ =~ /\r\n/)
+				{
+				print "\tDOS line breaks detected ...\n" if ($verbose);
+				$/ ="\r\n";
+				last;
+				}
+			elsif ($_ =~ /\r/)
+				{
+				print "\tMac line breaks detected ...\n" if ($verbose);
+				$/ ="\r";
+				last;
+				}
+			else
+				{
+				print "\tUnix line breaks detected ...\n" if ($verbose);
+				$/ ="\n";
+				last;
+				}
+			}
+	close IN;
+	}
+sub seqRead
+	{
+	my $seqFile = shift;
+	undef %sequence;
+	print "\nReading in sequence data from file $seqFile (type is $inputType) ...\n" if ($inputType);
+	setLineBreak($seqFile);
+	open (SEQ, "<$seqFile") or die "Cannot open file containing sequences, $seqFile\n";
+		my ($header, $tempAcc, $tempName, $tempSeq);
+		my $fastaAcc;
+		my (%nexusSpecies, %nexusAcc, $nexusRead, $commentFlag);
+		my ($phylipLineCount, $phylipTaxa, $phylipChars, %phylipSeq);
+		my $sealCode;
+		my ($sealDelFlag, $owner) = (0, 0);
+		my ($gbAcc, $gbRead);
+		my $macBlock = 0;
+		my %accCount;
+		while (<SEQ>)
+			{
+			chomp;
+			my $lineRead = $_;
+			next unless ($lineRead);
+			# Autodetect sequence format
+				if (not $inputType)
+					{
+					$inputType = "fasta" if ($lineRead =~ /^>/);
+					$inputType = "nexus" if ($lineRead =~ /\#nexus/i);
+					$inputType = "phylip" if ($lineRead =~ /^\s*\d+\s+\d+/);
+					$inputType = "Se-Al" if ($lineRead =~ /^\s*Database=\{/i);
+					$inputType = "GenBank" if ($lineRead =~ /^\s*LOCUS/);
+					print "\nReading in sequence data from file $seqFile (type determined to be $inputType) ...\n" if ($inputType);
+					}
+			if ($inputType eq "nexus")
+				{
+				# Check if charset statement present
+					if ($lineRead =~ /charset/i)
+						{
+						$lineRead =~ s/\s+//g;
+						$lineRead =~ s/;$//;
+						my ($charsetName, $charsetBounds) = split('=', $lineRead);
+							$charsetName =~ s/charset//i;
+								push @charsetList, $charsetName;
+							if ($charsetBounds =~ /(\d+)-*(\d*)/)
+								{
+								$charsetStart{$charsetName} = $1;
+								if ($2)
+									{
+									$charsetEnd{$charsetName} = $2;
+									}
+								else
+									{
+									$charsetEnd{$charsetName} = $1;
+									}
+								}
+						}
+				# Otherwise block out MacClade / PAUP blocks
+					$macBlock = 1 if ($lineRead =~ /begin macclade;/i or $lineRead =~ /begin paup;/i);
+					$macBlock = 0 if ($macBlock and $lineRead =~ /end;/i);
+					next if ($macBlock);
+				# Otherwise only read in data lines or charset statements
+					if ($lineRead =~ /^\s*matrix/i)
+						{
+						$nexusRead = 1;
+						next;
+						}
+					$nexusRead = 0 if ($lineRead =~ /;\s*$/);
+					next unless ($nexusRead);
+					# Remove MacClade sequence lengths
+						$lineRead =~ s/\[\d+\]$//;
+					$commentFlag = 1 if ($lineRead =~ /\[/);
+					if ($lineRead =~ /\]/)
+						{
+						$commentFlag = 0;
+						next;
+						}
+					next if ($commentFlag);
+					next unless ($lineRead =~ /a/i or $lineRead =~ /c/i or $lineRead =~ /g/i or $lineRead =~ /t/i or $lineRead =~ /n/i or $lineRead =~ /\?/ or $lineRead =~ /\-/);
+				# Clean up input line
+					$lineRead =~ s/^\s+//;
+					$lineRead =~ s/\'//g;
+				my @nexusLine = split(/\s+/, $lineRead);
+					my $species = shift(@nexusLine);
+						$species =~ s/\s+/_/g;
+						$species =~ s/\_+/_/g;
+					my $seq = join('', @nexusLine);
+						$seq =~ s/\s+//g;
+						$seqType = "protein" if ($seq =~ /E/i or $seq =~ /Q/i or $seq =~ /I/i or $seq =~ /L/i or $seq =~ /F/i or $seq =~ /P/i);
+				if (not defined $nexusSpecies{$species})
+					{
+					$nexusSpecies{$species} = 1;
+					$seqCount++;
+					$nexusAcc{$species} = "tAlign_".$seqCount;
+					push @accNum, $nexusAcc{$species};
+						$nameLabel{$nexusAcc{$species}} = $species;
+						$sequence{$nexusAcc{$species}} = uc($seq);
+						$geneticCode{$nexusAcc{$species}} = $globalGenCode;
+					}
+				else	# Sequences are in interleaved format; append sequence
+					{
+					$sequence{$nexusAcc{$species}} .= uc($seq);
+					}
+				}
+			if ($inputType eq "fasta")
+				{
+				if ($lineRead =~/^\s*>/)
+					{
+					my $species;
+					$seqCount++;
+					(my $tempSpecies = $lineRead) =~ s/^\s*>//;
+						if ($tempSpecies =~ /^Mit\.\s+/)	# Entry comes from European RNA project
+							{
+							$tempSpecies =~ s/^Mit\.\s+//i;	# To fix entries from European RNA project
+							my @speciesInfo = split(/\s+/, $tempSpecies);
+								$species = join('_', $speciesInfo[0], $speciesInfo[1]);
+							if (defined $speciesInfo[2])
+								{
+								$fastaAcc = $speciesInfo[2];
+								$accCount{$fastaAcc}++;
+									if ($accCount{$fastaAcc} > 1)
+										{
+										print "\nWARNING: Accession number $fastaAcc used more than once";
+											print "; skipping this and subsequent entries" if ($skipDuplicates);
+										print "\n";
+										}
+								}
+							else
+								{
+								$fastaAcc = "tAlign_".$seqCount;
+								$accCount{$fastaAcc}++;
+								}
+							}
+						else
+							{
+							my @speciesLine = split(/\s+/, $tempSpecies);
+							if ($speciesLine[$#speciesLine] =~ /^\(?\w+\d+\)?$/ and scalar(@speciesLine) > 1)	# Check whether last entry is an accession number
+								{
+								$fastaAcc = pop (@speciesLine);
+								$fastaAcc =~ s/^\(//g;
+								$fastaAcc =~ s/\)$//g;
+								$accCount{$fastaAcc}++;
+									if ($accCount{$fastaAcc} > 1)
+										{
+										print "\nWARNING: Accession number $fastaAcc used more than once";
+											if ($skipDuplicates)
+												{
+												print "; skipping this and subsequent entries\n";
+												}
+											else
+												{
+												print "; assigning temporary accession number\n";
+												$fastaAcc = "tAlign_" . $seqCount;
+												}
+										}
+								}
+							else
+								{
+								$fastaAcc = "tAlign_".$seqCount;
+								$accCount{$fastaAcc}++;
+								}
+							$species = join('_', @speciesLine);
+								$species = "Sequence_".$seqCount if ($species eq "");
+							}
+					push @accNum, $fastaAcc unless ($accCount{$fastaAcc} > 1 and $skipDuplicates);
+						$geneticCode{$fastaAcc} = $globalGenCode;
+					($nameLabel{$fastaAcc} = $species) =~ s/\s+/_/;
+						$nameLabel{$fastaAcc} =~ s/\_+/_/;
+					}
+				else
+					{
+					next if ($accCount{$fastaAcc} > 1 and $skipDuplicates);
+					$lineRead =~ s/\s+//g;
+						$seqType = "protein" if ($lineRead =~ /E/i or $lineRead =~ /Q/i or $lineRead =~ /I/i or $lineRead =~ /L/i or $lineRead =~ /F/i or $lineRead =~ /P/i);
+					$sequence{$fastaAcc} .= uc($lineRead);
+					}
+				}
+			if ($inputType eq "Se-Al")
+				{
+				my $header;
+				$sealDelFlag = 1 if ($lineRead =~/MCoL/);	# Se-Al sometimes places deleted species at end of file; do not read in remainder of file
+					next if ($sealDelFlag == 1);
+				next unless ($lineRead =~/NumSites/i or $lineRead =~/Owner/i or $lineRead =~/Name/i or $lineRead =~/Accession/i or $lineRead =~/Sequence/i or $lineRead =~/GeneticCode/i or $lineRead =~/Frame/i);
+				if ($lineRead =~/Owner\s*\=\s*(\d+)/i)
+					{
+					$owner = $1;
+					}
+				if ($lineRead =~/Accession/i and $owner == 2)
+					{
+					$seqCount++;
+					if ($lineRead =~ /null/ or $lineRead =~ /\"\"/)
+						{
+						$tempAcc = "tAlign_" . $seqCount;
+						$accCount{$tempAcc}++;
+						}
+					else
+						{
+						($header, $tempAcc) = split (/=/, $lineRead);
+							$tempAcc =~ s/\"//g;
+							$tempAcc =~ s/;//g;
+							$accCount{$tempAcc}++;
+								if ($accCount{$tempAcc} > 1)
+									{
+									print "\nWARNING: Accession number $fastaAcc used more than once";
+										if ($skipDuplicates)
+											{
+											print "; skipping this and subsequent entries\n";
+											}
+										else
+											{
+											print "; assigning temporary accession number\n";
+											$tempAcc = "tAlign_" . $seqCount;
+											}
+									print "\n";
+									}
+						}
+					push @accNum, $tempAcc unless ($accCount{$tempAcc} > 1 and $skipDuplicates);
+					}
+				if ($lineRead =~/Name/i and $owner == 2)
+					{
+					($header, $tempName) = split (/=/, $lineRead);
+						$tempName =~ s/\"//g;
+						$tempName =~ s/\s*;//g;
+						$tempName =~ s/\s+/_/g;
+						$tempName =~ s/\_+/_/g;
+					}
+				if ($lineRead =~/GeneticCode=(\d)/i and $owner == 2)
+					{
+					$sealCode = $1;
+					$geneticCode{$tempAcc} = $seal2gb{$sealCode};
+					}
+				if ($lineRead =~/Sequence/i and $owner == 2)
+					{
+					next if ($accCount{$tempAcc} > 1 and $skipDuplicates);
+					($header, $tempSeq) = split (/=/, $lineRead);
+						$tempSeq =~ s/\"//g;
+						$tempSeq =~ s/;//g;
+						$tempSeq =~ s/\s+//g;
+					$nameLabel{$tempAcc} = $tempName;
+					$sequence{$tempAcc} = uc($tempSeq);
+						$seqType = "protein" if ($tempSeq =~ /E/i or $tempSeq =~ /Q/i or $tempSeq =~ /I/i or $tempSeq =~ /L/i or $tempSeq =~ /F/i or $tempSeq =~ /P/i);
+					}
+				if ($lineRead =~/Frame=(\d)/i)	# Correct for reading frame
+					{
+					my $readingFrame = $1;
+					$sequence{$tempAcc} = "--" . $sequence{$tempAcc} if ($readingFrame == 2);
+					$sequence{$tempAcc} = "-" . $sequence{$tempAcc} if ($readingFrame == 3);
+					}
+				}
+			if ($inputType eq "phylip")
+				{
+				if ($lineRead =~ /^\s*(\d+)\s+(\d+)/)
+					{
+					$phylipTaxa = $1;
+					$phylipChars = $2;
+					$phylipLineCount = 0;
+					}
+				else
+					{
+					$phylipLineCount++;
+					$lineRead =~ s/\s//g;
+					$phylipSeq{$phylipLineCount} .= $lineRead;
+					$phylipLineCount = 0 if ($phylipLineCount == $phylipTaxa);
+					}
+				}
+			if ($inputType eq "GenBank")
+				{
+				# Get species name and accession number
+					# Pure GenBank format
+						$gbAcc = $1 if ($lineRead =~ /^\s*ACCESSION\s+(\w+\d+)/);
+						if ($lineRead =~ /^\s*ORGANISM\s+/)
+							{
+							$seqCount++;
+								$gbAcc = "tAlign_" . $seqCount if (not defined $gbAcc);
+							$lineRead =~ s/^\s+//;
+							my @orgLine = split(/\s+/, $lineRead);
+								my $header = shift (@orgLine);
+								$nameLabel{$gbAcc} = join('_', @orgLine);
+								$accCount{$gbAcc}++;
+							}
+					# BioEdit format
+						if ($lineRead =~ /^\s*TITLE/ and not defined ($gbAcc))
+							{
+							$seqCount++;
+								$gbAcc = "tAlign_" . $seqCount;
+							my @accLine = split (/\s+/, $lineRead);
+								$gbAcc = $1 if ($accLine[2] =~ /^\((\w+\d+)\)/);
+							$nameLabel{$gbAcc} = $accLine[1];
+							$accCount{$gbAcc}++;
+							}
+				if ($lineRead =~ /^\s*ORIGIN/)
+					{
+					if ($accCount{$gbAcc} > 1)
+						{
+						print "\nWARNING: Accession number $gbAcc used more than once";
+						if ($skipDuplicates)
+							{
+							print "; skipping this and subsequent entries\n";
+							next;
+							}
+						else
+							{
+							print "; assigning temporary accession number\n";
+							$gbAcc = "tAlign_" . $seqCount;
+							$gbRead = 1;
+							}
+						}
+					else
+						{
+						$gbRead = 1;
+						$seqCount++;
+						}
+					next;
+					}
+				if ($lineRead =~ /^\s*\/\//)	# End of accession; process
+					{
+					$gbRead = 0;
+					next unless ($gbAcc);
+					push @accNum, $gbAcc unless ($accCount{$gbAcc} > 1);
+					$geneticCode{$gbAcc} = $globalGenCode;
+					$nameLabel{$gbAcc} =~ s/\s+/_/g;
+						$nameLabel{$gbAcc} =~ s/\_+/_/g;
+					$sequence{$gbAcc} =~ s/\d//g;
+						$sequence{$gbAcc} =~ s/\s+//g;
+						$sequence{$gbAcc} =~ s/\~/-/g if ($gapChange);
+					undef $gbAcc;
+					}
+				next unless ($gbRead);
+				if ($gbRead and $lineRead =~ /^\s+\d+/)
+					{
+					$sequence{$gbAcc} .= uc($lineRead);
+					}
+				}
+			}
+	close SEQ;
+	if ($inputType eq "phylip")	# Postprocess input to derive taxon names and sequence; accounts for both sequential and extended formatting
+		{
+		for (my $i = 1; $i <= $phylipTaxa; $i++)
+			{
+			my $phylipAcc = "tAlign_" . $i;
+			push @accNum, $phylipAcc;
+			$geneticCode{$phylipAcc} = $globalGenCode;
+			# Derive taxon name and sequence
+				$sequence{$phylipAcc} = uc(substr($phylipSeq{$i}, 0 - $phylipChars));
+					$seqType = "protein" if ($sequence{$phylipAcc} =~ /E/i or $sequence{$phylipAcc} =~ /Q/i or $sequence{$phylipAcc} =~ /I/i or $sequence{$phylipAcc} =~ /L/i or $sequence{$phylipAcc} =~ /F/i or $sequence{$phylipAcc} =~ /P/i);
+				$nameLabel{$phylipAcc} = substr($phylipSeq{$i}, 0, length($phylipSeq{$i}) - $phylipChars);
+					$nameLabel{$phylipAcc} =~ s/\s+/_/g;
+					$nameLabel{$phylipAcc} =~ s/\_+/_/g;
+			}
+		}
+	}
+sub seqPrint
+	{
+	my $seqFile = shift;
+	$interleaveLength = $maxLength if ($outFormat eq "sequential");
+	# Print fasta-formatted file
+		if ($fastaPrint)
+			{
+			print "\tWriting to fasta-formatted file $fastaOut ...\n";
+			open (FASTA, ">$fastaOut") or die "Cannot open fasta file for aligned DNA sequences, $fastaOut";
+				foreach my $entry (@accNum)
+					{
+#					next if ($deletedSeq{$entry} or not defined $sequence{$entry});
+					print FASTA ">$nameLabel{$entry}";
+						print FASTA "\t($entry)" unless ($entry =~ /^tAlign/);
+						print FASTA "\n";
+					# Print sequence
+						my $fastaSeq = $finalSeq{$entry};
+						for (my $breakpoint = 0; $breakpoint <= length($fastaSeq); $breakpoint += $fastaLength)
+							{
+							print FASTA substr($fastaSeq, $breakpoint, $fastaLength) . "\n";
+							}
+#					my $fastaSeq = $finalSeq{$entry};
+#						my $breakPoint = 80;
+#						my $breakCount = 0;
+#						until ($breakPoint > length($fastaSeq))
+#							{
+#							$breakCount++;
+#							my $replaceString = "\n" . substr($fastaSeq, $breakPoint, 1);
+#							substr($fastaSeq, $breakPoint, 1) = $replaceString;
+#							$breakPoint += 81;	# Latter accounts for addition of \n to string
+#							}
+#					print FASTA "\n$fastaSeq\n";
+					}
+			close FASTA;
+			}
+	# Print nexus-formatted file
+		if ($nexusPrint)
+			{
+			print "\tWriting to nexus file $nexusOut";
+				print " in interleaved format" if ($outFormat eq "interleave");
+				print " ...\n";
+			open (NEX, ">$nexusOut") or die "Cannot open nexus file for aligned DNA sequences, $nexusOut";
+				print NEX "#NEXUS\n\n";
+				print NEX "[File created from $seqFile using $perlScript v$version on ".localtime()."]\n\n";
+				print NEX "begin data;\n";
+				print NEX "\tdimensions ntax = $ntax nchar = $maxLength;\n";
+				print NEX "\tformat datatype = $seqType gap = - missing = ?";
+					print NEX " interleave" if ($outFormat eq "interleave");
+					print NEX ";\n\n";
+				print NEX "\tmatrix\n\n";
+					for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++)
+						{
+						my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1;
+						my $seqEnd = $interleaveCount * $interleaveLength;
+							$seqEnd = $maxLength if ($maxLength <= $seqEnd);
+							my $printLength = $seqEnd - $seqStart + 1;
+						foreach my $entry (@accNum)
+							{
+							next if ($deletedSeq{$entry});
+							my $nexusName = $nameLabel{$entry};
+								$nexusName = $entry if ($accPrint);
+							if ($nexusName =~ /\W/)
+								{
+								print NEX "'$nexusName'";
+								}
+							else
+								{
+								print NEX "$nexusName";
+								}
+							my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength);
+								if ($outFormat eq "interleave")	# Add gaps every 10 elements in sequence
+									{
+									for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10)
+										{
+										next if $gapAdd > $printLength;
+										my $bp = substr($printSeq, $gapAdd - 1, 1) . " ";
+										substr($printSeq, $gapAdd - 1, 1, $bp);
+										}
+									}
+								print NEX "\t$printSeq\n";
+							}
+						print NEX "\n" if ($interleaveCount * $interleaveLength <= $maxLength);
+						}
+#				foreach my $entry (@accNum)
+#					{
+#					next if ($deletedSeq{$entry});
+#					if ($nameLabel{$entry} =~ /\W/)
+#						{
+#						print NEX "'$nameLabel{$entry}'";
+#						}
+#					else
+#						{
+#						print NEX "$nameLabel{$entry}";
+#						}
+#					print NEX "\t$finalSeq{$entry}\n";
+#					}
+				print NEX "\t;\nend;\n";
+			close NEX;
+			if (-e "/Developer/Tools/SetFile")
+				{
+				system ("/Developer/Tools/SetFile -t 'TEXT' $nexusOut");
+				system ("/Developer/Tools/SetFile -c 'PAUP' $nexusOut");
+				}
+			}
+	# Print phylip-formatted file (on demand)
+		if ($phylipTradPrint or $phylipExtPrint)
+			{
+			my $maxTaxLength = 50;
+				$maxTaxLength = 10 if ($phylipTradPrint);
+				my $blankName = " " x $maxTaxLength;
+			my %shortNameCount;
+			print "\tWriting to phylip file $phylipOut ...\n";
+			open (PHYLIP, ">$phylipOut") or die "Cannot open phylip file for aligned DNA sequences, $phylipOut";
+				print PHYLIP "    $ntax    $maxLength\n";
+				for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++)
+					{
+					my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1;
+					my $seqEnd = $interleaveCount * $interleaveLength;
+						$seqEnd = $maxLength if ($maxLength <= $seqEnd);
+						my $printLength = $seqEnd - $seqStart + 1;
+#					foreach my $entry (@accNum)
+#						{
+#						my $trimmedName = substr($nameLabel{$entry}, 0, $maxTaxLength);
+#							$shortNameCount{trimmedName} = 0;
+#						}
+					foreach my $entry (@accNum)
+						{
+						next if ($deletedSeq{$entry});
+						my $phylipName = $nameLabel{$entry};
+							$phylipName = $entry if ($accPrint);
+						# Print name label as appropriate; also check label and adjust to proper length if needed
+							if ($interleaveCount == 1)
+								{
+								if (length($phylipName) < $maxTaxLength)
+									{
+									$shortNameCount{$phylipName}++;
+									$phylipName .= " " x ($maxTaxLength - length($phylipName)) if ($phylipTradPrint);	# Pad end of name with spaces as needed
+									}
+								else
+									{
+									my $trimmedName = substr($phylipName, 0, $maxTaxLength);
+									$shortNameCount{$trimmedName}++;
+									if ($shortNameCount{$trimmedName} > 1)	# Check for duplicates among shortened names and make unique by adding numbers
+										{
+										$phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName}));
+											$phylipName .= $shortNameCount{$trimmedName};
+											$phylipName .= " " x ($maxTaxLength - length($phylipName));	# Pad end of name with spaces as needed
+										}
+									else
+										{
+										$phylipName = $trimmedName;
+										}
+									}
+								print PHYLIP "$phylipName";
+								print PHYLIP " " if ($phylipExtPrint);
+								}
+							else
+								{
+								print PHYLIP "$blankName";
+								}
+						# Print sequence
+							my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength);
+								if ($outFormat eq "interleave")	# Add gaps every 10 elements in sequence
+									{
+									for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10)
+										{
+										next if $gapAdd > $printLength;
+										my $bp = substr($printSeq, $gapAdd - 1, 1) . " ";
+										substr($printSeq, $gapAdd - 1, 1, $bp);
+										}
+									}
+								print PHYLIP "$printSeq\n";
+						}
+					print PHYLIP "\n" if ($interleaveCount * $interleaveLength <= $maxLength);
+#				foreach my $entry (@accNum)
+#					{
+#					next if ($deletedSeq{$entry});
+#
+#					my $phylipName = $nameLabel{$entry};
+#
+#					# Check name label and adjust to proper length if needed
+#						if (length($phylipName) < $maxTaxLength)
+#							{
+#							$shortNameCount{$phylipName}++;
+#							$phylipName .= " " x ($maxTaxLength - length($phylipName));	# Pad end of name with spaces as needed
+#							}
+#						else
+#							{
+#							my $trimmedName = substr($phylipName, 0 , $maxTaxLength);
+#							$shortNameCount{$trimmedName}++;
+#							if ($shortNameCount{$trimmedName} > 1)	# Check for duplicates among shortened names and make unique by adding numbers
+#								{
+#								$phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName}));
+#									$phylipName .= $shortNameCount{$trimmedName};
+#									$phylipName .= " " x ($maxTaxLength - length($phylipName));	# Pad end of name with spaces as needed
+#								}
+#							else
+#								{
+#								$phylipName = $trimmedName;
+#								}
+#							}
+#
+#					print PHYLIP "$phylipName";
+#						print PHYLIP " " if ($phylipExtPrint);
+#					print PHYLIP "$finalSeq{$entry}\n";
+					}
+			close PHYLIP;
+			}
+	# Print Se-Al-formatted file (on demand)
+		if ($sealPrint)
+			{
+			print "\tWriting to Se_Al file $sealOut ...\n";
+			open (SEAL, ">$sealOut") or die "Cannot open Se-Al file for aligned DNA sequences, $sealOut\n";
+				print SEAL "Database={\n";
+				print SEAL "\tID='MLst';\n";
+				print SEAL "\tOwner=null;\n";
+				print SEAL "\tName=null;\n";
+				print SEAL "\tDescription=null;\n";
+				print SEAL "\tFlags=0;\n";
+				print SEAL "\tCount=2;\n";
+				print SEAL "\t{\n\t\t{\n";
+				print SEAL "\t\t\tID='PAli';\n";
+				print SEAL "\t\t\tOwner=1;\n";
+				print SEAL "\t\t\tName=\"$seqFile\";\n";
+				print SEAL "\t\t\tDescription=null;\n";
+				print SEAL "\t\t\tFlags=0;\n";
+				print SEAL "\t\t\tNumSites=$maxLength;\n";
+				print SEAL "\t\t\tType=";
+					if ($seqType eq "nucleotide")
+						{
+						print SEAL "\"Nucleotide\"";
+						}
+					else
+						{
+						print SEAL "\"Amino Acid\"";
+						}
+					print SEAL ";\n";
+				print SEAL "\t\t\tFeatures=null;\n";
+				print SEAL "\t\t\tColourMode=";
+					if ($seqType eq "nucleotide")
+						{
+						print SEAL "1";
+						}
+					else
+						{
+						print SEAL "2";
+						}
+					print SEAL ";\n";
+				print SEAL "\t\t\tLabelMode=0;\n";
+				print SEAL "\t\t\ttriplets=false;\n";
+				print SEAL "\t\t\tinverse=true;\n";
+				print SEAL "\t\t\tCount=$ntax;\n";
+				print SEAL "\t\t\t{\n";
+				my $i = 0;
+				foreach my $sequence (@accNum)
+					{
+					next if ($deletedSeq{$sequence});
+					$i++;
+					print SEAL "\t\t\t\t{\n";
+					print SEAL "\t\t\t\t\tID='PSeq';\n";
+					print SEAL "\t\t\t\t\tOwner=2;\n";
+					print SEAL "\t\t\t\t\tName=\"$nameLabel{$sequence}\";\n";
+					print SEAL "\t\t\t\t\tDescription=null;\n";
+					print SEAL "\t\t\t\t\tFlags=0;\n";
+					print SEAL "\t\t\t\t\tAccession=";
+						if ($sequence =~/^tAlign_/)
+							{
+							print SEAL "null;\n";
+							}
+						else
+							{
+							print SEAL "$sequence;\n";
+							}
+					if ($seqType eq "nucleotide")
+						{
+						print SEAL "\t\t\t\t\tType=\"DNA\";\n";
+						}
+					else
+						{
+						print SEAL "\t\t\t\t\tType=\"AA\";\n";
+						}
+					print SEAL "\t\t\t\t\tLength=".length($finalSeq{$sequence}).";\n";
+					print SEAL "\t\t\t\t\tSequence=\"$finalSeq{$sequence}\";\n";
+					if (defined $geneticCode{$sequence} and $geneticCode{$sequence} != 0)
+						{
+						print SEAL "\t\t\t\t\tGeneticCode=$gb2seal{$geneticCode{$sequence}};\n";
+						}
+					else
+						{
+						print SEAL "\t\t\t\t\tGeneticCode=-1;\n";	# Default for Se-Al is non-coding
+						}
+					print SEAL "\t\t\t\t\tCodeTable=null;\n";
+					print SEAL "\t\t\t\t\tFrame=1;\n";
+					print SEAL "\t\t\t\t\tFeatures=null;\n";
+					print SEAL "\t\t\t\t\tParent=null;\n";
+					print SEAL "\t\t\t\t\tComplemented=false;\n";
+					print SEAL "\t\t\t\t\tReversed=false;\n";
+					print SEAL "\t\t\t\t}";
+					print SEAL "," unless ($i == $ntax);
+					print SEAL "\n";
+					}
+				print SEAL "\t\t\t};\n";
+				print SEAL "\t\t},\n";
+				print SEAL "\t\t{\n";
+				print SEAL "\t\t\tID='MCoL';\n";
+				print SEAL "\t\t\tOwner=1;\n";
+				print SEAL "\t\t\tName=\"Genetic Codes\";\n";
+				print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n";
+				print SEAL "\t\t\tFlags=0;\n";
+				print SEAL "\t\t\tCount=0;\n";
+				print SEAL "\t\t}\n";
+				print SEAL "\t};\n";
+				print SEAL "};\n";
+			close SEAL;
+			if (-e "/Developer/Tools/SetFile")
+				{
+				system ("/Developer/Tools/SetFile -t 'TEXT' $sealOut");
+				system ("/Developer/Tools/SetFile -c 'SEAL' $sealOut");
+				}
+			}
+	}
+sub geneticCoder	# Create translation tables for all genetic codes
+	{
+	my %geneticCode = ('1' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '2' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG',
+					   '3' => 'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '4' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '5' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG',
+					   '6' => 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '9' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
+					   '10' => 'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '11' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '12' => 'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '13' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG',
+					   '14' => 'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
+					   '15' => 'FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '16' => 'FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '21' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
+					   '22' => 'FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
+					   '23' => 'FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG');
+	foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23))
+		{
+		# Establish basic translation table for each genetic code
+#			print "\nEstablishing \"$transTable{$code}\" genetic code ...\n" if ($debug);
+			my $position = 0;
+			foreach my $base1 (qw (T C A G))
+				{
+				foreach my $base2 (qw (T C A G))
+					{
+					foreach my $base3 (qw (T C A G))
+						{
+						my $codon = $base1.$base2.$base3;
+						$DNAtoAA{$code}{$codon} = substr($geneticCode{$code}, $position, 1);
+#							print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug);
+						$position++;
+						}
+					}
+				}
+		# Extend translation table to account for ambiguity codes (note: does not account for gaps)
+#			print "\nExtending translation table to account for ambiguity codes ...\n" if ($debug);
+			foreach my $firstPos (@ambigList)
+				{
+				foreach my $secondPos (@ambigList)
+					{
+					foreach my $thirdPos (@ambigList)
+						{
+						my $codon = $firstPos.$secondPos.$thirdPos;
+						next if (defined $DNAtoAA{$code}{$codon});
+						my $refAA = "";
+						foreach my $firstNT (@ {$constitNTlist{$firstPos} })
+							{
+							last if (defined $DNAtoAA{$code}{$codon});
+							foreach my $secondNT (@ {$constitNTlist{$secondPos} })
+								{
+								last if (defined $DNAtoAA{$code}{$codon});
+								foreach my $thirdNT (@ {$constitNTlist{$thirdPos} })
+									{
+									my $testCodon = $firstNT.$secondNT.$thirdNT;
+									if (not $refAA)
+										{
+										$refAA = $DNAtoAA{$code}{$testCodon};
+										}
+									else
+										{
+										if ($DNAtoAA{$code}{$testCodon} ne $refAA)
+											{
+											$DNAtoAA{$code}{$codon} = "?";
+											last;
+											}
+										}
+									}
+								}
+							}
+						$DNAtoAA{$code}{$codon} = $refAA if (not defined $DNAtoAA{$code}{$codon});
+#						print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug);
+						}
+					}
+				}
+		}
+	return;
+	}
+sub translate	# Translate a DNA sequence to an AA sequence (note: does not account for gaps)
+	{
+	my $DNAseq = shift;
+	my $userCode = shift;
+	my $protSeq;
+	for (my $codonStart = 0; $codonStart < length($DNAseq); $codonStart += 3)
+		{
+		if (length($DNAseq) - $codonStart >= 3)	# Codon is complete; translate
+			{
+			my $codon = substr($DNAseq, $codonStart, 3);
+			if ($codon =~ /-/ or $codon =~ /\./)
+				{
+				$protSeq .= "?";
+				}
+			else
+				{
+				$protSeq .= $DNAtoAA{$userCode}{$codon};
+				}
+			}
+		else	# Incomplete codon; automatically translates as ?
+			{
+			$protSeq .= "?";
+			}
+		}
+	return $protSeq;
+	}
+# Version history
+#
+#	v1.2 (August 12, 2010)
+#		- added switch to actually allow fasta output to be specified (or not)
+#		- sets TYPE and CREATOR codes for nexus files on Mac systems when
+#		  SetFile is present
+#		- can now parse GenBank formatted output (both pure GenBank and BioEdit
+#		  versions)
+#		- error checking: warns if same accession number used more than once
+#		  and skips subsequent entires
+#		- added ability to:
+#			- detect sequence type of input (nucleotide vs protein) and set as
+#			  appropriate in output files
+#			- convert nucleotide input to proteins
+#			- convert sequence input to haplotypes
+#			- interleave nexus- and phylip-formatted output (request by Michael
+#			  Craige) and to use inputted value to specify line lengths in fasta
+#			  output
+#			- output individual data partitions specified according to nexus-
+#			  formatted charset statements
+#			- output jackknifed data sets, each missing a single taxon
+#			- clean sequence labels of non-alphanumeric characters (on by
+#			  default)
+#			- convert all ~ gap characters (e.g., from BioEdit) to -
+#			- convert all ambiguous nucleotides to Ns
+#			- change flnaking gaps to Ns (indirect request by Simon Creer)
+#		- fixed classic phylip output such that it now conforms 100% to the
+#		  phylip guidelines
+#		- improved parsing of MacClade generating files, including blocking out
+#		  MacClade and PAUP blocks when reading in nexus-formatted files
+#		- fixed translation between GenBank and Se-Al genetic codes
+#		- changed all instances of #nexus to #NEXUS in output files for
+#		  compatibility with TNT (thanks to Douglas Chester for spotting this)
+#		- minor bug fixes
+#
+#	v1.1 (March 2, 2006)
+#		- added ability to batch convert all specified file types in working
+#		  directory (use -d*)
+#		- updated to seqRead module 1.1.1 (includes autodetection of sequence
+#		  format)
+#		- checks that necessary input file(s) exists before proceeding
+#		- added GNU GPL statement
+#		- sets TYPE and CREATOR codes for Se-Al files on Mac systems when
+#		  SetFile is present
+#		- minor bug fixes
+#
+#	v1.0 (May 30, 2005)
+#		- initial release
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# A copy of the GNU General Public License is available at
+# http://www.gnu.org/copyleft/gpl.html or by writing to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Fifth Floor, Boston, MA, 02110-1301, USA.

Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics

comparison phyloconversion/seqConverterG.pl @ 0:5b9a38ec4a39 draft default tip