annotate corebio/seq_io/_nexus/__init__.py @ 14:778f03497adb

Uploaded
author davidmurphy
date Fri, 24 Feb 2012 11:37:26 -0500
parents c55bdc2fb9fa
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1 # Nexus.py - a NEXUS parser
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
2 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
3 # Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
4 # This code is part of the Biopython distribution and governed by its
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
5 # license. Please see the LICENSE file that should have been included
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
6 # as part of this package.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
7 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
8 # Bug reports welcome: fkauff@duke.edu
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
9 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
10
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
11 """Parse the contents of a nexus file.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
12 Based upon 'NEXUS: An extensible file format for systematic information'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
13 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
14
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
15 Authors: Frank Kauff and Cymon J. Cox
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
16 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
17
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
18 import os,sys, math, random, copy
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
19 import sets
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
20
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
21 # --- Changes from Bio.Nexus ---
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
22 # Renamed Nexus.py to __init__.py. Helps with api documentation.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
23 # One further change in file, tagged with 'GEC'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
24 #from Bio.Alphabet import IUPAC
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
25 #from Bio.Data import IUPACData
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
26 #from Bio.Seq import Seq
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
27
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
28
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
29 from corebio.seq import Seq, Alphabet, protein_alphabet
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
30 import corebio.data as data
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
31 from corebio.utils import Struct
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
32
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# Stand-ins for the IUPACData / IUPAC names that the Bio.Nexus code expects
# (see the commented-out Bio imports above), populated from corebio's own
# letter and ambiguity tables instead.
IUPACData = Struct(
    ambiguous_dna_letters = data.dna_extended_letters,
    ambiguous_rna_letters = data.rna_extended_letters,
    ambiguous_dna_values = data.dna_ambiguity,
    ambiguous_rna_values = data.rna_ambiguity,
    protein_letters = data.amino_acid_letters,
    unambiguous_dna_letters = data.dna_letters,
    unambiguous_rna_letters = data.rna_letters,
)

# The DNA and RNA alphabets are extended with the two extra symbols '-' and
# '?' (presumably the gap and missing-data characters -- confirm against the
# Nexus class). The protein alphabet is taken from corebio unchanged; the
# extended variant is left commented out.
IUPAC = Struct(
    ambiguous_dna = Alphabet(IUPACData.ambiguous_dna_letters+'-?'),
    ambiguous_rna = Alphabet(IUPACData.ambiguous_rna_letters+'-?'),
    protein = protein_alphabet #Alphabet(IUPACData.protein_letters+'-?')
)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
48
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
49
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
50
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
51 # End Changes
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
52
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
53
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
54 from Trees import Tree,NodeData
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
55
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# C is fixed to False because the attempt to import the optional 'cnexus'
# module below is commented out.
C = False

#try:
#    import cnexus
#except ImportError:
#    C=False
#else:
#    C=True

INTERLEAVE = 70
# Commands listed here are singled out by name; how they are treated is
# decided by the parser code elsewhere in this module.
SPECIAL_COMMANDS = [
    'charstatelabels', 'charlabels', 'taxlabels', 'taxset', 'charset',
    'charpartition', 'taxpartition', 'matrix', 'tree', 'utree', 'translate',
]
KNOWN_NEXUS_BLOCKS = ['trees', 'data', 'characters', 'taxa', 'sets']
# The backslash is escaped explicitly: a bare '\,' is an invalid escape
# sequence that modern Python warns about. The resulting string is unchanged.
PUNCTUATION = '()[]{}/\\,;:=*\'"`+-<>'
MRBAYESSAFE = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
WHITESPACE = ' \t\n'
#SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments
SPECIALCOMMENTS = ['&']  # supported special comment ('tree' command), all others are ignored
CHARSET = 'chars'
TAXSET = 'taxa'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
76
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
class NexusError(Exception):
    """Exception type defined by this NEXUS module."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
78
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
class CharBuffer:
    """Helps reading NEXUS-words and characters from a buffer.

    The text is kept in self.buffer as a list of single characters. The
    reading methods consume characters from the front of that list.
    """

    def __init__(self, string):
        """Fill the buffer from string; a falsy string gives an empty buffer."""
        if string:
            self.buffer = list(string)
        else:
            self.buffer = []

    def peek(self):
        """Return the next character without consuming it, or None if empty."""
        if self.buffer:
            return self.buffer[0]
        return None

    def peek_nonwhitespace(self):
        """Return the first non-whitespace character without consuming anything.

        Returns None when nothing but whitespace is left. Whitespace here is
        whatever str.strip() removes, not the WHITESPACE constant.
        """
        b = ''.join(self.buffer).strip()
        if b:
            return b[0]
        return None

    def next(self):
        """Consume and return the next character, or None if empty."""
        if self.buffer:
            return self.buffer.pop(0)
        return None

    def next_nonwhitespace(self):
        """Consume characters up to and including the first one that is not
        in WHITESPACE and return it; return None if the buffer runs out.
        """
        while True:
            p = self.next()
            if p is None:
                break
            if p not in WHITESPACE:
                return p
        return None

    def skip_whitespace(self):
        """Drop leading WHITESPACE characters from the buffer.

        An empty or all-whitespace buffer is handled (the buffer simply ends
        up empty) instead of raising IndexError, and the buffer is sliced
        once rather than once per skipped character.
        """
        skipped = 0
        size = len(self.buffer)
        while skipped < size and self.buffer[skipped] in WHITESPACE:
            skipped += 1
        if skipped:
            self.buffer = self.buffer[skipped:]

    def next_until(self, target):
        """Consume and return the text in front of a target character.

        The characters in target are tried in the order given, and the first
        one that occurs anywhere in the buffer wins (not necessarily the one
        that occurs earliest). The matched character itself stays in the
        buffer. Returns None, leaving the buffer untouched, if none occurs.
        """
        for t in target:
            try:
                pos = self.buffer.index(t)
            except ValueError:
                continue
            found = ''.join(self.buffer[:pos])
            self.buffer = self.buffer[pos:]
            return found
        return None

    def peek_word(self, word):
        """Return True if the buffer starts with word; nothing is consumed."""
        return ''.join(self.buffer[:len(word)]) == word

    def next_word(self):
        """Return the next NEXUS word from a string, dealing with single and double quotes,
        whitespace and punctuation.

        Leading whitespace is skipped. Returns None if nothing is left. A
        punctuation character other than a single quote is returned on its
        own. A word that starts with a single quote runs to the closing
        single quote and is returned with both quotes; a doubled single
        quote is stored as one quote. An unquoted word ends in front of the
        next punctuation or whitespace character.
        """
        word = []
        quoted = False
        first = self.next_nonwhitespace()  # get first character
        if not first:  # return empty if only whitespace left
            return None
        word.append(first)
        if first == "'":  # word starts with a quote
            quoted = True
        elif first in PUNCTUATION:  # if it's punctuation, return immediately
            return first
        while True:
            c = self.peek()
            if c is None:
                # End of buffer. Without this check an unterminated quoted
                # word would append None forever; return what was read.
                break
            if c == "'":  # a quote?
                word.append(self.next())  # store quote
                if self.peek() == "'":  # double quote
                    self.next()  # skip second quote
                elif quoted:  # second single quote ends word
                    break
            elif quoted:
                word.append(self.next())  # if quoted, then add anything
            elif c in PUNCTUATION or c in WHITESPACE:  # if not quoted and special character, stop
                break
            else:
                word.append(self.next())  # standard character
        return ''.join(word)

    def rest(self):
        """Return the rest of the string without parsing."""
        return ''.join(self.buffer)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
169
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
class StepMatrix:
    """Calculate a stepmatrix for weighted parsimony.
    See Wheeler (1990), Cladistics 6:269-275.

    self.symbols is the sorted symbol list (the gap symbol, if any, is
    appended last). self.data maps each unordered pair of distinct symbols
    to a number; the key is the two symbols concatenated in sorted order.
    """

    def __init__(self, symbols, gap):
        """Create a matrix with one zero entry per pair of distinct symbols."""
        self.data = {}
        self.symbols = sorted(symbols)
        if gap:
            self.symbols.append(gap)
        for x in self.symbols:
            for y in self.symbols:
                if y != x:
                    self.set(x, y, 0)

    def set(self, x, y, value):
        """Store value for the pair (x, y); the order of x and y is irrelevant."""
        if x > y:
            x, y = y, x
        self.data[x + y] = value

    def add(self, x, y, value):
        """Add value to the entry for the pair (x, y)."""
        if x > y:
            x, y = y, x
        self.data[x + y] += value

    def sum(self):
        """Return the sum of all entries (0 for a matrix with no pairs).

        The builtin sum() is used instead of reduce(), which raised
        TypeError when there were no entries.
        """
        return sum(self.data.values())

    def transformation(self):
        """Divide every entry by the total, unless the total is 0; return self."""
        total = self.sum()
        if total != 0:
            for k in self.data:
                self.data[k] = self.data[k] / float(total)
        return self

    def weighting(self):
        """Replace every non-zero entry by its negative natural log; return self."""
        for k in self.data:
            if self.data[k] != 0:
                self.data[k] = -math.log(self.data[k])
        return self

    def smprint(self, name='your_name_here'):
        """Return the matrix as a 'usertype ... stepmatrix' text block.

        Entries for identical symbols are printed as '.', zero entries as
        'inf.'.
        """
        # NOTE(review): the spacing inside the literals below is copied as
        # displayed in the annotated listing, which may have collapsed runs
        # of spaces -- confirm the column padding against the repository file.
        matrix = 'usertype %s stepmatrix=%d\n' % (name, len(self.symbols))
        matrix += ' %s\n' % ' '.join(self.symbols)
        for x in self.symbols:
            # ljust() pads the format string itself, before substitution
            matrix += '[%s]'.ljust(8) % x
            for y in self.symbols:
                if x == y:
                    matrix += ' . '
                else:
                    if x > y:
                        x1, y1 = y, x
                    else:
                        x1, y1 = x, y
                    if self.data[x1 + y1] == 0:
                        matrix += 'inf. '
                    else:
                        matrix += '%2.2f'.ljust(10) % (self.data[x1 + y1])
            matrix += '\n'
        matrix += ';\n'
        return matrix
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
231
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def safename(name, mrbayes=False):
    """Return a taxon identifier according to NEXUS standard.
    Wrap quotes around names with punctuation or whitespace, and double single quotes.
    mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes.
    """
    if mrbayes:
        # blanks become underscores, then everything outside MRBAYESSAFE is dropped
        safe = name.replace(' ', '_')
        safe = ''.join([c for c in safe if c in MRBAYESSAFE])
    else:
        safe = name.replace("'", "''")
        # The builtin set replaces the deprecated sets.Set. The test is made
        # after doubling, so a name containing a quote is always wrapped.
        if set(safe).intersection(WHITESPACE + PUNCTUATION):
            safe = "'" + safe + "'"
    return safe
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
245
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def quotestrip(word):
    """Remove quotes and/or double quotes around identifiers.

    Matching pairs of surrounding single or double quotes are removed
    repeatedly. Returns None for an empty or None word. A lone quote
    character is not a quoted pair and is returned unchanged.
    """
    if not word:
        return None
    # At least two characters are needed for an opening and a closing quote.
    while len(word) >= 2 and (
            (word.startswith("'") and word.endswith("'"))
            or (word.startswith('"') and word.endswith('"'))):
        word = word[1:-1]
    return word
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
253
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def get_start_end(sequence, skiplist=('-', '?')):
    """Return position of first and last character which is not in skiplist (defaults to '-' and '?').

    Returns (None, None) for an empty sequence. If every character is in
    skiplist the result is (len(sequence), -1). The default is an immutable
    tuple rather than a shared mutable list; any container supporting 'in'
    may be passed.
    """
    length = len(sequence)
    if length == 0:
        return None, None
    # walk inwards from the right end ...
    end = length - 1
    while end >= 0 and (sequence[end] in skiplist):
        end -= 1
    # ... and from the left end
    start = 0
    while start < length and (sequence[start] in skiplist):
        start += 1
    return start, end
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
267
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
268 def _sort_keys_by_values(p):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
269 """Returns a sorted list of keys of p sorted by values of p."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
270 startpos=[(p[pn],pn) for pn in p if p[pn]]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
271 startpos.sort()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
272 return zip(*startpos)[1]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
273
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
274 def _make_unique(l):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
275 """Check that all values in list are unique and return a pruned and sorted list."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
276 l=list(sets.Set(l))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
277 l.sort()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
278 return l
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
279
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
280 def _seqmatrix2strmatrix(matrix):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
281 """Converts a Seq-object matrix to a plain sequence-string matrix."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
282 return dict([(t,matrix[t].tostring()) for t in matrix])
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
283
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
284 def _compact4nexus(orig_list):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
285 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
286 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
287 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
288
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
289 if not orig_list:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
290 return ''
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
291 orig_list=list(sets.Set(orig_list))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
292 orig_list.sort()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
293 shortlist=[]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
294 clist=orig_list[:]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
295 clist.append(clist[-1]+.5) # dummy value makes it easier
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
296 while len(clist)>1:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
297 step=1
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
298 for i,x in enumerate(clist):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
299 if x==clist[0]+i*step: # are we still in the right step?
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
300 continue
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
301 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
302 # second element, and possibly at least 3 elements to link,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
303 # and the next one is in the right step
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
304 step=x-clist[0]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
305 else: # pattern broke, add all values before current position to new list
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
306 sub=clist[:i]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
307 if len(sub)==1:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
308 shortlist.append(str(sub[0]+1))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
309 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
310 if step==1:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
311 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
312 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
313 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
314 clist=clist[i:]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
315 break
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
316 return ' '.join(shortlist)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
317
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
318 def combine(matrices):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
319 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
320
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
321 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
322 Character sets, character partitions and taxon sets are prefixed, readjusted and present in
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
323 the combined matrix.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
324 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
325
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
326 if not matrices:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
327 return None
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
328 name=matrices[0][0]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
329 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
330 mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
331 if mixed_datatypes:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
332 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself!
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
333 # raise NexusError, 'Matrices must be of same datatype'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
334 combined.charlabels=None
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
335 combined.statelabels=None
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
336 combined.interleave=False
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
337 combined.translate=None
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
338
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
339 # rename taxon sets and character sets and name them with prefix
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
340 for cn,cs in combined.charsets.items():
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
341 combined.charsets['%s.%s' % (name,cn)]=cs
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
342 del combined.charsets[cn]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
343 for tn,ts in combined.taxsets.items():
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
344 combined.taxsets['%s.%s' % (name,tn)]=ts
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
345 del combined.taxsets[tn]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
346 # previous partitions usually don't make much sense in combined matrix
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
347 # just initiate one new partition parted by single matrices
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
348 combined.charpartitions={'combined':{name:range(combined.nchar)}}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
349 for n,m in matrices[1:]: # add all other matrices
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
350 both=[t for t in combined.taxlabels if t in m.taxlabels]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
351 combined_only=[t for t in combined.taxlabels if t not in both]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
352 m_only=[t for t in m.taxlabels if t not in both]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
353 for t in both:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
354 # concatenate sequences and unify gap and missing character symbols
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
355 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
356 # replace data of missing taxa with symbol for missing data
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
357 for t in combined_only:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
358 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
359 for t in m_only:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
360 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
361 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
362 combined.taxlabels.extend(m_only) # new taxon list
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
363 for cn,cs in m.charsets.items(): # adjust character sets for new matrix
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
364 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
365 if m.taxsets:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
366 if not combined.taxsets:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
367 combined.taxsets={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
368 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
369 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
370 # update charlabels
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
371 if m.charlabels:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
372 if not combined.charlabels:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
373 combined.charlabels={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
374 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()]))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
375 combined.nchar+=m.nchar # update nchar and ntax
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
376 combined.ntax+=len(m_only)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
377
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
378 return combined
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
379
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _kill_comments_and_break_lines(text):
    r"""Delete []-delimited comments from text and split it into commands at ';'.

    stripped_text=_kill_comments_and_break_lines(text)

    Returns a list of strings, one per ';'-terminated command, with the ';'
    removed. Text left over after the last ';' is returned as a final
    element.

    Comments may be nested and may span several lines. [ and ] symbols
    within single or double quotes are ignored, and ';' within quotes does
    not end a command. Quotes are not recognised inside comments, so in
    [this character ']' ends a comment] the first ] closes the comment.
    Special comments, i.e. a [ directly followed by a character found in
    SPECIALCOMMENTS (like [&...] and [\...]), remain untouched if not inside
    a standard comment. Quotes inside special comments are treated as normal
    characters, and nesting inside special comments (like [& [\ ]]) is not
    supported.

    Raises NexusError on an unmatched [ or ].

    NOTE: this function is very slow for large files, and obsolete when
    using the C extension cnexus.
    """
    contents = CharBuffer(text)
    newtext = []            # finished commands
    newline = []            # characters of the command being assembled
    quotelevel = ''         # the quote character that is open, '' if none
    speciallevel = False    # True while inside a special comment
    commlevel = 0           # nesting depth of standard comments
    while True:
        t = contents.next()
        if t is None:
            # the loop treats None from the buffer as end of input
            break
        if t == quotelevel and not (commlevel or speciallevel):
            # matching quote ends quotation
            quotelevel = ''
        elif not quotelevel and not (commlevel or speciallevel) and (t == '"' or t == "'"):
            # single or double quote starts quotation
            quotelevel = t
        elif not quotelevel and t == '[':
            # opening bracket outside a quote
            if contents.peek() in SPECIALCOMMENTS and commlevel == 0 and not speciallevel:
                speciallevel = True
            else:
                commlevel += 1
        elif not quotelevel and t == ']':
            # closing bracket outside a quote
            if speciallevel:
                speciallevel = False
            else:
                commlevel -= 1
                if commlevel < 0:
                    raise NexusError('Nexus formatting error: unmatched ]')
                # the bracket that closes a standard comment is not copied
                continue
        if commlevel == 0:
            # copy if we're not in a comment
            if t == ';' and not quotelevel:
                newtext.append(''.join(newline))
                newline = []
            else:
                newline.append(t)
    if newline:
        # newline holds single characters, so they are joined without a
        # separator, exactly as for the commands completed inside the loop
        newtext.append(''.join(newline))
    # level of comments should be 0 at the end of the file
    if commlevel > 0:
        raise NexusError('Nexus formatting error: unmatched [')
    return newtext
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
438
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
439
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
440 def _adjust_lines(lines):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
441 """Adjust linebreaks to match ';', strip leading/trailing whitespace
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
442
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
443 list_of_commandlines=_adjust_lines(input_text)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
444 Lines are adjusted so that no linebreaks occur within a commandline
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
445 (except matrix command line)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
446 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
447 formatted_lines=[]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
448 for l in lines:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
449 #Convert line endings
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
450 l=l.replace('\r\n','\n').replace('\r','\n').strip()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
451 if l.lower().startswith('matrix'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
452 formatted_lines.append(l)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
453 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
454 l=l.replace('\n',' ')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
455 if l:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
456 formatted_lines.append(l)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
457 return formatted_lines
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
458
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
459 def _replace_parenthesized_ambigs(seq,rev_ambig_values):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
460 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
461
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
462 opening=seq.find('(')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
463 while opening>-1:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
464 closing=seq.find(')')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
465 if closing<0:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
466 raise NexusError, 'Missing closing parenthesis in: '+seq
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
467 elif closing<opening:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
468 raise NexusError, 'Missing opening parenthesis in: '+seq
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
469 ambig=[x for x in seq[opening+1:closing]]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
470 ambig.sort()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
471 ambig=''.join(ambig)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
472 ambig_code=rev_ambig_values[ambig.upper()]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
473 if ambig!=ambig.upper():
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
474 ambig_code=ambig_code.lower()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
475 seq=seq[:opening]+ambig_code+seq[closing+1:]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
476 opening=seq.find('(')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
477 return seq
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
478
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
479
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
class Commandline:
    """Represent a commandline as command and options.

    Attributes:
    command -- the command name, stripped and lower-cased
    options -- for commands listed in SPECIAL_COMMANDS the stripped option
               text, so that newlines and the order of options are
               preserved; otherwise a dict mapping each lower-cased option
               name to its value string, or to None for an option given
               without '=value'
    """

    def __init__(self, line, title):
        """Split line into a command and its options.

        line  -- text of one command
        title -- accepted but not used by this constructor

        Raises NexusError if the options end in a dangling '='.
        """
        self.options = {}
        options = []
        self.command = None
        try:
            # Assume matrix (all other command lines have been stripped of \n)
            self.command, options = line.strip().split('\n', 1)
        except ValueError:
            # Not matrix. Split on any whitespace, as the separator could be
            # a tab or several spaces (translate...).
            words = line.split()
            self.command = words[0]
            options = ' '.join(words[1:])
        self.command = self.command.strip().lower()
        if self.command in SPECIAL_COMMANDS:
            # special command that needs newlines and order of options preserved
            self.options = options.strip()
        elif len(options) > 0:
            options = options.replace('=', ' = ').split()
            last = len(options) - 1
            if last > 0 and options[last] == '=':
                # 'name =' with nothing after it: there is no value to read
                raise NexusError('Incorrect formatting in line: %s' % line)
            # (name, '=', value) index triples; a '=' in first position has
            # no name in front of it and is left to be stored as a bare token
            valued_indices = [(n - 1, n, n + 1) for n in range(1, last)
                              if options[n] == '=']
            indices = []
            for triple in valued_indices:
                indices.extend(triple)
            for opt in valued_indices:
                self.options[options[opt[0]].lower()] = options[opt[2]]
            for n in range(len(options)):
                if n not in indices:
                    self.options[options[n].lower()] = None
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
513
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
class Block:
    """A NEXUS block: its title plus the list of its commandlines."""

    def __init__(self, title=None):
        """Create a block called title that has no commandlines yet."""
        self.commandlines = []
        self.title = title
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
519
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
520 class Nexus(object):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
521
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
522 __slots__=['original_taxon_order','__dict__']
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
523
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
524 def __init__(self, input=None):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
525 self.ntax=0 # number of taxa
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
526 self.nchar=0 # number of characters
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
527 self.taxlabels=[] # labels for taxa, ordered by their id
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
528 self.charlabels=None # ... and for characters
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
529 self.statelabels=None # ... and for states
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
530 self.datatype='dna' # (standard), dna, rna, nucleotide, protein
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
531 self.respectcase=False # case sensitivity
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
532 self.missing='?' # symbol for missing characters
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
533 self.gap='-' # symbol for gap
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
534 self.symbols=None # set of symbols
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
535 self.equate=None # set of symbol synonyms
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
536 self.matchchar=None # matching char for matrix representation
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
537 self.labels=None # left, right, no
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
538 self.transpose=False # whether matrix is transposed
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
539 self.interleave=False # whether matrix is interleaved
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
540 self.tokens=False # unsupported
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
541 self.eliminate=None # unsupported
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
542 self.matrix=None # ...
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
543 self.unknown_blocks=[] # blocks we don't care about
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
544 self.taxsets={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
545 self.charsets={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
546 self.charpartitions={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
547 self.taxpartitions={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
548 self.trees=[] # list of Trees (instances of tree class)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
549 self.translate=None # Dict to translate taxon <-> taxon numbers
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
550 self.structured=[] # structured input representation
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
551 self.set={} # dict of the set command to set various options
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
552 self.options={} # dict of the options command in the data block
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
553
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
554 # some defaults
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
555 self.options['gapmode']='missing'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
556
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
557 if input:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
558 self.read(input)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
559
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def get_original_taxon_order(self):
    """Return self.taxlabels (kept for backwards compatibility)."""
    return self.taxlabels

def set_original_taxon_order(self, value):
    """Store value as self.taxlabels (kept for backwards compatibility)."""
    self.taxlabels = value

# original_taxon_order reads and writes taxlabels through the two accessors
original_taxon_order = property(get_original_taxon_order,
                                set_original_taxon_order)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
567
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def read(self, input):
    """Read and parse NEXUS input (a filename or a file-like object).

    input -- either a path that can be opened (a leading '~' is expanded),
             or any object with a read() method such as an open file or a
             StringIO object

    Sets self.filename and then handles every block found: blocks whose
    title is in KNOWN_NEXUS_BLOCKS are passed to _parse_nexus_block, all
    others to _unknown_nexus_block.

    Raises NexusError if input can neither be opened nor read from, or if
    the cnexus scanner reports an unmatched '[' or ']'.
    """
    # 1. Assume we have the name of a file.
    try:
        handle = open(os.path.expanduser(input), 'rU')
        try:
            file_contents = handle.read()
        finally:
            # close the handle even if reading fails
            handle.close()
        self.filename = input
    except (TypeError, IOError, AttributeError):
        # 2. Otherwise assume we have a file object or StringIO object.
        if not hasattr(input, 'read'):
            # format first, slice second, so that input does not have to
            # support slicing for the error to be reported
            raise NexusError('Unrecognized input: %s ...'
                             % ('%s' % (input,))[:100])
        file_contents = input.read()
        # StringIO objects have no name attribute
        if hasattr(input, 'name'):
            self.filename = input.name
        else:
            self.filename = 'Unknown_nexus_file'
    if C:
        decommented = cnexus.scanfile(file_contents)
        # check for unmatched brackets
        if decommented == '[' or decommented == ']':
            raise NexusError('Unmatched %s' % decommented)
        # cnexus can't return lists, so in analogy we separate commandlines
        # with chr(7) (a character that shouldn't be part of a nexus file
        # under normal circumstances)
        commandlines = _adjust_lines(decommented.split(chr(7)))
    else:
        commandlines = _adjust_lines(_kill_comments_and_break_lines(file_contents))
    # get rid of the leading '#NEXUS' token; there may be no lines at all
    if commandlines and commandlines[0][:6].upper() == '#NEXUS':
        commandlines[0] = commandlines[0][6:].strip()
    # now loop through blocks (we parse only data in known blocks, thus
    # ignoring non-block commands)
    for title, contents in self._get_nexus_block(commandlines):
        if title in KNOWN_NEXUS_BLOCKS:
            self._parse_nexus_block(title, contents)
        else:
            self._unknown_nexus_block(title, contents)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
620
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
621 def _get_nexus_block(self,file_contents):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
622 """Generator for looping through Nexus blocks."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
623 inblock=False
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
624 blocklines=[]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
625 while file_contents:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
626 cl=file_contents.pop(0)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
627 if cl.lower().startswith('begin'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
628 if not inblock:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
629 inblock=True
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
630 title=cl.split()[1].lower()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
631 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
632 raise NexusError('Illegal block nesting in block %s' % title)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
633 elif cl.lower().startswith('end'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
634 if inblock:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
635 inblock=False
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
636 yield title,blocklines
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
637 blocklines=[]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
638 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
639 raise NexusError('Unmatched \'end\'.')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
640 elif inblock:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
641 blocklines.append(cl)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
642
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _unknown_nexus_block(self, title, contents):
    """Keep a block that is not parsed in self.unknown_blocks.

    A Block named title is created, contents is appended to its
    commandlines as one single entry, and the block is added to
    self.unknown_blocks.
    """
    unparsed = Block(title)
    unparsed.commandlines.append(contents)
    self.unknown_blocks.append(unparsed)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
648
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _parse_nexus_block(self, title, contents):
    """Parse a known Nexus block.

    _apply_block_structure(title, contents) is called first; the block is
    then taken from the end of self.structured and each of its commandlines
    is passed to the method called '_' + command, with the options of the
    commandline as only argument. If taxa, characters or data are defined
    more than once, the later occurrences override the previous ones.

    Raises NexusError if there is no method for a command.
    """
    # attach the structured block representation
    self._apply_block_structure(title, contents)
    block = self.structured[-1]
    for line in block.commandlines:
        # Look the handler up separately from calling it, so that an
        # AttributeError raised inside a handler is not mistaken for an
        # unknown command.
        handler = getattr(self, '_' + line.command, None)
        if handler is None:
            raise NexusError('Unknown command: %s ' % line.command)
        handler(line.options)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
662
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
663 def _dimensions(self,options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
664 if options.has_key('ntax'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
665 self.ntax=eval(options['ntax'])
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
666 if options.has_key('nchar'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
667 self.nchar=eval(options['nchar'])
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
668
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
669 def _format(self,options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
670 # print options
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
671 # we first need to test respectcase, then symbols (which depends on respectcase)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
672 # then datatype (which, if standard, depends on symbols and respectcase in order to generate
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
673 # dicts for ambiguous values and alphabet
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
674 if options.has_key('respectcase'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
675 self.respectcase=True
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
676 # adjust symbols to for respectcase
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
677 if options.has_key('symbols'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
678 self.symbols=options['symbols']
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
679 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
680 (self.symbold.startswith("'") and self.symbols.endswith("'")):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
681 self.symbols=self.symbols[1:-1].replace(' ','')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
682 if not self.respectcase:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
683 self.symbols=self.symbols.lower()+self.symbols.upper()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
684 self.symbols=list(sets.Set(self.symbols))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
685 if options.has_key('datatype'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
686 self.datatype=options['datatype'].lower()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
687 if self.datatype=='dna' or self.datatype=='nucleotide':
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
688 self.alphabet=IUPAC.ambiguous_dna
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
689 self.ambiguous_values=IUPACData.ambiguous_dna_values
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
690 self.unambiguous_letters=IUPACData.unambiguous_dna_letters
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
691 elif self.datatype=='rna':
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
692 self.alphabet=IUPAC.ambiguous_rna
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
693 self.ambiguous_values=IUPACData.ambiguous_rna_values
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
694 self.unambiguous_letters=IUPACData.unambiguous_rna_letters
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
695 elif self.datatype=='protein':
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
696 self.alphabet=IUPAC.protein
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
697 self.ambiguous_values={'B':'DN','Z':'EQ','X':IUPACData.protein_letters} # that's how PAUP handles it
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
698 self.unambiguous_letters=IUPACData.protein_letters+'*' # stop-codon
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
699 elif self.datatype=='standard':
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
700 raise NexusError('Datatype standard is not yet supported.')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
701 #self.alphabet=None
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
702 #self.ambiguous_values={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
703 #if not self.symbols:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
704 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
705 #self.unambiguous_letters=self.symbols
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
706 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
707 raise NexusError, 'Unsupported datatype: '+self.datatype
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
708 self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
709 if not self.respectcase:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
710 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
711 #we have to sort the reverse ambig coding dict key characters:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
712 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
713 rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X'])
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
714 self.rev_ambiguous_values={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
715 for (k,v) in rev.items():
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
716 key=[c for c in k]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
717 key.sort()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
718 self.rev_ambiguous_values[''.join(key)]=v
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
719 #overwrite symbols for datype rna,dna,nucleotide
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
720 if self.datatype in ['dna','rna','nucleotide']:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
721 self.symbols=self.alphabet.letters
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
722 if self.missing not in self.ambiguous_values:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
723 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
724 self.ambiguous_values[self.gap]=self.gap
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
725 elif self.datatype=='standard':
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
726 if not self.symbols:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
727 self.symbols=['1','0']
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
728 if options.has_key('missing'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
729 self.missing=options['missing'][0]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
730 if options.has_key('gap'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
731 self.gap=options['gap'][0]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
732 if options.has_key('equate'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
733 self.equate=options['equate']
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
734 if options.has_key('matchchar'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
735 self.matchchar=options['matchchar'][0]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
736 if options.has_key('labels'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
737 self.labels=options['labels']
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
738 if options.has_key('transpose'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
739 raise NexusError, 'TRANSPOSE is not supported!'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
740 self.transpose=True
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
741 if options.has_key('interleave'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
742 if options['interleave']==None or options['interleave'].lower()=='yes':
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
743 self.interleave=True
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
744 if options.has_key('tokens'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
745 self.tokens=True
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
746 if options.has_key('notokens'):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
747 self.tokens=False
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
748
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
749
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
750 def _set(self,options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
751 self.set=options;
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
752
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
753 def _options(self,options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
754 self.options=options;
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
755
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
756 def _eliminate(self,options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
757 self.eliminate=options
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
758
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _taxlabels(self, options):
    """Read the taxon labels of a TAXLABELS command into ``self.taxlabels``.

    ``self.taxlabels`` is reset to an empty list, then words are taken from
    ``options`` one at a time, passed through ``quotestrip`` and appended.
    Reading stops at the first word that comes back empty/None.
    """
    self.taxlabels = []
    buf = CharBuffer(options)
    label = quotestrip(buf.next_word())
    while label:
        self.taxlabels.append(label)
        label = quotestrip(buf.next_word())
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
768
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
769 def _check_taxlabels(self,taxon):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
770 """Check for presence of taxon in self.taxlabels."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
771 # According to NEXUS standard, underscores shall be treated as spaces...,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
772 # so checking for identity is more difficult
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
773 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
774 nexid=taxon.replace(' ','_')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
775 return nextaxa.get(nexid)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
776
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _charlabels(self, options):
    """Parse a CHARLABELS command into ``self.charlabels``.

    ``options`` is read as a comma-separated list of ``<id> <label>`` pairs.
    Each id is resolved with ``self._resolve(..., set_type=CHARSET)`` and
    used as the key; the label, stripped of quotes by ``quotestrip``, is the
    value.  ``self.charlabels`` is replaced by a fresh dict on every call.

    Raises NexusError when a pair is not followed by ',' or the end of the
    command, or when any other error occurs while parsing.
    """
    self.charlabels = {}
    opts = CharBuffer(options)
    while True:
        try:
            # get id and state
            w = opts.next_word()
            if w is None:  # McClade saves and reads charlabel-lists with terminal comma?!
                break
            identifier = self._resolve(w, set_type=CHARSET)
            state = quotestrip(opts.next_word())
            self.charlabels[identifier] = state
            # check for comma or end of command
            c = opts.next_nonwhitespace()
            if c is None:
                break
            elif c != ',':
                raise NexusError('Missing \',\' in line %s.' % options)
        except NexusError:
            raise
        except Exception:
            # Report genuine parsing failures as format errors.  A bare
            # ``except:`` would also turn KeyboardInterrupt / SystemExit
            # into a NexusError, so only Exception is caught here.
            raise NexusError('Format error in line %s.' % options)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
799
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
800 def _charstatelabels(self,options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
801 # warning: charstatelabels supports only charlabels-syntax!
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
802 self._charlabels(options)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
803
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
804 def _statelabels(self,options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
805 #self.charlabels=options
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
806 #print 'Command statelabels is not supported and will be ignored.'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
807 pass
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
808
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _matrix(self, options):
    """Parse the MATRIX command into ``self.matrix``.

    ``options`` is the raw text of the command.  Blank lines are dropped;
    every remaining line starts with a taxon name followed by sequence
    characters.  For a non-interleaved matrix further lines are consumed
    until ``self.nchar`` characters have been collected for the taxon.  For
    an interleaved matrix (``self.interleave``) each block contributes one
    segment per taxon and segments of later blocks are appended to the
    sequence stored in the first block.

    Reads ``self.ntax``, ``self.nchar``, ``self.taxlabels``,
    ``self.interleave``, ``self.matchchar``, ``self.gap``, ``self.missing``,
    ``self.valid_characters``, ``self.rev_ambiguous_values`` and
    ``self.alphabet``.  Replaces ``self.matrix`` (taxon name -> Seq) and, when
    no taxlabels were defined beforehand, appends each name to
    ``self.taxlabels``.

    A taxon name that already exists in the first block is stored under
    ``<name>.copy``, ``<name>.copy1``, ``<name>.copy2`` ... instead of
    overwriting the earlier sequence.

    Raises NexusError for missing dimensions, a wrong number of taxa, an
    unknown taxon, a matrix that ends in the middle of a sequence, an illegal
    character, or a sequence whose length differs from ``self.nchar``.
    """
    if not self.ntax or not self.nchar:
        raise NexusError('Dimensions must be specified before matrix!')
    taxlabels_present = (self.taxlabels != [])
    self.matrix = {}
    taxcount = 0
    block_interleave = 0
    # eliminate empty lines and leading/trailing whitespace
    lines = [l.strip() for l in options.split('\n') if l.strip() != '']
    lineiter = iter(lines)
    while 1:
        try:
            l = lineiter.next()
        except StopIteration:
            if taxcount < self.ntax:
                raise NexusError('Not enough taxa in matrix.')
            elif taxcount > self.ntax:
                raise NexusError('Too many taxa in matrix.')
            else:
                break
        # count the taxa and check for interleaved matrix
        taxcount += 1
        if taxcount > self.ntax:
            if not self.interleave:
                raise NexusError('Too many taxa in matrix - should matrix be interleaved?')
            else:
                # first taxon of the next interleaved block
                taxcount = 1
                block_interleave = 1
        # get taxon name and sequence
        linechars = CharBuffer(l)
        name = quotestrip(linechars.next_word())
        l = linechars.rest().strip()
        if taxlabels_present and not self._check_taxlabels(name):
            raise NexusError('Taxon ' + name + ' not found in taxlabels.')
        chars = ''
        try:
            if self.interleave:
                # interleaved matrix: the segment is on this line or the next one
                if l:
                    chars = ''.join(l.split())
                else:
                    chars = ''.join(lineiter.next().split())
            else:
                # non-interleaved matrix: keep reading until nchar characters
                chars = ''.join(l.split())
                while len(chars) < self.nchar:
                    l = lineiter.next()
                    chars += ''.join(l.split())
        except StopIteration:
            # running out of lines here used to leak a bare StopIteration
            raise NexusError('Taxon %s: matrix ends before the sequence is complete '
                             '(check dimensions / interleaving)' % name)
        iupac_seq = Seq(_replace_parenthesized_ambigs(chars, self.rev_ambiguous_values), self.alphabet)
        # first taxon has the reference sequence if matchchar is used
        if taxcount == 1:
            refseq = iupac_seq
        elif self.matchchar:
            seq_str = iupac_seq.tostring()
            if self.matchchar in seq_str:
                # Single pass instead of repeated find(): the old loop never
                # terminated when the reference itself held the matchchar at
                # that position, and raised IndexError on a short reference.
                # NOTE(review): assumes matchchar is a single character, as
                # set from options['matchchar'][0] -- confirm for other callers.
                ref_str = refseq.tostring()
                resolved = []
                for p, c in enumerate(seq_str):
                    if c == self.matchchar:
                        if p >= len(ref_str):
                            raise NexusError('Taxon %s: matchchar at position %d has no '
                                             'counterpart in the first taxon.' % (name, p + 1))
                        c = ref_str[p]
                    resolved.append(c)
                iupac_seq = Seq(''.join(resolved), self.alphabet)
        # check for invalid characters
        seq_str = iupac_seq.tostring()
        for i, c in enumerate(seq_str):
            if c not in self.valid_characters and c != self.gap and c != self.missing:
                # show the neighbourhood inside the sequence itself; clamp the
                # start so that positions < 10 do not wrap to a negative index
                raise NexusError('Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'
                                 % (name, c, seq_str[max(0, i - 10):i + 10]))
        # add sequence to matrix
        if block_interleave == 0:
            while name in self.matrix:
                parts = name.split('.')
                counter = parts[-1][4:]
                if parts[-1].startswith('copy') and (counter == '' or counter.isdigit()):
                    # int() instead of eval(): no code execution from file
                    # content, and '08' / '09' parse as decimal numbers
                    name = '.'.join(parts[:-1]) + '.copy' + str(int(counter or '0') + 1)
                else:
                    name += '.copy'
            self.matrix[name] = iupac_seq
            # add taxon name only if taxlabels is not already present
            if not taxlabels_present:
                self.taxlabels.append(name)
        else:
            taxon_present = self._check_taxlabels(name)
            if taxon_present:
                self.matrix[taxon_present] += iupac_seq
            else:
                raise NexusError('Taxon %s not in first block of interleaved matrix.' % name)
    # check all sequences for length according to nchar
    for taxon in self.matrix:
        if len(self.matrix[taxon]) != self.nchar:
            raise NexusError('Nchar (' + str(self.nchar) + ') does not match data for taxon ' + taxon)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
896
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _translate(self, options):
    """Parse a TRANSLATE command into ``self.translate``.

    ``options`` is read as a comma-separated list of ``<number> <label>``
    pairs.  The number is converted with ``int`` and used as the key; the
    label, stripped of quotes by ``quotestrip``, is the value.
    ``self.translate`` is replaced by a fresh dict on every call.

    Raises NexusError when a pair is not followed by ',' or the end of the
    command, or when any other error occurs while parsing (for example a
    non-numeric identifier).
    """
    self.translate = {}
    opts = CharBuffer(options)
    while True:
        try:
            # get id and state
            identifier = int(opts.next_word())
            label = quotestrip(opts.next_word())
            self.translate[identifier] = label
            # check for comma or end of command
            c = opts.next_nonwhitespace()
            if c is None:
                break
            elif c != ',':
                raise NexusError('Missing \',\' in line %s.' % options)
        except NexusError:
            raise
        except Exception:
            # Report genuine parsing failures as format errors.  A bare
            # ``except:`` would also turn KeyboardInterrupt / SystemExit
            # into a NexusError, so only Exception is caught here.
            raise NexusError('Format error in line %s.' % options)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
916
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
917 def _utree(self,options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
918 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
919 self._tree(options)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
920
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _tree(self, options):
    """Parse a TREE command and append the resulting Tree to ``self.trees``.

    ``options`` must have the form ``<name> = [special comments] <tree>``.
    Special comments are ``[&R]`` (rooted), ``[&U]`` (unrooted) and
    ``[&W <number>]`` (weight); any other ``[&x...]`` comment is read and
    ignored.  The defaults are unrooted and weight 1.0.  The remaining text
    is handed to ``Tree`` as the tree description.

    When ``self.translate`` is non-empty, the taxon of every terminal node is
    converted with ``int`` and replaced by ``safename`` of its entry in that
    table.

    Raises NexusError when '=' is missing, when a bracketed comment does not
    start with '&', when a weight is not a number, or when a terminal cannot
    be substituted through the translate table.
    """
    opts = CharBuffer(options)
    name = opts.next_word()
    if opts.next_nonwhitespace() != '=':
        raise NexusError('Syntax error in tree description: %s' % options[:50])
    rooted = False
    weight = 1.0
    while opts.peek_nonwhitespace() == '[':
        opts.next_nonwhitespace()  # consume '['
        symbol = opts.next()
        if symbol != '&':
            raise NexusError('Illegal special comment [%s...] in tree description: %s' % (symbol, options[:50]))
        special = opts.next()
        value = opts.next_until(']')
        opts.next()  # consume the character after the comment text (the ']')
        if special == 'R':
            rooted = True
        elif special == 'U':
            rooted = False
        elif special == 'W':
            try:
                weight = float(value)
            except (TypeError, ValueError):
                # a malformed weight is a file format problem, not a crash
                raise NexusError('Illegal tree weight [&W%s] in tree description: %s' % (value, options[:50]))
    tree = Tree(name=name, weight=weight, rooted=rooted, tree=opts.rest().strip())
    # if there's an active translation table, translate
    if self.translate:
        for n in tree.get_terminals():
            try:
                tree.node(n).data.taxon = safename(self.translate[int(tree.node(n).data.taxon)])
            except (ValueError, KeyError):
                raise NexusError('Unable to substitue %s using \'translate\' data.' % tree.node(n).data.taxon)
    self.trees.append(tree)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
951
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _apply_block_structure(self, title, lines):
    """Append a new ``Block`` describing one NEXUS block to ``self.structured``.

    The block is created with ``Block('')``, given ``title`` as its title and
    one ``Commandline(line, title)`` per entry of ``lines``.
    """
    new_block = Block('')
    new_block.title = title
    add_command = new_block.commandlines.append
    for raw in lines:
        add_command(Commandline(raw, title))
    self.structured.append(new_block)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
958
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _taxset(self, options):
    """Parse a TAXSET definition and store it in ``self.taxsets``.

    ``options`` is handed to ``_get_indices`` with ``set_type=TAXSET``; the
    returned members are passed through ``_make_unique`` and stored under
    the returned set name.
    """
    parsed = self._get_indices(options, set_type=TAXSET)
    set_name, members = parsed
    self.taxsets[set_name] = _make_unique(members)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
962
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _charset(self, options):
    """Parse a CHARSET definition and store it in ``self.charsets``.

    ``options`` is handed to ``_get_indices`` with ``set_type=CHARSET``; the
    returned members are passed through ``_make_unique`` and stored under
    the returned set name.
    """
    parsed = self._get_indices(options, set_type=CHARSET)
    set_name, members = parsed
    self.charsets[set_name] = _make_unique(members)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
966
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _taxpartition(self, options):
    """Parse a TAXPARTITION command and store it in ``self.taxpartitions``.

    After the partition name (read by ``_name_n_vector``) the remaining text
    is cut into subpartitions at every comma that is not inside single
    quotes.  Each piece is parsed by ``_get_indices`` with ':' as separator
    and ``set_type=TAXSET``; its members go through ``_make_unique``.  The
    result, a dict of subpartition name -> members, is stored under the
    partition name.

    Raises NexusError when no partition name is found.
    """
    buf = CharBuffer(options)
    partition_name = self._name_n_vector(buf)
    if not partition_name:
        raise NexusError('Formatting error in taxpartition: %s ' % options)
    # Subpartitions are separated by commas, which may also occur inside a
    # quoted identifier, so the text is scanned character by character
    # rather than split; this also avoids parsing the words twice.
    subsets = {}
    in_quotes = False
    pending = ''
    while True:
        ch = buf.next()
        finished = ch is None
        if finished or (ch == ',' and not in_quotes):
            sub_name, sub_members = self._get_indices(pending, set_type=TAXSET, separator=':')
            subsets[sub_name] = _make_unique(sub_members)
            pending = ''
            if finished:
                break
            continue
        if ch == "'":
            in_quotes = not in_quotes
        pending += ch
    self.taxpartitions[partition_name] = subsets
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
991
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _charpartition(self, options):
    """Parse a CHARPARTITION command and store it in ``self.charpartitions``.

    After the partition name (read by ``_name_n_vector``) the remaining text
    is cut into subpartitions at every comma that is not inside single
    quotes.  Each piece is parsed by ``_get_indices`` with ':' as separator
    and ``set_type=CHARSET``; its members go through ``_make_unique``.  The
    result, a dict of subpartition name -> members, is stored under the
    partition name.

    Raises NexusError when no partition name is found.
    """
    buf = CharBuffer(options)
    partition_name = self._name_n_vector(buf)
    if not partition_name:
        raise NexusError('Formatting error in charpartition: %s ' % options)
    # Subpartitions are separated by commas, which may also occur inside a
    # quoted identifier, so the text is scanned character by character.
    subsets = {}
    in_quotes = False
    pending = ''
    while True:
        ch = buf.next()
        finished = ch is None
        if finished or (ch == ',' and not in_quotes):
            sub_name, sub_members = self._get_indices(pending, set_type=CHARSET, separator=':')
            subsets[sub_name] = _make_unique(sub_members)
            pending = ''
            if finished:
                break
            continue
        if ch == "'":
            in_quotes = not in_quotes
        pending += ch
    self.charpartitions[partition_name] = subsets
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1015
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _get_indices(self, options, set_type=CHARSET, separator='='):
    """Split a taxset/charset specification into its name and its members.

    The name is read with ``_name_n_vector`` (which also consumes
    ``separator``); the rest of ``options`` is parsed by ``_parse_list``
    using ``set_type``.

    Returns the tuple ``(name, indices)``.
    Raises NexusError when ``_parse_list`` returns None.
    """
    buf = CharBuffer(options)
    set_name = self._name_n_vector(buf, separator=separator)
    members = self._parse_list(buf, set_type=set_type)
    if members is not None:
        return set_name, members
    raise NexusError('Formatting error in line: %s ' % options)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1026
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _name_n_vector(self, opts, separator='='):
    """Extract a set/partition name and reject definitions in VECTOR format.

    The next word of ``opts`` is the name.  It may be followed by a
    parenthesised qualifier: ``(STANDARD)`` is accepted and skipped,
    ``(VECTOR)`` and any other qualifier are rejected.  The next
    non-whitespace character must then be ``separator``.

    opts      -- character buffer positioned at the start of the definition
                 (must offer rest, next_word, peek_nonwhitespace and
                 next_nonwhitespace).
    separator -- character expected between the name and its definition.

    Returns the name after ``quotestrip``.
    Raises NexusError when the name, the qualifier or the separator is
    missing, or when the qualifier is not STANDARD.
    """
    # text as returned by opts.rest() before anything is consumed; used in
    # every error message
    rest = opts.rest()
    name = opts.next_word()
    if not name:
        raise NexusError('Formatting error in line: %s ' % rest)
    name = quotestrip(name)
    # peek_nonwhitespace has to be *called*: comparing the bound method to
    # '(' is always False, which silently disabled the qualifier check.
    if opts.peek_nonwhitespace() == '(':
        opts.next_nonwhitespace()  # consume '('
        qualifier = opts.next_word()  # read from the buffer, not from the '(' string
        opts.next_nonwhitespace()  # consume the character after the qualifier (the ')')
        if not qualifier:
            raise NexusError('Formatting error in line: %s ' % rest)
        # the messages report ``rest``; the name ``options`` used here
        # before does not exist in this method
        if qualifier.lower() == 'vector':
            raise NexusError('Unsupported VECTOR format in line %s' % rest)
        elif qualifier.lower() != 'standard':
            raise NexusError('Unknown qualifier %s in line %s' % (qualifier, rest))
    if opts.next_nonwhitespace() != separator:
        raise NexusError('Formatting error in line: %s ' % rest)
    return name
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1045
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1046 def _parse_list(self,options_buffer,set_type):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1047 """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17-21],
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1048 (assuming dog is taxon no. 17 and cat is taxon no. 21).
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1049 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1050 plain_list=[]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1051 if options_buffer.peek_nonwhitespace():
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1052 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1053 while True:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1054 identifier=options_buffer.next_word() # next list element
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1055 if not identifier: # end of list?
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1056 break
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1057 start=self._resolve(identifier,set_type=set_type)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1058 if options_buffer.peek_nonwhitespace()=='-': # followd by -
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1059 end=start
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1060 step=1
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1061 # get hyphen and end of range
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1062 hyphen=options_buffer.next_nonwhitespace()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1063 end=self._resolve(options_buffer.next_word(),set_type=set_type)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1064 if set_type==CHARSET:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1065 if options_buffer.peek_nonwhitespace()=='\\': # followd by \
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1066 backslash=options_buffer.next_nonwhitespace()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1067 step=int(options_buffer.next_word()) # get backslash and step
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1068 plain_list.extend(range(start,end+1,step))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1069 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1070 if type(start)==list or type(end)==list:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1071 raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1072 start=self.taxlabels.index(start)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1073 end=self.taxlabels.index(end)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1074 taxrange=self.taxlabels[start:end+1]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1075 plain_list.extend(taxrange)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1076 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1077 if type(start)==list: # start was the name of charset or taxset
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1078 plain_list.extend(start)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1079 else: # start was an ordinary identifier
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1080 plain_list.append(start)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1081 except NexusError:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1082 raise
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1083 except:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1084 return None
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1085 return plain_list
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1086
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _resolve(self,identifier,set_type=None):
    """Translate one identifier of a NEXUS list into a character index or taxon name.

    Surrounding quotes are stripped from identifier first.

    set_type==CHARSET:
      * a number n with 1<=n<=self.nchar is returned as n-1
        (NEXUS positions to python indices);
      * a text found among the values of self.charlabels returns the key
        of the first match encountered;
      * a text that is a key of self.charsets returns the stored value of
        that character set.
    set_type==TAXSET:
      * a number n with 1<=n<=self.ntax returns self.taxlabels[n-1];
      * a text accepted by self._check_taxlabels returns what that method
        returned;
      * a text that is a key of self.taxsets returns the stored value of
        that taxon set.

    Raises NexusError for a missing or unknown set_type, for unknown names
    and for numbers outside the valid range.
    """
    identifier=quotestrip(identifier)
    if not set_type:
        raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
    if set_type==CHARSET:
        try:
            n=int(identifier)
        except ValueError:
            if self.charlabels:
                for k in self.charlabels:
                    if self.charlabels[k]==identifier:
                        return k
            if self.charsets and identifier in self.charsets:
                return self.charsets[identifier]
            raise NexusError('Unknown character identifier: %s' % identifier)
        # reject n<1 as well: n-1 would be a negative index that silently
        # counts from the end (the taxon branch already checks n>0)
        if n>0 and n<=self.nchar:
            return n-1
        # format the parsed number: identifier is a string and cannot be used with %d
        raise NexusError('Illegal character identifier: %d>nchar (=%d).' % (n,self.nchar))
    elif set_type==TAXSET:
        try:
            n=int(identifier)
        except ValueError:
            taxlabels_id=self._check_taxlabels(identifier)
            if taxlabels_id:
                return taxlabels_id
            if self.taxsets and identifier in self.taxsets:
                return self.taxsets[identifier]
            raise NexusError('Unknown taxon identifier: %s' % identifier)
        if n>0 and n<=self.ntax:
            return self.taxlabels[n-1]
        raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' % (n,self.ntax))
    else:
        raise NexusError('Unknown set specification: %s.'% set_type)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1137
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1138 def _stateset(self, options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1139 #Not implemented
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1140 pass
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1141
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1142 def _changeset(self, options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1143 #Not implemented
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1144 pass
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1145
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1146 def _treeset(self, options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1147 #Not implemented
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1148 pass
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1149
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1150 def _treepartition(self, options):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1151 #Not implemented
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1152 pass
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1153
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
        exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
    """Write one nexus data file for each partition in charpartition.

    For every partition name p, all characters that are not listed in
    charpartition[p] are added to the excluded characters and the rest is
    written with write_nexus_data (data block only, no sets block).  The
    file for p is named <filename without extension>_<p>.data, or
    <filename>_<p> if filename contains no dot after its first character.

    Without a charpartition a single file <filename>.data is written.

    matrix and filename default to self.matrix and self.filename; the
    remaining arguments are passed through to write_nexus_data.

    Returns a dictionary {partition name: file name}, the single file name
    if no charpartition was given, or None if there is no matrix.
    """
    if not matrix:
        matrix=self.matrix
    if not matrix:
        return
    if not filename:
        filename=self.filename
    if charpartition:
        pfilenames={}
        dot=filename.rfind('.')     # the same for every partition
        for p in charpartition:
            members=set(charpartition[p])
            # exclude is copied, the caller's list (or the default) is never modified
            total_exclude=[]+exclude
            total_exclude.extend([c for c in range(self.nchar) if c not in members])
            total_exclude=_make_unique(total_exclude)
            pcomment=comment+'\nPartition: '+p+'\n'
            if dot>0:
                pfilename=filename[:dot]+'_'+p+'.data'
            else:
                pfilename=filename+'_'+p
            pfilenames[p]=pfilename
            self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
                    interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,
                    append_sets=False,mrbayes=mrbayes)
        return pfilenames
    else:
        # honour the filename argument here as well (it already fell back to
        # self.filename above if it was not given)
        fn=filename+'.data'
        self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
                exclude=exclude,delete=delete,comment=comment,append_sets=False,
                mrbayes=mrbayes)
        return fn
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1189
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],
        blocksize=None, interleave=False, interleave_by_partition=False,
        comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False):
    """Write a nexus file with a data block and, by default, a sets block.

    filename  -- path (str) or an object with a write method; defaults to
                 self.filename.  The handle is closed after a successful write.
    matrix    -- matrix to write, defaults to self.matrix
    exclude   -- character positions to leave out
    delete    -- taxa to leave out; every entry has to pass self._check_taxlabels
    blocksize -- characters per line; defaults to 70 if interleave is set,
                 to self.nchar otherwise
    interleave -- write the matrix in interleaved blocks of blocksize
    interleave_by_partition -- name of a partition in self.charpartitions;
                 the matrix is then written in one block per subset of
                 that partition, without the excluded characters
    comment   -- text written in square brackets before the data block
    omit_NEXUS -- do not write the '#NEXUS' header line
    append_sets -- append the text returned by self.append_sets
    mrbayes   -- passed on to safename and self.append_sets

    Returns filename, or None if there is no matrix, no taxon is left
    after deleting, or the first remaining taxon label is empty.
    Raises NexusError for unknown taxa in delete, an unknown partition
    name, a file that cannot be opened or a filename that is neither a
    str nor writable.
    """
    if not matrix:
        matrix=self.matrix
    if not matrix:
        return
    if not filename:
        filename=self.filename
    # report exactly the taxa that failed the check
    unknown=[t for t in delete if not self._check_taxlabels(t)]
    if unknown:
        raise NexusError('Unknown taxa: %s' % ', '.join(unknown))
    if interleave_by_partition:
        if interleave_by_partition not in self.charpartitions:
            raise NexusError('Unknown partition: '+interleave_by_partition)
        partition=self.charpartitions[interleave_by_partition]
        # we need to sort the partition names by starting position before we exclude characters
        names=_sort_keys_by_values(partition)
        newpartition={}
        for p in partition:
            newpartition[p]=[c for c in partition[p] if c not in exclude]
    # how many taxa and how many characters are left?
    undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
    cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
    # this test has to come before undelete[0] is used as a key below
    if not undelete or undelete[0]=='':
        return
    ntax_adjusted=len(undelete)
    nchar_adjusted=len(cropped_matrix[undelete[0]])
    if isinstance(filename,str):
        try:
            fh=open(filename,'w')
        except IOError:
            raise NexusError('Could not open %s for writing.' % filename)
        opened_here=True
    elif hasattr(filename,'write'):
        # an already open file or any other writable object
        fh=filename
        opened_here=False
    else:
        raise NexusError('Cannot write to %r: need a file name or a writable object.' % (filename,))
    completed=False
    try:
        if not omit_NEXUS:
            fh.write('#NEXUS\n')
        if comment:
            fh.write('['+comment+']\n')
        fh.write('begin data;\n')
        fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
        fh.write('\tformat datatype='+self.datatype)
        if self.respectcase:
            fh.write(' respectcase')
        if self.missing:
            fh.write(' missing='+self.missing)
        if self.gap:
            fh.write(' gap='+self.gap)
        if self.matchchar:
            fh.write(' matchchar='+self.matchchar)
        if self.labels:
            fh.write(' labels='+self.labels)
        if self.equate:
            fh.write(' equate='+self.equate)
        if interleave or interleave_by_partition:
            fh.write(' interleave')
        fh.write(';\n')
        if self.charlabels:
            newcharlabels=self._adjust_charlabels(exclude=exclude)
            clkeys=sorted(newcharlabels)
            fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
        fh.write('matrix\n')
        if not blocksize:
            if interleave:
                blocksize=70
            else:
                blocksize=self.nchar
        # width of the name column: the longest safe taxon name
        namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
        if interleave_by_partition:
            # interleave by partitions, but adjust partitions with regard to excluded characters
            seek=0
            for p in names:
                fh.write('[%s: %s]\n' % (interleave_by_partition,p))
                if len(newpartition[p])>0:
                    for taxon in undelete:
                        fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
                        fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
                    fh.write('\n')
                else:
                    fh.write('[empty]\n\n')
                seek+=len(newpartition[p])
        elif interleave:
            for seek in range(0,nchar_adjusted,blocksize):
                for taxon in undelete:
                    fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
                    fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
                fh.write('\n')
        else:
            for taxon in undelete:
                # a sequence that needs more than one line starts below its name
                if blocksize<nchar_adjusted:
                    fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
                else:
                    fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
                for seek in range(0,nchar_adjusted,blocksize):
                    fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
        fh.write(';\nend;\n')
        if append_sets:
            fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
        completed=True
    finally:
        # after a successful write the handle is always closed (as before);
        # after an error only a file opened in here is closed, a handle
        # owned by the caller is left alone
        if completed or opened_here:
            fh.close()
    return filename
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1299
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def append_sets(self,exclude=[],delete=[],mrbayes=False):
    """Return the text of a sets block for the stored sets and partitions.

    Nothing is written to a file in here; the caller writes the returned
    string.  Character sets and character partitions are renumbered so
    that they still point to the same sites after the characters in
    exclude have been removed; taxa in delete are dropped from taxon sets
    and taxon partitions.  Sets and partitions that end up empty are
    omitted.

    exclude -- character positions that are left out of the written matrix
    delete  -- taxa that are left out of the written matrix
    mrbayes -- passed on to safename for the members of taxon sets

    Returns the sets block as a string, or '' if there are no character
    sets, taxon sets, character partitions or taxon partitions at all.
    """
    if not (self.charsets or self.taxsets or self.charpartitions or self.taxpartitions):
        return ''
    setsblock=['\nbegin sets']
    # - now if characters have been excluded, the character sets need to be adjusted,
    #   so that they still point to the right character positions
    # calculate a list of offsets: for each deleted character, the following character position
    # in the new file will have an additional offset of -1
    excluded=set(exclude)
    offset=0
    offlist=[]
    for c in range(self.nchar):
        if c in excluded:
            offset+=1
            offlist.append(-1)  # dummy value as these character positions are excluded
        else:
            offlist.append(c-offset)
    # now adjust each of the character sets
    for n,ns in self.charsets.items():
        cset=[offlist[c] for c in ns if c not in excluded]
        if cset:
            setsblock.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
    for n,s in self.taxsets.items():
        tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
        if tset:
            setsblock.append('taxset %s = %s' % (safename(n),' '.join(tset)))
    for n,p in self.charpartitions.items():
        # as characters have been excluded, the partitions must be adjusted
        # if a partition is empty, it will be omitted from the charpartition command
        # (although paup allows charpartition part=t1:,t2:,t3:1-100)
        names=_sort_keys_by_values(p)
        newpartition={}
        for sn in names:
            nsp=[offlist[c] for c in p[sn] if c not in excluded]
            if nsp:
                newpartition[sn]=nsp
        if newpartition:
            setsblock.append('charpartition %s = %s' % (safename(n),
                ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
    # now write taxpartitions, much easier than charpartitions
    for n,p in self.taxpartitions.items():
        names=_sort_keys_by_values(p)
        newpartition={}
        for sn in names:
            nsp=[t for t in p[sn] if t not in delete]
            if nsp:
                newpartition[sn]=nsp
        if newpartition:
            setsblock.append('taxpartition %s = %s' % (safename(n),
                ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
    # add 'end' and return everything
    setsblock.append('end;\n')
    return ';\n'.join(setsblock)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1354
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def export_fasta(self, filename=None, width=70):
    """Write the matrix into a fasta file.

    filename -- name of the file to write.  If it is not given, the name
                is derived from self.filename: an extension of paup,
                nexus, nex or dat (any case) is replaced by '.fas',
                otherwise '.fas' is appended.
    width    -- number of sequence characters per line

    One record is written per taxon in self.taxlabels; the header is the
    safename of the taxon and the sequence is taken from the tostring()
    of the matrix entry.
    """
    if not filename:
        # filename is empty here, the name has to be derived from self.filename
        if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
            filename='.'.join(self.filename.split('.')[:-1])+'.fas'
        else:
            filename=self.filename+'.fas'
    fh=open(filename,'w')
    try:
        for taxon in self.taxlabels:
            fh.write('>'+safename(taxon)+'\n')
            sequence=self.matrix[taxon].tostring()  # convert once per taxon
            for i in range(0, len(sequence), width):
                fh.write(sequence[i:i+width] + '\n')
    finally:
        # close the file even if writing fails halfway
        fh.close()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1368
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def constant(self,matrix=None,delete=[],exclude=[]):
    """Return the characters that are constant over the retained taxa.

    matrix  -- mapping taxon -> sequence; self.matrix is used when empty.
    delete  -- taxa to leave out of the comparison.
    exclude -- character positions to ignore.

    Returns None when no taxon is left. With exactly one taxon left, a
    list of the non-excluded positions is returned. Otherwise the result
    is a list of (position, states) tuples, where states is the string of
    states still compatible with every taxon seen at that position.
    """
    if not matrix:
        matrix=self.matrix
    kept=[]
    for label in self.taxlabels:
        if label in matrix and label not in delete:
            kept.append(label)
    if not kept:
        return None
    if len(kept)==1:
        return [pos for pos in range(len(matrix[kept[0]])) if pos not in exclude]

    def gap_is_missing():
        # evaluated lazily, only where a comparison actually needs it
        return self.options['gapmode'].lower()=='missing'

    # seed the candidates from the first sequence, expanding ambiguity codes
    candidates=[]
    for pos,letter in enumerate(matrix[kept[0]].tostring()):
        if pos in exclude:
            continue
        letter=letter.upper()
        candidates.append((pos,self.ambiguous_values.get(letter,letter)))

    # narrow the candidates down with every further sequence
    for label in kept[1:]:
        survivors=[]
        for entry in candidates:
            pos,states=entry
            observed=matrix[label][pos].upper()
            if observed==self.missing or (observed==self.gap and gap_is_missing()) or observed==states:
                # missing or same as before -> keep unchanged
                survivors.append(entry)
            elif observed in states or states==self.missing or (gap_is_missing() and states==self.gap):
                # subset of an ambiguity, or only missing so far -> take the new state
                survivors.append((pos,self.ambiguous_values.get(observed,observed)))
            elif observed in self.ambiguous_values:
                # ambiguity code: keep whatever it shares with the previous states
                shared=sets.Set(self.ambiguous_values[observed]).intersection(sets.Set(states))
                if shared:
                    survivors.append((pos,''.join(shared)))
        candidates=survivors
    return candidates
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1406
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def cstatus(self,site,delete=[],narrow=True):
    """Summarize the states found at one character position.

    site   -- index of the character (column) to summarize.
    delete -- taxa to leave out.
    narrow -- True:  paup-mode (a c ? --> ac; ? ? ? --> ?)
              False: (a c ? --> a c g t -; ? ? ? --> a c g t -)

    Returns the sorted list of states, or None when no taxon is left.
    A gap counts as missing when options['gapmode'] is 'missing'.
    """
    kept=[label for label in self.taxlabels if label not in delete]
    if not kept:
        return None
    states=[]
    for label in kept:
        symbol=self.matrix[label][site].upper()
        if self.options.get('gapmode')=='missing' and symbol==self.gap:
            symbol=self.missing
        if narrow and symbol==self.missing:
            # in narrow mode missing data is recorded as itself, once
            if symbol not in states:
                states.append(symbol)
            continue
        # expand the symbol and add whatever has not been seen yet
        fresh=[state for state in self.ambiguous_values[symbol] if state not in states]
        states.extend(fresh)
    if narrow and len(states)>1 and self.missing in states:
        # real states were seen, so missing data adds no information
        states=[state for state in states if state!=self.missing]
    states.sort()
    return states
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1429
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
    """Calculate a stepmatrix for weighted parsimony.

    See Wheeler (1990), Cladistics 6:269-275 and
    Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196

    name    -- handed to smprint as the name of the matrix.
    exclude -- character positions to skip.
    delete  -- taxa to leave out (forwarded to cstatus).

    For every non-excluded site, each unordered pair of states reported
    by cstatus is added once to a StepMatrix; the value returned is
    transformation().weighting().smprint(name=name) of that matrix.
    """
    stepmatrix=StepMatrix(self.unambiguous_letters,self.gap)
    for site in range(self.nchar):
        if site in exclude:
            continue
        states=self.cstatus(site,delete)
        total=len(states)
        # walk all pairs (first < second) in list order
        for first in range(total-1):
            for second in range(first+1,total):
                stepmatrix.add(states[first].upper(),states[second].upper(),1)
    return stepmatrix.transformation().weighting().smprint(name=name)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1442
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def crop_matrix(self,matrix=None, delete=[], exclude=[]):
    """Return a matrix without deleted taxa and excluded characters.

    matrix  -- mapping taxon -> sequence; self.matrix is used when empty.
    delete  -- taxa to drop; every entry must pass self._check_taxlabels.
    exclude -- character positions to drop.

    Returns a dict taxon -> sequence limited to the taxa that appear in
    self.taxlabels and in matrix and are not deleted. Without exclusions
    the original sequence objects are returned; otherwise new Seq objects
    are built from the remaining columns (empty Seq objects if every
    column is excluded). Returns {} when exclusions are requested and no
    taxon is left.

    Raises NexusError if delete names an unknown taxon.
    """
    if not matrix:
        matrix=self.matrix
    if [t for t in delete if not self._check_taxlabels(t)]:
        unknown=sorted(set(delete).difference(self.taxlabels))
        raise NexusError('Unknown taxa: %s' % ', '.join(unknown))
    kept=[t for t in self.taxlabels if t in matrix and t not in delete]
    if not exclude:
        # nothing to cut out: hand back the original sequence objects
        return dict([(t,matrix[t]) for t in kept])
    if not kept:
        return {}
    excluded=set(exclude)  # constant-time membership test per column
    columns=zip(*[matrix[t].tostring() for t in kept])
    remaining=[column for i,column in enumerate(columns) if i not in excluded]
    if not remaining:
        return dict([(t,Seq('',self.alphabet)) for t in kept])
    # transpose the surviving columns back into one string per taxon
    rows=[''.join(row) for row in zip(*remaining)]
    return dict(zip(kept,[Seq(row,self.alphabet) for row in rows]))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1464
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def bootstrap(self,matrix=None,delete=[],exclude=[]):
    """Return a bootstrapped matrix.

    matrix  -- mapping taxon -> sequence; self.matrix is used when empty.
    delete  -- taxa to leave out.
    exclude -- character positions to leave out before resampling.

    The matrix is cropped with crop_matrix, then as many columns as the
    cropped matrix has are drawn at random with replacement. The result
    is a dict taxon -> resampled sequence; the sequences are Seq objects
    if the input matrix held Seq objects, plain strings otherwise.
    Returns {} if the matrix is empty or every taxon was deleted, and the
    cropped (empty-sequence) matrix if every character was excluded.
    """
    if not matrix:
        matrix=self.matrix
    if not matrix:
        # nothing to resample at all
        return {}
    first=matrix[list(matrix.keys())[0]]
    seqobjects=isinstance(first,Seq)        # remember if Seq objects
    # Crop the matrix that was asked for; leaving the argument out made
    # crop_matrix fall back to self.matrix regardless of the caller's input.
    cm=self.crop_matrix(matrix=matrix,delete=delete,exclude=exclude)
    if not cm:                              # everything deleted?
        return {}
    elif len(cm[list(cm.keys())[0]])==0:    # everything excluded?
        return cm
    undelete=[t for t in self.taxlabels if t in cm]
    if seqobjects:
        sitesm=list(zip(*[cm[t].tostring() for t in undelete]))
        alphabet=first.alphabet
    else:
        sitesm=list(zip(*[cm[t] for t in undelete]))
    nsites=len(sitesm)
    # draw nsites columns with replacement
    bootstrapsitesm=[sitesm[random.randint(0,nsites-1)] for i in range(nsites)]
    bootstrapseqs=[''.join(row) for row in zip(*bootstrapsitesm)]
    if seqobjects:
        bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
    return dict(zip(undelete,bootstrapseqs))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1486
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def add_sequence(self,name,sequence):
    """Add a sequence to the matrix.

    name     -- taxon label of the new row; must not be empty.
    sequence -- string holding the character states.

    A sequence longer than self.nchar widens the matrix through
    insert_gap at position self.nchar; a shorter one is padded at its end
    with self.missing. The sequence is stored as a Seq, self.ntax is
    incremented and the name is appended to self.taxlabels.

    Raises NexusError if name is empty.
    """
    if not name:
        raise NexusError('New sequence must have a name')
    shortfall=self.nchar-len(sequence)
    if shortfall>0:
        # too short: pad the new sequence
        sequence+=self.missing*shortfall
    elif shortfall<0:
        # too long: widen the existing matrix instead
        self.insert_gap(self.nchar,-shortfall)
    # NOTE(review): a name that already exists is overwritten in the
    # matrix while ntax and taxlabels still grow - confirm this is intended.
    self.matrix[name]=Seq(sequence,self.alphabet)
    self.ntax+=1
    self.taxlabels.append(name)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1501
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def insert_gap(self,pos,n=1,leftgreedy=False):
    """Add a gap into the matrix and adjust charsets and partitions.

    pos=0: first position
    pos=nchar: last position

    pos        -- column index in front of which the gap columns go.
    n          -- number of gap columns to insert; 0 is a no-op.
    leftgreedy -- if True, a character set containing pos always takes
                  the new columns in; otherwise only a set that also
                  contains pos-1 does.

    Rebuilds self.matrix (as Seq objects, for the taxa in self.taxlabels),
    increases self.nchar by n, shifts self.charsets and
    self.charpartitions, and replaces self.charlabels with the result of
    _adjust_charlabels, which is also the return value. Returns None when
    n is 0.

    Raises NexusError if pos is outside 0..nchar.
    """

    def _adjust(charset,x,d,leftgreedy=False):
        """Adjust a character set (sorted in place) for d gaps inserted at x,
        taking care of new gaps within a coherent character set."""
        # Inserting 3 gaps at pos. 9 into 1 2 3 8 9 10 11 13 14 15
        # gives 1 2 3 8 9 10 11 12 13 14 16 17 18 (8 and 9 were adjacent,
        # so the new positions join the set), whereas inserting at pos. 8
        # gives 1 2 3 11 12 13 14 16 17 18 (7 is not in the set).
        charset.sort()
        # None means "nothing to add"; 0 is a legitimate insertion index,
        # so it must not double as the sentinel.
        addpos=None
        for i,c in enumerate(charset):
            if c>=x:
                charset[i]=c+d
            # if we add gaps within a group of characters, we want the gap position included in this group
            if c==x:
                if leftgreedy or (i>0 and charset[i-1]==c-1):
                    addpos=i
        if addpos is not None:
            charset[addpos:addpos]=range(x,x+d)
        return charset

    if pos<0 or pos>self.nchar:
        raise NexusError('Illegal gap position: %d' % pos)
    if n==0:
        return
    # work on columns: one tuple of states per site
    sitesm=list(zip(*[self.matrix[t].tostring() for t in self.taxlabels]))
    # NOTE(review): the gap symbol is hard-coded here rather than taken
    # from self.gap - confirm whether that is intended.
    sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
    # transpose back into one string per taxon
    mapped=[''.join(row) for row in zip(*sitesm)]
    self.matrix=dict([(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)])
    self.nchar+=n
    # now adjust character sets
    for i,s in self.charsets.items():
        self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
    for p in self.charpartitions:
        for sp,s in self.charpartitions[p].items():
            self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
    # now adjust character state labels
    self.charlabels=self._adjust_charlabels(insert=[pos]*n)
    return self.charlabels
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1550
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1551 def _adjust_charlabels(self,exclude=None,insert=None):
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1552 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1553 if exclude and insert:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1554 raise NexusError, 'Can\'t exclude and insert at the same time'
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1555 if not self.charlabels:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1556 return None
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1557 labels=self.charlabels.keys()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1558 labels.sort()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1559 newcharlabels={}
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1560 if exclude:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1561 exclude.sort()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1562 exclude.append(sys.maxint)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1563 excount=0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1564 for c in labels:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1565 if not c in exclude:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1566 while c>exclude[excount]:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1567 excount+=1
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1568 newcharlabels[c-excount]=self.charlabels[c]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1569 elif insert:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1570 insert.sort()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1571 insert.append(sys.maxint)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1572 icount=0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1573 for c in labels:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1574 while c>=insert[icount]:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1575 icount+=1
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1576 newcharlabels[c+icount]=self.charlabels[c]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1577 else:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1578 return self.charlabels
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1579 return newcharlabels
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1580
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def invert(self,charlist):
    """Return all character indices (0 .. nchar-1) that are not in charlist."""
    remaining=[]
    for index in range(self.nchar):
        if index in charlist:
            continue
        remaining.append(index)
    return remaining
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1584
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def gaponly(self,include_missing=False):
    """Return gap-only sites.

    include_missing -- if True, the missing symbol counts like a gap.

    Returns the list of column indices at which every taxon in
    self.taxlabels carries only the gap symbol (or gap/missing symbols).
    """
    # built-in set instead of the deprecated sets.Set
    gap=set(self.gap)
    if include_missing:
        gap.add(self.missing)
    # one tuple of states per column
    sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
    return [i for i,site in enumerate(sitesm) if set(site).issubset(gap)]
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1593
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def terminal_gap_to_missing(self,missing=None,skip_n=True):
    """Replace all terminal gaps with the missing character.

    Mixtures like ???------??------- are properly resolved.

    missing -- replacement symbol; self.missing is used when empty.
    skip_n  -- if False, 'n' and 'N' are added to the symbols handed to
               get_start_end as skiplist.

    Every sequence in self.matrix (for the taxa in self.taxlabels) is
    replaced by a new Seq of the same length.
    """
    if not missing:
        missing=self.missing
    skiplist=[self.missing,self.gap]
    if not skip_n:
        skiplist+=['n','N']
    for taxon in self.taxlabels:
        original=self.matrix[taxon].tostring()
        length=len(original)
        # presumably the indices of the first and last symbol that is not
        # in skiplist - verify against get_start_end
        start,end=get_start_end(original,skiplist=skiplist)
        # overwrite the tail behind end first, then the head in front of start
        patched=original[:end+1]+missing*(length-end-1)
        patched=start*missing+patched[start:]
        assert length==len(patched), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon
        self.matrix[taxon]=Seq(patched,self.alphabet)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1612