0
|
1 # Nexus.py - a NEXUS parser
|
|
2 #
|
|
3 # Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved.
|
|
4 # This code is part of the Biopython distribution and governed by its
|
|
5 # license. Please see the LICENSE file that should have been included
|
|
6 # as part of this package.
|
|
7 #
|
|
8 # Bug reports welcome: fkauff@duke.edu
|
|
9 #
|
|
10
|
|
11 """Parse the contents of a nexus file.
|
|
12 Based upon 'NEXUS: An extensible file format for systematic information'
|
|
13 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
|
|
14
|
|
15 Authors: Frank Kauff and Cymon J. Cox
|
|
16 """
|
|
17
|
|
18 import os,sys, math, random, copy
|
|
19 import sets
|
|
20
|
|
21 # --- Changes from Bio.Nexus ---
|
|
22 # Renamed Nexus.py to __init__.py. Helps with api documentation.
|
|
23 # One further change in file, tagged with 'GEC'
|
|
24 #from Bio.Alphabet import IUPAC
|
|
25 #from Bio.Data import IUPACData
|
|
26 #from Bio.Seq import Seq
|
|
27
|
|
28
|
|
29 from corebio.seq import Seq, Alphabet, protein_alphabet
|
|
30 import corebio.data as data
|
|
31 from corebio.utils import Struct
|
|
32
|
|
33 IUPACData = Struct(
|
|
34 ambiguous_dna_letters = data.dna_extended_letters,
|
|
35 ambiguous_rna_letters = data.rna_extended_letters,
|
|
36 ambiguous_dna_values = data.dna_ambiguity,
|
|
37 ambiguous_rna_values = data.rna_ambiguity,
|
|
38 protein_letters = data.amino_acid_letters,
|
|
39 unambiguous_dna_letters = data.dna_letters,
|
|
40 unambiguous_rna_letters = data.rna_letters,
|
|
41 )
|
|
42
|
|
43 IUPAC = Struct(
|
|
44 ambiguous_dna = Alphabet(IUPACData.ambiguous_dna_letters+'-?'),
|
|
45 ambiguous_rna = Alphabet(IUPACData.ambiguous_rna_letters+'-?'),
|
|
46 protein = protein_alphabet #Alphabet(IUPACData.protein_letters+'-?')
|
|
47 )
|
|
48
|
|
49
|
|
50
|
|
51 # End Changes
|
|
52
|
|
53
|
|
54 from Trees import Tree,NodeData
|
|
55
|
|
56 C = False
|
|
57
|
|
58 #try:
|
|
59 # import cnexus
|
|
60 #except ImportError:
|
|
61 # C=False
|
|
62 #else:
|
|
63 # C=True
|
|
64
|
|
65 INTERLEAVE=70
|
|
66 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
|
|
67 'matrix','tree', 'utree','translate']
|
|
68 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets']
|
|
69 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
|
|
70 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
|
|
71 WHITESPACE=' \t\n'
|
|
72 #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments
|
|
73 SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored
|
|
74 CHARSET='chars'
|
|
75 TAXSET='taxa'
|
|
76
|
|
77 class NexusError(Exception): pass
|
|
78
|
|
79 class CharBuffer:
|
|
80 """Helps reading NEXUS-words and characters from a buffer."""
|
|
81 def __init__(self,string):
|
|
82 if string:
|
|
83 self.buffer=list(string)
|
|
84 else:
|
|
85 self.buffer=[]
|
|
86
|
|
87 def peek(self):
|
|
88 if self.buffer:
|
|
89 return self.buffer[0]
|
|
90 else:
|
|
91 return None
|
|
92
|
|
93 def peek_nonwhitespace(self):
|
|
94 b=''.join(self.buffer).strip()
|
|
95 if b:
|
|
96 return b[0]
|
|
97 else:
|
|
98 return None
|
|
99
|
|
100 def next(self):
|
|
101 if self.buffer:
|
|
102 return self.buffer.pop(0)
|
|
103 else:
|
|
104 return None
|
|
105
|
|
106 def next_nonwhitespace(self):
|
|
107 while True:
|
|
108 p=self.next()
|
|
109 if p is None:
|
|
110 break
|
|
111 if p not in WHITESPACE:
|
|
112 return p
|
|
113 return None
|
|
114
|
|
115 def skip_whitespace(self):
|
|
116 while self.buffer[0] in WHITESPACE:
|
|
117 self.buffer=self.buffer[1:]
|
|
118
|
|
119 def next_until(self,target):
|
|
120 for t in target:
|
|
121 try:
|
|
122 pos=self.buffer.index(t)
|
|
123 except ValueError:
|
|
124 pass
|
|
125 else:
|
|
126 found=''.join(self.buffer[:pos])
|
|
127 self.buffer=self.buffer[pos:]
|
|
128 return found
|
|
129 else:
|
|
130 return None
|
|
131
|
|
132 def peek_word(self,word):
|
|
133 return ''.join(self.buffer[:len(word)])==word
|
|
134
|
|
135 def next_word(self):
|
|
136 """Return the next NEXUS word from a string, dealing with single and double quotes,
|
|
137 whitespace and punctuation.
|
|
138 """
|
|
139
|
|
140 word=[]
|
|
141 quoted=False
|
|
142 first=self.next_nonwhitespace() # get first character
|
|
143 if not first: # return empty if only whitespace left
|
|
144 return None
|
|
145 word.append(first)
|
|
146 if first=="'": # word starts with a quote
|
|
147 quoted=True
|
|
148 elif first in PUNCTUATION: # if it's punctuation, return immediately
|
|
149 return first
|
|
150 while True:
|
|
151 c=self.peek()
|
|
152 if c=="'": # a quote?
|
|
153 word.append(self.next()) # store quote
|
|
154 if self.peek()=="'": # double quote
|
|
155 skip=self.next() # skip second quote
|
|
156 elif quoted: # second single quote ends word
|
|
157 break
|
|
158 elif quoted:
|
|
159 word.append(self.next()) # if quoted, then add anything
|
|
160 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop
|
|
161 break
|
|
162 else:
|
|
163 word.append(self.next()) # standard character
|
|
164 return ''.join(word)
|
|
165
|
|
166 def rest(self):
|
|
167 """Return the rest of the string without parsing."""
|
|
168 return ''.join(self.buffer)
|
|
169
|
|
170 class StepMatrix:
|
|
171 """Calculate a stepmatrix for weighted parsimony.
|
|
172 See Wheeler (1990), Cladistics 6:269-275.
|
|
173 """
|
|
174
|
|
175 def __init__(self,symbols,gap):
|
|
176 self.data={}
|
|
177 self.symbols=[s for s in symbols]
|
|
178 self.symbols.sort()
|
|
179 if gap:
|
|
180 self.symbols.append(gap)
|
|
181 for x in self.symbols:
|
|
182 for y in [s for s in self.symbols if s!=x]:
|
|
183 self.set(x,y,0)
|
|
184
|
|
185 def set(self,x,y,value):
|
|
186 if x>y:
|
|
187 x,y=y,x
|
|
188 self.data[x+y]=value
|
|
189
|
|
190 def add(self,x,y,value):
|
|
191 if x>y:
|
|
192 x,y=y,x
|
|
193 self.data[x+y]+=value
|
|
194
|
|
195 def sum(self):
|
|
196 return reduce(lambda x,y:x+y,self.data.values())
|
|
197
|
|
198 def transformation(self):
|
|
199 total=self.sum()
|
|
200 if total!=0:
|
|
201 for k in self.data:
|
|
202 self.data[k]=self.data[k]/float(total)
|
|
203 return self
|
|
204
|
|
205 def weighting(self):
|
|
206 for k in self.data:
|
|
207 if self.data[k]!=0:
|
|
208 self.data[k]=-math.log(self.data[k])
|
|
209 return self
|
|
210
|
|
211 def smprint(self,name='your_name_here'):
|
|
212 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
|
|
213 matrix+=' %s\n' % ' '.join(self.symbols)
|
|
214 for x in self.symbols:
|
|
215 matrix+='[%s]'.ljust(8) % x
|
|
216 for y in self.symbols:
|
|
217 if x==y:
|
|
218 matrix+=' . '
|
|
219 else:
|
|
220 if x>y:
|
|
221 x1,y1=y,x
|
|
222 else:
|
|
223 x1,y1=x,y
|
|
224 if self.data[x1+y1]==0:
|
|
225 matrix+='inf. '
|
|
226 else:
|
|
227 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
|
|
228 matrix+='\n'
|
|
229 matrix+=';\n'
|
|
230 return matrix
|
|
231
|
|
232 def safename(name,mrbayes=False):
|
|
233 """Return a taxon identifier according to NEXUS standard.
|
|
234 Wrap quotes around names with punctuation or whitespace, and double single quotes.
|
|
235 mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes.
|
|
236 """
|
|
237 if mrbayes:
|
|
238 safe=name.replace(' ','_')
|
|
239 safe=''.join([c for c in safe if c in MRBAYESSAFE])
|
|
240 else:
|
|
241 safe=name.replace("'","''")
|
|
242 if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)):
|
|
243 safe="'"+safe+"'"
|
|
244 return safe
|
|
245
|
|
246 def quotestrip(word):
|
|
247 """Remove quotes and/or double quotes around identifiers."""
|
|
248 if not word:
|
|
249 return None
|
|
250 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
|
|
251 word=word[1:-1]
|
|
252 return word
|
|
253
|
|
254 def get_start_end(sequence, skiplist=['-','?']):
|
|
255 """Return position of first and last character which is not in skiplist (defaults to ['-','?'])."""
|
|
256
|
|
257 length=len(sequence)
|
|
258 if length==0:
|
|
259 return None,None
|
|
260 end=length-1
|
|
261 while end>=0 and (sequence[end] in skiplist):
|
|
262 end-=1
|
|
263 start=0
|
|
264 while start<length and (sequence[start] in skiplist):
|
|
265 start+=1
|
|
266 return start,end
|
|
267
|
|
268 def _sort_keys_by_values(p):
|
|
269 """Returns a sorted list of keys of p sorted by values of p."""
|
|
270 startpos=[(p[pn],pn) for pn in p if p[pn]]
|
|
271 startpos.sort()
|
|
272 return zip(*startpos)[1]
|
|
273
|
|
274 def _make_unique(l):
|
|
275 """Check that all values in list are unique and return a pruned and sorted list."""
|
|
276 l=list(sets.Set(l))
|
|
277 l.sort()
|
|
278 return l
|
|
279
|
|
280 def _seqmatrix2strmatrix(matrix):
|
|
281 """Converts a Seq-object matrix to a plain sequence-string matrix."""
|
|
282 return dict([(t,matrix[t].tostring()) for t in matrix])
|
|
283
|
|
284 def _compact4nexus(orig_list):
|
|
285 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
|
|
286 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
|
|
287 """
|
|
288
|
|
289 if not orig_list:
|
|
290 return ''
|
|
291 orig_list=list(sets.Set(orig_list))
|
|
292 orig_list.sort()
|
|
293 shortlist=[]
|
|
294 clist=orig_list[:]
|
|
295 clist.append(clist[-1]+.5) # dummy value makes it easier
|
|
296 while len(clist)>1:
|
|
297 step=1
|
|
298 for i,x in enumerate(clist):
|
|
299 if x==clist[0]+i*step: # are we still in the right step?
|
|
300 continue
|
|
301 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
|
|
302 # second element, and possibly at least 3 elements to link,
|
|
303 # and the next one is in the right step
|
|
304 step=x-clist[0]
|
|
305 else: # pattern broke, add all values before current position to new list
|
|
306 sub=clist[:i]
|
|
307 if len(sub)==1:
|
|
308 shortlist.append(str(sub[0]+1))
|
|
309 else:
|
|
310 if step==1:
|
|
311 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
|
|
312 else:
|
|
313 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
|
|
314 clist=clist[i:]
|
|
315 break
|
|
316 return ' '.join(shortlist)
|
|
317
|
|
318 def combine(matrices):
|
|
319 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
|
|
320
|
|
321 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
|
|
322 Character sets, character partitions and taxon sets are prefixed, readjusted and present in
|
|
323 the combined matrix.
|
|
324 """
|
|
325
|
|
326 if not matrices:
|
|
327 return None
|
|
328 name=matrices[0][0]
|
|
329 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix
|
|
330 mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1)
|
|
331 if mixed_datatypes:
|
|
332 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself!
|
|
333 # raise NexusError, 'Matrices must be of same datatype'
|
|
334 combined.charlabels=None
|
|
335 combined.statelabels=None
|
|
336 combined.interleave=False
|
|
337 combined.translate=None
|
|
338
|
|
339 # rename taxon sets and character sets and name them with prefix
|
|
340 for cn,cs in combined.charsets.items():
|
|
341 combined.charsets['%s.%s' % (name,cn)]=cs
|
|
342 del combined.charsets[cn]
|
|
343 for tn,ts in combined.taxsets.items():
|
|
344 combined.taxsets['%s.%s' % (name,tn)]=ts
|
|
345 del combined.taxsets[tn]
|
|
346 # previous partitions usually don't make much sense in combined matrix
|
|
347 # just initiate one new partition parted by single matrices
|
|
348 combined.charpartitions={'combined':{name:range(combined.nchar)}}
|
|
349 for n,m in matrices[1:]: # add all other matrices
|
|
350 both=[t for t in combined.taxlabels if t in m.taxlabels]
|
|
351 combined_only=[t for t in combined.taxlabels if t not in both]
|
|
352 m_only=[t for t in m.taxlabels if t not in both]
|
|
353 for t in both:
|
|
354 # concatenate sequences and unify gap and missing character symbols
|
|
355 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
|
|
356 # replace date of missing taxa with symbol for missing data
|
|
357 for t in combined_only:
|
|
358 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
|
|
359 for t in m_only:
|
|
360 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
|
|
361 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
|
|
362 combined.taxlabels.extend(m_only) # new taxon list
|
|
363 for cn,cs in m.charsets.items(): # adjust character sets for new matrix
|
|
364 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
|
|
365 if m.taxsets:
|
|
366 if not combined.taxsets:
|
|
367 combined.taxsets={}
|
|
368 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets
|
|
369 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition
|
|
370 # update charlabels
|
|
371 if m.charlabels:
|
|
372 if not combined.charlabels:
|
|
373 combined.charlabels={}
|
|
374 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()]))
|
|
375 combined.nchar+=m.nchar # update nchar and ntax
|
|
376 combined.ntax+=len(m_only)
|
|
377
|
|
378 return combined
|
|
379
|
|
380 def _kill_comments_and_break_lines(text):
|
|
381 """Delete []-delimited comments out of a file and break into lines separated by ';'.
|
|
382
|
|
383 stripped_text=_kill_comments_and_break_lines(text):
|
|
384 Nested and multiline comments are allowed. [ and ] symbols within single
|
|
385 or double quotes are ignored, newline ends a quote, all symbols with quotes are
|
|
386 treated the same (thus not quoting inside comments like [this character ']' ends a comment])
|
|
387 Special [&...] and [\...] comments remain untouched, if not inside standard comment.
|
|
388 Quotes inside special [& and [\ are treated as normal characters,
|
|
389 but no nesting inside these special comments allowed (like [& [\ ]]).
|
|
390 ';' ist deleted from end of line.
|
|
391
|
|
392 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus
|
|
393 """
|
|
394 contents=CharBuffer(text)
|
|
395 newtext=[]
|
|
396 newline=[]
|
|
397 quotelevel=''
|
|
398 speciallevel=False
|
|
399 commlevel=0
|
|
400 while True:
|
|
401 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character
|
|
402 #if not plain:
|
|
403 # newline.append(contents.rest) # not found, just add the rest
|
|
404 # break
|
|
405 #newline.append(plain) # add intermediate text
|
|
406 t=contents.next() # and get special character
|
|
407 if t is None:
|
|
408 break
|
|
409 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation
|
|
410 quotelevel=''
|
|
411 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation
|
|
412 quotelevel=t
|
|
413 elif not quotelevel and t=='[': # opening bracket outside a quote
|
|
414 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel:
|
|
415 speciallevel=True
|
|
416 else:
|
|
417 commlevel+=1
|
|
418 elif not quotelevel and t==']': # closing bracket ioutside a quote
|
|
419 if speciallevel:
|
|
420 speciallevel=False
|
|
421 else:
|
|
422 commlevel-=1
|
|
423 if commlevel<0:
|
|
424 raise NexusError, 'Nexus formatting error: unmatched ]'
|
|
425 continue
|
|
426 if commlevel==0: # copy if we're not in comment
|
|
427 if t==';' and not quotelevel:
|
|
428 newtext.append(''.join(newline))
|
|
429 newline=[]
|
|
430 else:
|
|
431 newline.append(t)
|
|
432 #level of comments should be 0 at the end of the file
|
|
433 if newline:
|
|
434 newtext.append('\n'.join(newline))
|
|
435 if commlevel>0:
|
|
436 raise NexusError, 'Nexus formatting error: unmatched ['
|
|
437 return newtext
|
|
438
|
|
439
|
|
440 def _adjust_lines(lines):
|
|
441 """Adjust linebreaks to match ';', strip leading/trailing whitespace
|
|
442
|
|
443 list_of_commandlines=_adjust_lines(input_text)
|
|
444 Lines are adjusted so that no linebreaks occur within a commandline
|
|
445 (except matrix command line)
|
|
446 """
|
|
447 formatted_lines=[]
|
|
448 for l in lines:
|
|
449 #Convert line endings
|
|
450 l=l.replace('\r\n','\n').replace('\r','\n').strip()
|
|
451 if l.lower().startswith('matrix'):
|
|
452 formatted_lines.append(l)
|
|
453 else:
|
|
454 l=l.replace('\n',' ')
|
|
455 if l:
|
|
456 formatted_lines.append(l)
|
|
457 return formatted_lines
|
|
458
|
|
459 def _replace_parenthesized_ambigs(seq,rev_ambig_values):
|
|
460 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
|
|
461
|
|
462 opening=seq.find('(')
|
|
463 while opening>-1:
|
|
464 closing=seq.find(')')
|
|
465 if closing<0:
|
|
466 raise NexusError, 'Missing closing parenthesis in: '+seq
|
|
467 elif closing<opening:
|
|
468 raise NexusError, 'Missing opening parenthesis in: '+seq
|
|
469 ambig=[x for x in seq[opening+1:closing]]
|
|
470 ambig.sort()
|
|
471 ambig=''.join(ambig)
|
|
472 ambig_code=rev_ambig_values[ambig.upper()]
|
|
473 if ambig!=ambig.upper():
|
|
474 ambig_code=ambig_code.lower()
|
|
475 seq=seq[:opening]+ambig_code+seq[closing+1:]
|
|
476 opening=seq.find('(')
|
|
477 return seq
|
|
478
|
|
479
|
|
480 class Commandline:
|
|
481 """Represent a commandline as command and options."""
|
|
482
|
|
483 def __init__(self, line, title):
|
|
484 self.options={}
|
|
485 options=[]
|
|
486 self.command=None
|
|
487 try:
|
|
488 #Assume matrix (all other command lines have been stripped of \n)
|
|
489 self.command, options = line.strip().split('\n', 1)
|
|
490 except ValueError: #Not matrix
|
|
491 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...)
|
|
492 self.command=line.split()[0]
|
|
493 options=' '.join(line.split()[1:])
|
|
494 self.command = self.command.strip().lower()
|
|
495 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved
|
|
496 self.options=options.strip()
|
|
497 else:
|
|
498 if len(options) > 0:
|
|
499 try:
|
|
500 options = options.replace('=', ' = ').split()
|
|
501 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
|
|
502 indices = []
|
|
503 for sl in valued_indices:
|
|
504 indices.extend(sl)
|
|
505 token_indices = [n for n in range(len(options)) if n not in indices]
|
|
506 for opt in valued_indices:
|
|
507 #self.options[options[opt[0]].lower()] = options[opt[2]].lower()
|
|
508 self.options[options[opt[0]].lower()] = options[opt[2]]
|
|
509 for token in token_indices:
|
|
510 self.options[options[token].lower()] = None
|
|
511 except ValueError:
|
|
512 raise NexusError, 'Incorrect formatting in line: %s' % line
|
|
513
|
|
514 class Block:
|
|
515 """Represent a NEXUS block with block name and list of commandlines ."""
|
|
516 def __init__(self,title=None):
|
|
517 self.title=title
|
|
518 self.commandlines=[]
|
|
519
|
|
520 class Nexus(object):
|
|
521
|
|
522 __slots__=['original_taxon_order','__dict__']
|
|
523
|
|
524 def __init__(self, input=None):
|
|
525 self.ntax=0 # number of taxa
|
|
526 self.nchar=0 # number of characters
|
|
527 self.taxlabels=[] # labels for taxa, ordered by their id
|
|
528 self.charlabels=None # ... and for characters
|
|
529 self.statelabels=None # ... and for states
|
|
530 self.datatype='dna' # (standard), dna, rna, nucleotide, protein
|
|
531 self.respectcase=False # case sensitivity
|
|
532 self.missing='?' # symbol for missing characters
|
|
533 self.gap='-' # symbol for gap
|
|
534 self.symbols=None # set of symbols
|
|
535 self.equate=None # set of symbol synonyms
|
|
536 self.matchchar=None # matching char for matrix representation
|
|
537 self.labels=None # left, right, no
|
|
538 self.transpose=False # whether matrix is transposed
|
|
539 self.interleave=False # whether matrix is interleaved
|
|
540 self.tokens=False # unsupported
|
|
541 self.eliminate=None # unsupported
|
|
542 self.matrix=None # ...
|
|
543 self.unknown_blocks=[] # blocks we don't care about
|
|
544 self.taxsets={}
|
|
545 self.charsets={}
|
|
546 self.charpartitions={}
|
|
547 self.taxpartitions={}
|
|
548 self.trees=[] # list of Trees (instances of tree class)
|
|
549 self.translate=None # Dict to translate taxon <-> taxon numbers
|
|
550 self.structured=[] # structured input representation
|
|
551 self.set={} # dict of the set command to set various options
|
|
552 self.options={} # dict of the options command in the data block
|
|
553
|
|
554 # some defaults
|
|
555 self.options['gapmode']='missing'
|
|
556
|
|
557 if input:
|
|
558 self.read(input)
|
|
559
|
|
560 def get_original_taxon_order(self):
|
|
561 """Included for backwards compatibility."""
|
|
562 return self.taxlabels
|
|
563 def set_original_taxon_order(self,value):
|
|
564 """Included for backwards compatibility."""
|
|
565 self.taxlabels=value
|
|
566 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
|
|
567
|
|
568 def read(self,input):
|
|
569 """Read and parse NEXUS imput (filename, file-handle, string."""
|
|
570
|
|
571 # 1. Assume we have the name of a file in the execution dir
|
|
572 # Note we need to add parsing of the path to dir/filename
|
|
573 try:
|
|
574 file_contents = open(os.path.expanduser(input),'rU').read()
|
|
575 self.filename=input
|
|
576 except (TypeError,IOError,AttributeError):
|
|
577 #2 Assume we have a string from a fh.read()
|
|
578 #if isinstance(input, str):
|
|
579 # file_contents = input
|
|
580 # self.filename='input_string'
|
|
581 #3 Assume we have a file object
|
|
582 if hasattr(input,'read'): # file objects or StringIO objects
|
|
583 file_contents=input.read()
|
|
584 # GEC : Change next line so that StringIO objects work
|
|
585 #if input.name:
|
|
586 if hasattr(input, 'name'):
|
|
587 self.filename=input.name
|
|
588 else:
|
|
589 self.filename='Unknown_nexus_file'
|
|
590 else:
|
|
591 print input.strip()[:6]
|
|
592 raise NexusError, 'Unrecognized input: %s ...' % input[:100]
|
|
593 if C:
|
|
594 decommented=cnexus.scanfile(file_contents)
|
|
595 #check for unmatched parentheses
|
|
596 if decommented=='[' or decommented==']':
|
|
597 raise NexusError, 'Unmatched %s' % decommented
|
|
598 # cnexus can't return lists, so in analogy we separate commandlines with chr(7)
|
|
599 # (a character that shoudn't be part of a nexus file under normal circumstances)
|
|
600 commandlines=_adjust_lines(decommented.split(chr(7)))
|
|
601 else:
|
|
602 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
|
|
603 # get rid of stupid 'NEXUS token'
|
|
604 try:
|
|
605 if commandlines[0][:6].upper()=='#NEXUS':
|
|
606 commandlines[0]=commandlines[0][6:].strip()
|
|
607 except:
|
|
608 pass
|
|
609 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands
|
|
610 nexus_block_gen = self._get_nexus_block(commandlines)
|
|
611 while 1:
|
|
612 try:
|
|
613 title, contents = nexus_block_gen.next()
|
|
614 except StopIteration:
|
|
615 break
|
|
616 if title in KNOWN_NEXUS_BLOCKS:
|
|
617 self._parse_nexus_block(title, contents)
|
|
618 else:
|
|
619 self._unknown_nexus_block(title, contents)
|
|
620
|
|
621 def _get_nexus_block(self,file_contents):
|
|
622 """Generator for looping through Nexus blocks."""
|
|
623 inblock=False
|
|
624 blocklines=[]
|
|
625 while file_contents:
|
|
626 cl=file_contents.pop(0)
|
|
627 if cl.lower().startswith('begin'):
|
|
628 if not inblock:
|
|
629 inblock=True
|
|
630 title=cl.split()[1].lower()
|
|
631 else:
|
|
632 raise NexusError('Illegal block nesting in block %s' % title)
|
|
633 elif cl.lower().startswith('end'):
|
|
634 if inblock:
|
|
635 inblock=False
|
|
636 yield title,blocklines
|
|
637 blocklines=[]
|
|
638 else:
|
|
639 raise NexusError('Unmatched \'end\'.')
|
|
640 elif inblock:
|
|
641 blocklines.append(cl)
|
|
642
|
|
643 def _unknown_nexus_block(self,title, contents):
|
|
644 block = Block()
|
|
645 block.commandlines.append(contents)
|
|
646 block.title = title
|
|
647 self.unknown_blocks.append(block)
|
|
648
|
|
649 def _parse_nexus_block(self,title, contents):
|
|
650 """Parse a known Nexus Block """
|
|
651 # attached the structered block representation
|
|
652 self._apply_block_structure(title, contents)
|
|
653 #now check for taxa,characters,data blocks. If this stuff is defined more than once
|
|
654 #the later occurences will override the previous ones.
|
|
655 block=self.structured[-1]
|
|
656 for line in block.commandlines:
|
|
657 try:
|
|
658 getattr(self,'_'+line.command)(line.options)
|
|
659 except AttributeError:
|
|
660 raise
|
|
661 raise NexusError, 'Unknown command: %s ' % line.command
|
|
662
|
|
663 def _dimensions(self,options):
|
|
664 if options.has_key('ntax'):
|
|
665 self.ntax=eval(options['ntax'])
|
|
666 if options.has_key('nchar'):
|
|
667 self.nchar=eval(options['nchar'])
|
|
668
|
|
669 def _format(self,options):
|
|
670 # print options
|
|
671 # we first need to test respectcase, then symbols (which depends on respectcase)
|
|
672 # then datatype (which, if standard, depends on symbols and respectcase in order to generate
|
|
673 # dicts for ambiguous values and alphabet
|
|
674 if options.has_key('respectcase'):
|
|
675 self.respectcase=True
|
|
676 # adjust symbols to for respectcase
|
|
677 if options.has_key('symbols'):
|
|
678 self.symbols=options['symbols']
|
|
679 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\
|
|
680 (self.symbold.startswith("'") and self.symbols.endswith("'")):
|
|
681 self.symbols=self.symbols[1:-1].replace(' ','')
|
|
682 if not self.respectcase:
|
|
683 self.symbols=self.symbols.lower()+self.symbols.upper()
|
|
684 self.symbols=list(sets.Set(self.symbols))
|
|
685 if options.has_key('datatype'):
|
|
686 self.datatype=options['datatype'].lower()
|
|
687 if self.datatype=='dna' or self.datatype=='nucleotide':
|
|
688 self.alphabet=IUPAC.ambiguous_dna
|
|
689 self.ambiguous_values=IUPACData.ambiguous_dna_values
|
|
690 self.unambiguous_letters=IUPACData.unambiguous_dna_letters
|
|
691 elif self.datatype=='rna':
|
|
692 self.alphabet=IUPAC.ambiguous_rna
|
|
693 self.ambiguous_values=IUPACData.ambiguous_rna_values
|
|
694 self.unambiguous_letters=IUPACData.unambiguous_rna_letters
|
|
695 elif self.datatype=='protein':
|
|
696 self.alphabet=IUPAC.protein
|
|
697 self.ambiguous_values={'B':'DN','Z':'EQ','X':IUPACData.protein_letters} # that's how PAUP handles it
|
|
698 self.unambiguous_letters=IUPACData.protein_letters+'*' # stop-codon
|
|
699 elif self.datatype=='standard':
|
|
700 raise NexusError('Datatype standard is not yet supported.')
|
|
701 #self.alphabet=None
|
|
702 #self.ambiguous_values={}
|
|
703 #if not self.symbols:
|
|
704 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states
|
|
705 #self.unambiguous_letters=self.symbols
|
|
706 else:
|
|
707 raise NexusError, 'Unsupported datatype: '+self.datatype
|
|
708 self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters
|
|
709 if not self.respectcase:
|
|
710 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper()
|
|
711 #we have to sort the reverse ambig coding dict key characters:
|
|
712 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N'
|
|
713 rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X'])
|
|
714 self.rev_ambiguous_values={}
|
|
715 for (k,v) in rev.items():
|
|
716 key=[c for c in k]
|
|
717 key.sort()
|
|
718 self.rev_ambiguous_values[''.join(key)]=v
|
|
719 #overwrite symbols for datype rna,dna,nucleotide
|
|
720 if self.datatype in ['dna','rna','nucleotide']:
|
|
721 self.symbols=self.alphabet.letters
|
|
722 if self.missing not in self.ambiguous_values:
|
|
723 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap
|
|
724 self.ambiguous_values[self.gap]=self.gap
|
|
725 elif self.datatype=='standard':
|
|
726 if not self.symbols:
|
|
727 self.symbols=['1','0']
|
|
728 if options.has_key('missing'):
|
|
729 self.missing=options['missing'][0]
|
|
730 if options.has_key('gap'):
|
|
731 self.gap=options['gap'][0]
|
|
732 if options.has_key('equate'):
|
|
733 self.equate=options['equate']
|
|
734 if options.has_key('matchchar'):
|
|
735 self.matchchar=options['matchchar'][0]
|
|
736 if options.has_key('labels'):
|
|
737 self.labels=options['labels']
|
|
738 if options.has_key('transpose'):
|
|
739 raise NexusError, 'TRANSPOSE is not supported!'
|
|
740 self.transpose=True
|
|
741 if options.has_key('interleave'):
|
|
742 if options['interleave']==None or options['interleave'].lower()=='yes':
|
|
743 self.interleave=True
|
|
744 if options.has_key('tokens'):
|
|
745 self.tokens=True
|
|
746 if options.has_key('notokens'):
|
|
747 self.tokens=False
|
|
748
|
|
749
|
|
750 def _set(self,options):
|
|
751 self.set=options;
|
|
752
|
|
753 def _options(self,options):
|
|
754 self.options=options;
|
|
755
|
|
756 def _eliminate(self,options):
|
|
757 self.eliminate=options
|
|
758
|
|
759 def _taxlabels(self,options):
|
|
760 """Get taxon labels."""
|
|
761 self.taxlabels=[]
|
|
762 opts=CharBuffer(options)
|
|
763 while True:
|
|
764 taxon=quotestrip(opts.next_word())
|
|
765 if not taxon:
|
|
766 break
|
|
767 self.taxlabels.append(taxon)
|
|
768
|
|
769 def _check_taxlabels(self,taxon):
|
|
770 """Check for presence of taxon in self.taxlabels."""
|
|
771 # According to NEXUS standard, underscores shall be treated as spaces...,
|
|
772 # so checking for identity is more difficult
|
|
773 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
|
|
774 nexid=taxon.replace(' ','_')
|
|
775 return nextaxa.get(nexid)
|
|
776
|
|
777 def _charlabels(self,options):
|
|
778 self.charlabels={}
|
|
779 opts=CharBuffer(options)
|
|
780 while True:
|
|
781 try:
|
|
782 # get id and state
|
|
783 w=opts.next_word()
|
|
784 if w is None: # McClade saves and reads charlabel-lists with terminal comma?!
|
|
785 break
|
|
786 identifier=self._resolve(w,set_type=CHARSET)
|
|
787 state=quotestrip(opts.next_word())
|
|
788 self.charlabels[identifier]=state
|
|
789 # check for comma or end of command
|
|
790 c=opts.next_nonwhitespace()
|
|
791 if c is None:
|
|
792 break
|
|
793 elif c!=',':
|
|
794 raise NexusError,'Missing \',\' in line %s.' % options
|
|
795 except NexusError:
|
|
796 raise
|
|
797 except:
|
|
798 raise NexusError,'Format error in line %s.' % options
|
|
799
|
|
800 def _charstatelabels(self,options):
|
|
801 # warning: charstatelabels supports only charlabels-syntax!
|
|
802 self._charlabels(options)
|
|
803
|
|
804 def _statelabels(self,options):
|
|
805 #self.charlabels=options
|
|
806 #print 'Command statelabels is not supported and will be ignored.'
|
|
807 pass
|
|
808
|
|
809 def _matrix(self,options):
|
|
810 if not self.ntax or not self.nchar:
|
|
811 raise NexusError,'Dimensions must be specified before matrix!'
|
|
812 taxlabels_present=(self.taxlabels!=[])
|
|
813 self.matrix={}
|
|
814 taxcount=0
|
|
815 block_interleave=0
|
|
816 #eliminate empty lines and leading/trailing whitespace
|
|
817 lines=[l.strip() for l in options.split('\n') if l.strip()<>'']
|
|
818 lineiter=iter(lines)
|
|
819 while 1:
|
|
820 try:
|
|
821 l=lineiter.next()
|
|
822 except StopIteration:
|
|
823 if taxcount<self.ntax:
|
|
824 raise NexusError, 'Not enough taxa in matrix.'
|
|
825 elif taxcount>self.ntax:
|
|
826 raise NexusError, 'Too many taxa in matrix.'
|
|
827 else:
|
|
828 break
|
|
829 # count the taxa and check for interleaved matrix
|
|
830 taxcount+=1
|
|
831 ##print taxcount
|
|
832 if taxcount>self.ntax:
|
|
833 if not self.interleave:
|
|
834 raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?'
|
|
835 else:
|
|
836 taxcount=1
|
|
837 block_interleave=1
|
|
838 #get taxon name and sequence
|
|
839 linechars=CharBuffer(l)
|
|
840 id=quotestrip(linechars.next_word())
|
|
841 l=linechars.rest().strip()
|
|
842 if taxlabels_present and not self._check_taxlabels(id):
|
|
843 raise NexusError,'Taxon '+id+' not found in taxlabels.'
|
|
844 chars=''
|
|
845 if self.interleave:
|
|
846 #interleaved matrix
|
|
847 #print 'In interleave'
|
|
848 if l:
|
|
849 chars=''.join(l.split())
|
|
850 else:
|
|
851 chars=''.join(lineiter.next().split())
|
|
852 else:
|
|
853 #non-interleaved matrix
|
|
854 chars=''.join(l.split())
|
|
855 while len(chars)<self.nchar:
|
|
856 l=lineiter.next()
|
|
857 chars+=''.join(l.split())
|
|
858 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
|
|
859 #first taxon has the reference sequence if matchhar is used
|
|
860 if taxcount==1:
|
|
861 refseq=iupac_seq
|
|
862 else:
|
|
863 if self.matchchar:
|
|
864 while 1:
|
|
865 p=iupac_seq.tostring().find(self.matchchar)
|
|
866 if p==-1:
|
|
867 break
|
|
868 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
|
|
869 #check for invalid characters
|
|
870 for i,c in enumerate(iupac_seq.tostring()):
|
|
871 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
|
|
872 raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\
|
|
873 % (id,c,l[i-10:i+10])
|
|
874 #add sequence to matrix
|
|
875 if block_interleave==0:
|
|
876 while self.matrix.has_key(id):
|
|
877 if id.split('.')[-1].startswith('copy'):
|
|
878 id='.'.join(id.split('.')[:-1])+'.copy'+str(eval('0'+id.split('.')[-1][4:])+1)
|
|
879 else:
|
|
880 id+='.copy'
|
|
881 #raise NexusError, id+' already in matrix!\nError in: '+l
|
|
882 self.matrix[id]=iupac_seq
|
|
883 # add taxon name only if taxlabels is not alredy present
|
|
884 if not taxlabels_present:
|
|
885 self.taxlabels.append(id)
|
|
886 else:
|
|
887 taxon_present=self._check_taxlabels(id)
|
|
888 if taxon_present:
|
|
889 self.matrix[taxon_present]+=iupac_seq
|
|
890 else:
|
|
891 raise NexusError, 'Taxon %s not in first block of interleaved matrix.' % id
|
|
892 #check all sequences for length according to nchar
|
|
893 for taxon in self.matrix:
|
|
894 if len(self.matrix[taxon])!=self.nchar:
|
|
895 raise NexusError,'Nchar ('+str(self.nchar)+') does not match data for taxon '+taxon
|
|
896
|
|
897 def _translate(self,options):
|
|
898 self.translate={}
|
|
899 opts=CharBuffer(options)
|
|
900 while True:
|
|
901 try:
|
|
902 # get id and state
|
|
903 identifier=int(opts.next_word())
|
|
904 label=quotestrip(opts.next_word())
|
|
905 self.translate[identifier]=label
|
|
906 # check for comma or end of command
|
|
907 c=opts.next_nonwhitespace()
|
|
908 if c is None:
|
|
909 break
|
|
910 elif c!=',':
|
|
911 raise NexusError,'Missing \',\' in line %s.' % options
|
|
912 except NexusError:
|
|
913 raise
|
|
914 except:
|
|
915 raise NexusError,'Format error in line %s.' % options
|
|
916
|
|
917 def _utree(self,options):
|
|
918 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
|
|
919 self._tree(options)
|
|
920
|
|
921 def _tree(self,options):
|
|
922 opts=CharBuffer(options)
|
|
923 name=opts.next_word()
|
|
924 if opts.next_nonwhitespace()!='=':
|
|
925 raise NexusError,'Syntax error in tree description: %s' % options[:50]
|
|
926 rooted=False
|
|
927 weight=1.0
|
|
928 while opts.peek_nonwhitespace()=='[':
|
|
929 open=opts.next_nonwhitespace()
|
|
930 symbol=opts.next()
|
|
931 if symbol!='&':
|
|
932 raise NexusError,'Illegal special comment [%s...] in tree description: %s' % (symbol, options[:50])
|
|
933 special=opts.next()
|
|
934 value=opts.next_until(']')
|
|
935 closing=opts.next()
|
|
936 if special=='R':
|
|
937 rooted=True
|
|
938 elif special=='U':
|
|
939 rooted=False
|
|
940 elif special=='W':
|
|
941 weight=float(value)
|
|
942 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip())
|
|
943 # if there's an active translation table, translate
|
|
944 if self.translate:
|
|
945 for n in tree.get_terminals():
|
|
946 try:
|
|
947 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)])
|
|
948 except (ValueError,KeyError):
|
|
949 raise NexusError,'Unable to substitue %s using \'translate\' data.' % tree.node(n).data.taxon
|
|
950 self.trees.append(tree)
|
|
951
|
|
952 def _apply_block_structure(self,title,lines):
|
|
953 block=Block('')
|
|
954 block.title = title
|
|
955 for line in lines:
|
|
956 block.commandlines.append(Commandline(line, title))
|
|
957 self.structured.append(block)
|
|
958
|
|
959 def _taxset(self, options):
|
|
960 name,taxa=self._get_indices(options,set_type=TAXSET)
|
|
961 self.taxsets[name]=_make_unique(taxa)
|
|
962
|
|
963 def _charset(self, options):
|
|
964 name,sites=self._get_indices(options,set_type=CHARSET)
|
|
965 self.charsets[name]=_make_unique(sites)
|
|
966
|
|
967 def _taxpartition(self, options):
|
|
968 taxpartition={}
|
|
969 quotelevel=False
|
|
970 opts=CharBuffer(options)
|
|
971 name=self._name_n_vector(opts)
|
|
972 if not name:
|
|
973 raise NexusError, 'Formatting error in taxpartition: %s ' % options
|
|
974 # now collect thesubbpartitions and parse them
|
|
975 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
|
|
976 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words
|
|
977 sub=''
|
|
978 while True:
|
|
979 w=opts.next()
|
|
980 if w is None or (w==',' and not quotelevel):
|
|
981 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
|
|
982 taxpartition[subname]=_make_unique(subindices)
|
|
983 sub=''
|
|
984 if w is None:
|
|
985 break
|
|
986 else:
|
|
987 if w=="'":
|
|
988 quotelevel=not quotelevel
|
|
989 sub+=w
|
|
990 self.taxpartitions[name]=taxpartition
|
|
991
|
|
992 def _charpartition(self, options):
|
|
993 charpartition={}
|
|
994 quotelevel=False
|
|
995 opts=CharBuffer(options)
|
|
996 name=self._name_n_vector(opts)
|
|
997 if not name:
|
|
998 raise NexusError, 'Formatting error in charpartition: %s ' % options
|
|
999 # now collect thesubbpartitions and parse them
|
|
1000 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
|
|
1001 sub=''
|
|
1002 while True:
|
|
1003 w=opts.next()
|
|
1004 if w is None or (w==',' and not quotelevel):
|
|
1005 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
|
|
1006 charpartition[subname]=_make_unique(subindices)
|
|
1007 sub=''
|
|
1008 if w is None:
|
|
1009 break
|
|
1010 else:
|
|
1011 if w=="'":
|
|
1012 quotelevel=not quotelevel
|
|
1013 sub+=w
|
|
1014 self.charpartitions[name]=charpartition
|
|
1015
|
|
1016 def _get_indices(self,options,set_type=CHARSET,separator='='):
|
|
1017 """Parse the taxset/charset specification
|
|
1018 '1 2 3 - 5 dog cat 10- 20 \\ 3' --> [0,1,2,3,4,'dog','cat',10,13,16,19]
|
|
1019 """
|
|
1020 opts=CharBuffer(options)
|
|
1021 name=self._name_n_vector(opts,separator=separator)
|
|
1022 indices=self._parse_list(opts,set_type=set_type)
|
|
1023 if indices is None:
|
|
1024 raise NexusError, 'Formatting error in line: %s ' % options
|
|
1025 return name,indices
|
|
1026
|
|
1027 def _name_n_vector(self,opts,separator='='):
|
|
1028 """Extract name and check that it's not in vector format."""
|
|
1029 rest=opts.rest()
|
|
1030 name=opts.next_word()
|
|
1031 if not name:
|
|
1032 raise NexusError, 'Formatting error in line: %s ' % rest
|
|
1033 name=quotestrip(name)
|
|
1034 if opts.peek_nonwhitespace=='(':
|
|
1035 open=opts.next_nonwhitespace()
|
|
1036 qualifier=open.next_word()
|
|
1037 close=opts.next_nonwhitespace()
|
|
1038 if qualifier.lower()=='vector':
|
|
1039 raise NexusError, 'Unsupported VECTOR format in line %s' % (options)
|
|
1040 elif qualifier.lower()!='standard':
|
|
1041 raise NexusError, 'Unknown qualifier %s in line %s' % (qualifier,options)
|
|
1042 if opts.next_nonwhitespace()!=separator:
|
|
1043 raise NexusError, 'Formatting error in line: %s ' % rest
|
|
1044 return name
|
|
1045
|
|
1046 def _parse_list(self,options_buffer,set_type):
|
|
1047 """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17-21],
|
|
1048 (assuming dog is taxon no. 17 and cat is taxon no. 21).
|
|
1049 """
|
|
1050 plain_list=[]
|
|
1051 if options_buffer.peek_nonwhitespace():
|
|
1052 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError
|
|
1053 while True:
|
|
1054 identifier=options_buffer.next_word() # next list element
|
|
1055 if not identifier: # end of list?
|
|
1056 break
|
|
1057 start=self._resolve(identifier,set_type=set_type)
|
|
1058 if options_buffer.peek_nonwhitespace()=='-': # followd by -
|
|
1059 end=start
|
|
1060 step=1
|
|
1061 # get hyphen and end of range
|
|
1062 hyphen=options_buffer.next_nonwhitespace()
|
|
1063 end=self._resolve(options_buffer.next_word(),set_type=set_type)
|
|
1064 if set_type==CHARSET:
|
|
1065 if options_buffer.peek_nonwhitespace()=='\\': # followd by \
|
|
1066 backslash=options_buffer.next_nonwhitespace()
|
|
1067 step=int(options_buffer.next_word()) # get backslash and step
|
|
1068 plain_list.extend(range(start,end+1,step))
|
|
1069 else:
|
|
1070 if type(start)==list or type(end)==list:
|
|
1071 raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier
|
|
1072 start=self.taxlabels.index(start)
|
|
1073 end=self.taxlabels.index(end)
|
|
1074 taxrange=self.taxlabels[start:end+1]
|
|
1075 plain_list.extend(taxrange)
|
|
1076 else:
|
|
1077 if type(start)==list: # start was the name of charset or taxset
|
|
1078 plain_list.extend(start)
|
|
1079 else: # start was an ordinary identifier
|
|
1080 plain_list.append(start)
|
|
1081 except NexusError:
|
|
1082 raise
|
|
1083 except:
|
|
1084 return None
|
|
1085 return plain_list
|
|
1086
|
|
1087 def _resolve(self,identifier,set_type=None):
|
|
1088 """Translate identifier in list into character/taxon index.
|
|
1089 Characters (which are referred to by their index in Nexus.py):
|
|
1090 Plain numbers are returned minus 1 (Nexus indices to python indices)
|
|
1091 Text identifiers are translaterd into their indices (if plain character indentifiers),
|
|
1092 the first hit in charlabels is returned (charlabels don't need to be unique)
|
|
1093 or the range of indices is returned (if names of character sets).
|
|
1094 Taxa (which are referred to by their unique name in Nexus.py):
|
|
1095 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
|
|
1096 Names are returned unchanged (if plain taxon identifiers), or the names in
|
|
1097 the corresponding taxon set is returned
|
|
1098 """
|
|
1099 identifier=quotestrip(identifier)
|
|
1100 if not set_type:
|
|
1101 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
|
|
1102 if set_type==CHARSET:
|
|
1103 try:
|
|
1104 n=int(identifier)
|
|
1105 except ValueError:
|
|
1106 if self.charlabels and identifier in self.charlabels.values():
|
|
1107 for k in self.charlabels:
|
|
1108 if self.charlabels[k]==identifier:
|
|
1109 return k
|
|
1110 elif self.charsets and identifier in self.charsets:
|
|
1111 return self.charsets[identifier]
|
|
1112 else:
|
|
1113 raise NexusError, 'Unknown character identifier: %s' % identifier
|
|
1114 else:
|
|
1115 if n<=self.nchar:
|
|
1116 return n-1
|
|
1117 else:
|
|
1118 raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar)
|
|
1119 elif set_type==TAXSET:
|
|
1120 try:
|
|
1121 n=int(identifier)
|
|
1122 except ValueError:
|
|
1123 taxlabels_id=self._check_taxlabels(identifier)
|
|
1124 if taxlabels_id:
|
|
1125 return taxlabels_id
|
|
1126 elif self.taxsets and identifier in self.taxsets:
|
|
1127 return self.taxsets[identifier]
|
|
1128 else:
|
|
1129 raise NexusError, 'Unknown taxon identifier: %s' % identifier
|
|
1130 else:
|
|
1131 if n>0 and n<=self.ntax:
|
|
1132 return self.taxlabels[n-1]
|
|
1133 else:
|
|
1134 raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax)
|
|
1135 else:
|
|
1136 raise NexusError('Unknown set specification: %s.'% set_type)
|
|
1137
|
|
1138 def _stateset(self, options):
|
|
1139 #Not implemented
|
|
1140 pass
|
|
1141
|
|
1142 def _changeset(self, options):
|
|
1143 #Not implemented
|
|
1144 pass
|
|
1145
|
|
1146 def _treeset(self, options):
|
|
1147 #Not implemented
|
|
1148 pass
|
|
1149
|
|
1150 def _treepartition(self, options):
|
|
1151 #Not implemented
|
|
1152 pass
|
|
1153
|
|
1154 def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
|
|
1155 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
|
|
1156 """Writes a nexus file for each partition in charpartition.
|
|
1157 Only non-excluded characters and non-deleted taxa are included, just the data block is written.
|
|
1158 """
|
|
1159
|
|
1160 if not matrix:
|
|
1161 matrix=self.matrix
|
|
1162 if not matrix:
|
|
1163 return
|
|
1164 if not filename:
|
|
1165 filename=self.filename
|
|
1166 if charpartition:
|
|
1167 pfilenames={}
|
|
1168 for p in charpartition:
|
|
1169 total_exclude=[]+exclude
|
|
1170 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
|
|
1171 total_exclude=_make_unique(total_exclude)
|
|
1172 pcomment=comment+'\nPartition: '+p+'\n'
|
|
1173 dot=filename.rfind('.')
|
|
1174 if dot>0:
|
|
1175 pfilename=filename[:dot]+'_'+p+'.data'
|
|
1176 else:
|
|
1177 pfilename=filename+'_'+p
|
|
1178 pfilenames[p]=pfilename
|
|
1179 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
|
|
1180 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
|
|
1181 mrbayes=mrbayes)
|
|
1182 return pfilenames
|
|
1183 else:
|
|
1184 fn=self.filename+'.data'
|
|
1185 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
|
|
1186 exclude=exclude,delete=delete,comment=comment,append_sets=False,
|
|
1187 mrbayes=mrbayes)
|
|
1188 return fn
|
|
1189
|
|
1190 def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
|
|
1191 blocksize=None, interleave=False, interleave_by_partition=False,\
|
|
1192 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False):
|
|
1193 """ Writes a nexus file with data and sets block. Character sets and partitions
|
|
1194 are appended by default, and are adjusted according
|
|
1195 to excluded characters (i.e. character sets still point to the same sites (not necessarily same positions),
|
|
1196 without including the deleted characters.
|
|
1197 """
|
|
1198 if not matrix:
|
|
1199 matrix=self.matrix
|
|
1200 if not matrix:
|
|
1201 return
|
|
1202 if not filename:
|
|
1203 filename=self.filename
|
|
1204 if [t for t in delete if not self._check_taxlabels(t)]:
|
|
1205 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels)))
|
|
1206 if interleave_by_partition:
|
|
1207 if not interleave_by_partition in self.charpartitions:
|
|
1208 raise NexusError, 'Unknown partition: '+interleave_by_partition
|
|
1209 else:
|
|
1210 partition=self.charpartitions[interleave_by_partition]
|
|
1211 # we need to sort the partition names by starting position before we exclude characters
|
|
1212 names=_sort_keys_by_values(partition)
|
|
1213 newpartition={}
|
|
1214 for p in partition:
|
|
1215 newpartition[p]=[c for c in partition[p] if c not in exclude]
|
|
1216 # how many taxa and how many characters are left?
|
|
1217 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
|
|
1218 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
|
|
1219 ntax_adjusted=len(undelete)
|
|
1220 nchar_adjusted=len(cropped_matrix[undelete[0]])
|
|
1221 if not undelete or (undelete and undelete[0]==''):
|
|
1222 return
|
|
1223 if isinstance(filename,str):
|
|
1224 try:
|
|
1225 fh=open(filename,'w')
|
|
1226 except IOError:
|
|
1227 raise NexusError, 'Could not open %s for writing.' % filename
|
|
1228 elif isinstance(filename,file):
|
|
1229 fh=filename
|
|
1230 if not omit_NEXUS:
|
|
1231 fh.write('#NEXUS\n')
|
|
1232 if comment:
|
|
1233 fh.write('['+comment+']\n')
|
|
1234 fh.write('begin data;\n')
|
|
1235 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
|
|
1236 fh.write('\tformat datatype='+self.datatype)
|
|
1237 if self.respectcase:
|
|
1238 fh.write(' respectcase')
|
|
1239 if self.missing:
|
|
1240 fh.write(' missing='+self.missing)
|
|
1241 if self.gap:
|
|
1242 fh.write(' gap='+self.gap)
|
|
1243 if self.matchchar:
|
|
1244 fh.write(' matchchar='+self.matchchar)
|
|
1245 if self.labels:
|
|
1246 fh.write(' labels='+self.labels)
|
|
1247 if self.equate:
|
|
1248 fh.write(' equate='+self.equate)
|
|
1249 if interleave or interleave_by_partition:
|
|
1250 fh.write(' interleave')
|
|
1251 fh.write(';\n')
|
|
1252 #if self.taxlabels:
|
|
1253 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n')
|
|
1254 if self.charlabels:
|
|
1255 newcharlabels=self._adjust_charlabels(exclude=exclude)
|
|
1256 clkeys=newcharlabels.keys()
|
|
1257 clkeys.sort()
|
|
1258 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
|
|
1259 fh.write('matrix\n')
|
|
1260 if not blocksize:
|
|
1261 if interleave:
|
|
1262 blocksize=70
|
|
1263 else:
|
|
1264 blocksize=self.nchar
|
|
1265 # delete deleted taxa and ecxclude excluded characters...
|
|
1266 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
|
|
1267 if interleave_by_partition:
|
|
1268 # interleave by partitions, but adjust partitions with regard to excluded characters
|
|
1269 seek=0
|
|
1270 for p in names:
|
|
1271 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
|
|
1272 if len(newpartition[p])>0:
|
|
1273 for taxon in undelete:
|
|
1274 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
|
|
1275 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
|
|
1276 fh.write('\n')
|
|
1277 else:
|
|
1278 fh.write('[empty]\n\n')
|
|
1279 seek+=len(newpartition[p])
|
|
1280 elif interleave:
|
|
1281 for seek in range(0,nchar_adjusted,blocksize):
|
|
1282 for taxon in undelete:
|
|
1283 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
|
|
1284 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
|
|
1285 fh.write('\n')
|
|
1286 else:
|
|
1287 for taxon in undelete:
|
|
1288 if blocksize<nchar_adjusted:
|
|
1289 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
|
|
1290 else:
|
|
1291 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
|
|
1292 for seek in range(0,nchar_adjusted,blocksize):
|
|
1293 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
|
|
1294 fh.write(';\nend;\n')
|
|
1295 if append_sets:
|
|
1296 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
|
|
1297 fh.close()
|
|
1298 return filename
|
|
1299
|
|
1300 def append_sets(self,exclude=[],delete=[],mrbayes=False):
|
|
1301 """Appends a sets block to <filename>."""
|
|
1302 if not self.charsets and not self.taxsets and not self.charpartitions:
|
|
1303 return ''
|
|
1304 sets=['\nbegin sets']
|
|
1305 # - now if characters have been excluded, the character sets need to be adjusted,
|
|
1306 # so that they still point to the right character positions
|
|
1307 # calculate a list of offsets: for each deleted character, the following character position
|
|
1308 # in the new file will have an additional offset of -1
|
|
1309 offset=0
|
|
1310 offlist=[]
|
|
1311 for c in range(self.nchar):
|
|
1312 if c in exclude:
|
|
1313 offset+=1
|
|
1314 offlist.append(-1) # dummy value as these character positions are excluded
|
|
1315 else:
|
|
1316 offlist.append(c-offset)
|
|
1317 # now adjust each of the character sets
|
|
1318 for n,ns in self.charsets.items():
|
|
1319 cset=[offlist[c] for c in ns if c not in exclude]
|
|
1320 if cset:
|
|
1321 sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
|
|
1322 for n,s in self.taxsets.items():
|
|
1323 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
|
|
1324 if tset:
|
|
1325 sets.append('taxset %s = %s' % (safename(n),' '.join(tset)))
|
|
1326 for n,p in self.charpartitions.items():
|
|
1327 # as characters have been excluded, the partitions must be adjusted
|
|
1328 # if a partition is empty, it will be omitted from the charpartition command
|
|
1329 # (although paup allows charpartition part=t1:,t2:,t3:1-100)
|
|
1330 names=_sort_keys_by_values(p)
|
|
1331 newpartition={}
|
|
1332 for sn in names:
|
|
1333 nsp=[offlist[c] for c in p[sn] if c not in exclude]
|
|
1334 if nsp:
|
|
1335 newpartition[sn]=nsp
|
|
1336 if newpartition:
|
|
1337 sets.append('charpartition %s = %s' % (safename(n),\
|
|
1338 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
|
|
1339 # now write charpartititions, much easier than charpartitions
|
|
1340 for n,p in self.taxpartitions.items():
|
|
1341 names=_sort_keys_by_values(p)
|
|
1342 newpartition={}
|
|
1343 for sn in names:
|
|
1344 nsp=[t for t in p[sn] if t not in delete]
|
|
1345 if nsp:
|
|
1346 newpartition[sn]=nsp
|
|
1347 if newpartition:
|
|
1348 sets.append('taxpartition %s = %s' % (safename(n),\
|
|
1349 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
|
|
1350 # add 'end' and return everything
|
|
1351 sets.append('end;\n')
|
|
1352 return ';\n'.join(sets)
|
|
1353 f.close()
|
|
1354
|
|
1355 def export_fasta(self, filename=None, width=70):
|
|
1356 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
|
|
1357 if not filename:
|
|
1358 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
|
|
1359 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
|
|
1360 else:
|
|
1361 filename=self.filename+'.fas'
|
|
1362 fh=open(filename,'w')
|
|
1363 for taxon in self.taxlabels:
|
|
1364 fh.write('>'+safename(taxon)+'\n')
|
|
1365 for i in range(0, len(self.matrix[taxon].tostring()), width):
|
|
1366 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
|
|
1367 fh.close()
|
|
1368
|
|
1369 def constant(self,matrix=None,delete=[],exclude=[]):
|
|
1370 """Return a list with all constant characters."""
|
|
1371 if not matrix:
|
|
1372 matrix=self.matrix
|
|
1373 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
|
|
1374 if not undelete:
|
|
1375 return None
|
|
1376 elif len(undelete)==1:
|
|
1377 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
|
|
1378 # get the first sequence and expand all ambiguous values
|
|
1379 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
|
|
1380 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
|
|
1381 for taxon in undelete[1:]:
|
|
1382 newconstant=[]
|
|
1383 for site in constant:
|
|
1384 #print '%d (paup=%d)' % (site[0],site[0]+1),
|
|
1385 seqsite=matrix[taxon][site[0]].upper()
|
|
1386 #print seqsite,'checked against',site[1],'\t',
|
|
1387 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
|
|
1388 # missing or same as before -> ok
|
|
1389 newconstant.append(site)
|
|
1390 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
|
|
1391 # subset of an ambig or only missing in previous -> take subset
|
|
1392 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
|
|
1393 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values
|
|
1394 intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1]))
|
|
1395 if intersect:
|
|
1396 newconstant.append((site[0],''.join(intersect)))
|
|
1397 # print 'ok'
|
|
1398 #else:
|
|
1399 # print 'failed'
|
|
1400 #else:
|
|
1401 # print 'failed'
|
|
1402 constant=newconstant
|
|
1403 cpos=[s[0] for s in constant]
|
|
1404 return constant
|
|
1405 # return [x[0] for x in constant]
|
|
1406
|
|
1407 def cstatus(self,site,delete=[],narrow=True):
|
|
1408 """Summarize character.
|
|
1409 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
|
|
1410 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
|
|
1411 """
|
|
1412 undelete=[t for t in self.taxlabels if t not in delete]
|
|
1413 if not undelete:
|
|
1414 return None
|
|
1415 cstatus=[]
|
|
1416 for t in undelete:
|
|
1417 c=self.matrix[t][site].upper()
|
|
1418 if self.options.get('gapmode')=='missing' and c==self.gap:
|
|
1419 c=self.missing
|
|
1420 if narrow and c==self.missing:
|
|
1421 if c not in cstatus:
|
|
1422 cstatus.append(c)
|
|
1423 else:
|
|
1424 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
|
|
1425 if self.missing in cstatus and narrow and len(cstatus)>1:
|
|
1426 cstatus=[c for c in cstatus if c!=self.missing]
|
|
1427 cstatus.sort()
|
|
1428 return cstatus
|
|
1429
|
|
1430 def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
|
|
1431 """Calculates a stepmatrix for weighted parsimony.
|
|
1432 See Wheeler (1990), Cladistics 6:269-275 and
|
|
1433 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
|
|
1434 """
|
|
1435 m=StepMatrix(self.unambiguous_letters,self.gap)
|
|
1436 for site in [s for s in range(self.nchar) if s not in exclude]:
|
|
1437 cstatus=self.cstatus(site,delete)
|
|
1438 for i,b1 in enumerate(cstatus[:-1]):
|
|
1439 for b2 in cstatus[i+1:]:
|
|
1440 m.add(b1.upper(),b2.upper(),1)
|
|
1441 return m.transformation().weighting().smprint(name=name)
|
|
1442
|
|
1443 def crop_matrix(self,matrix=None, delete=[], exclude=[]):
|
|
1444 """Return a matrix without deleted taxa and excluded characters."""
|
|
1445 if not matrix:
|
|
1446 matrix=self.matrix
|
|
1447 if [t for t in delete if not self._check_taxlabels(t)]:
|
|
1448 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels))
|
|
1449 if exclude!=[]:
|
|
1450 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
|
|
1451 if not undelete:
|
|
1452 return {}
|
|
1453 m=[matrix[k].tostring() for k in undelete]
|
|
1454 zipped_m=zip(*m)
|
|
1455 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
|
|
1456 if sitesm==[]:
|
|
1457 return dict([(t,Seq('',self.alphabet)) for t in undelete])
|
|
1458 else:
|
|
1459 zipped_sitesm=zip(*sitesm)
|
|
1460 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
|
|
1461 return dict(zip(undelete,m))
|
|
1462 else:
|
|
1463 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
|
|
1464
|
|
1465 def bootstrap(self,matrix=None,delete=[],exclude=[]):
|
|
1466 """Return a bootstrapped matrix."""
|
|
1467 if not matrix:
|
|
1468 matrix=self.matrix
|
|
1469 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects
|
|
1470 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out
|
|
1471 if not cm: # everything deleted?
|
|
1472 return {}
|
|
1473 elif len(cm[cm.keys()[0]])==0: # everything excluded?
|
|
1474 return cm
|
|
1475 undelete=[t for t in self.taxlabels if t in cm]
|
|
1476 if seqobjects:
|
|
1477 sitesm=zip(*[cm[t].tostring() for t in undelete])
|
|
1478 alphabet=matrix[matrix.keys()[0]].alphabet
|
|
1479 else:
|
|
1480 sitesm=zip(*[cm[t] for t in undelete])
|
|
1481 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
|
|
1482 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
|
|
1483 if seqobjects:
|
|
1484 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
|
|
1485 return dict(zip(undelete,bootstrapseqs))
|
|
1486
|
|
1487 def add_sequence(self,name,sequence):
|
|
1488 """Adds a sequence to the matrix."""
|
|
1489 if not name:
|
|
1490 raise NexusError, 'New sequence must have a name'
|
|
1491 diff=self.nchar-len(sequence)
|
|
1492 if diff<0:
|
|
1493 self.insert_gap(self.nchar,-diff)
|
|
1494 elif diff>0:
|
|
1495 sequence+=self.missing*diff
|
|
1496
|
|
1497 self.matrix[name]=Seq(sequence,self.alphabet)
|
|
1498 self.ntax+=1
|
|
1499 self.taxlabels.append(name)
|
|
1500 #taxlabels?
|
|
1501
|
|
1502 def insert_gap(self,pos,n=1,leftgreedy=False):
|
|
1503 """Add a gap into the matrix and adjust charsets and partitions.
|
|
1504
|
|
1505 pos=0: first position
|
|
1506 pos=nchar: last position
|
|
1507 """
|
|
1508
|
|
1509 def _adjust(set,x,d,leftgreedy=False):
|
|
1510 """Adjusts chartacter sets if gaps are inserted, taking care of
|
|
1511 new gaps within a coherent character set."""
|
|
1512 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15
|
|
1513 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18
|
|
1514 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18
|
|
1515 set.sort()
|
|
1516 addpos=0
|
|
1517 for i,c in enumerate(set):
|
|
1518 if c>=x:
|
|
1519 set[i]=c+d
|
|
1520 # if we add gaps within a group of characters, we want the gap position included in this group
|
|
1521 if c==x:
|
|
1522 if leftgreedy or (i>0 and set[i-1]==c-1):
|
|
1523 addpos=i
|
|
1524 if addpos>0:
|
|
1525 set[addpos:addpos]=range(x,x+d)
|
|
1526 return set
|
|
1527
|
|
1528 if pos<0 or pos>self.nchar:
|
|
1529 raise NexusError('Illegal gap position: %d' % pos)
|
|
1530 if n==0:
|
|
1531 return
|
|
1532 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
|
|
1533 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
|
|
1534 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\
|
|
1535 # i,taxon in enumerate(self.taxlabels)])
|
|
1536 zipped=zip(*sitesm)
|
|
1537 mapped=map(''.join,zipped)
|
|
1538 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
|
|
1539 self.matrix=dict(listed)
|
|
1540 self.nchar+=n
|
|
1541 # now adjust character sets
|
|
1542 for i,s in self.charsets.items():
|
|
1543 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
|
|
1544 for p in self.charpartitions:
|
|
1545 for sp,s in self.charpartitions[p].items():
|
|
1546 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
|
|
1547 # now adjust character state labels
|
|
1548 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
|
|
1549 return self.charlabels
|
|
1550
|
|
1551 def _adjust_charlabels(self,exclude=None,insert=None):
|
|
1552 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
|
|
1553 if exclude and insert:
|
|
1554 raise NexusError, 'Can\'t exclude and insert at the same time'
|
|
1555 if not self.charlabels:
|
|
1556 return None
|
|
1557 labels=self.charlabels.keys()
|
|
1558 labels.sort()
|
|
1559 newcharlabels={}
|
|
1560 if exclude:
|
|
1561 exclude.sort()
|
|
1562 exclude.append(sys.maxint)
|
|
1563 excount=0
|
|
1564 for c in labels:
|
|
1565 if not c in exclude:
|
|
1566 while c>exclude[excount]:
|
|
1567 excount+=1
|
|
1568 newcharlabels[c-excount]=self.charlabels[c]
|
|
1569 elif insert:
|
|
1570 insert.sort()
|
|
1571 insert.append(sys.maxint)
|
|
1572 icount=0
|
|
1573 for c in labels:
|
|
1574 while c>=insert[icount]:
|
|
1575 icount+=1
|
|
1576 newcharlabels[c+icount]=self.charlabels[c]
|
|
1577 else:
|
|
1578 return self.charlabels
|
|
1579 return newcharlabels
|
|
1580
|
|
1581 def invert(self,charlist):
|
|
1582 """Returns all character indices that are not in charlist."""
|
|
1583 return [c for c in range(self.nchar) if c not in charlist]
|
|
1584
|
|
1585 def gaponly(self,include_missing=False):
|
|
1586 """Return gap-only sites."""
|
|
1587 gap=sets.Set(self.gap)
|
|
1588 if include_missing:
|
|
1589 gap.add(self.missing)
|
|
1590 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
|
|
1591 gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)]
|
|
1592 return gaponly
|
|
1593
|
|
1594 def terminal_gap_to_missing(self,missing=None,skip_n=True):
|
|
1595 """Replaces all terminal gaps with missing character.
|
|
1596
|
|
1597 Mixtures like ???------??------- are properly resolved."""
|
|
1598
|
|
1599 if not missing:
|
|
1600 missing=self.missing
|
|
1601 replace=[self.missing,self.gap]
|
|
1602 if not skip_n:
|
|
1603 replace.extend(['n','N'])
|
|
1604 for taxon in self.taxlabels:
|
|
1605 sequence=self.matrix[taxon].tostring()
|
|
1606 length=len(sequence)
|
|
1607 start,end=get_start_end(sequence,skiplist=replace)
|
|
1608 sequence=sequence[:end+1]+missing*(length-end-1)
|
|
1609 sequence=start*missing+sequence[start:]
|
|
1610 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon
|
|
1611 self.matrix[taxon]=Seq(sequence,self.alphabet)
|
|
1612
|