Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/_nexus/__init__.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 # Nexus.py - a NEXUS parser | |
2 # | |
3 # Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved. | |
4 # This code is part of the Biopython distribution and governed by its | |
5 # license. Please see the LICENSE file that should have been included | |
6 # as part of this package. | |
7 # | |
8 # Bug reports welcome: fkauff@duke.edu | |
9 # | |
10 | |
11 """Parse the contents of a nexus file. | |
12 Based upon 'NEXUS: An extensible file format for systematic information' | |
13 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 | |
14 | |
15 Authors: Frank Kauff and Cymon J. Cox | |
16 """ | |
17 | |
18 import os,sys, math, random, copy | |
19 import sets | |
20 | |
21 # --- Changes from Bio.Nexus --- | |
22 # Renamed Nexus.py to __init__.py. Helps with api documentation. | |
23 # One further change in file, tagged with 'GEC' | |
24 #from Bio.Alphabet import IUPAC | |
25 #from Bio.Data import IUPACData | |
26 #from Bio.Seq import Seq | |
27 | |
28 | |
29 from corebio.seq import Seq, Alphabet, protein_alphabet | |
30 import corebio.data as data | |
31 from corebio.utils import Struct | |
32 | |
33 IUPACData = Struct( | |
34 ambiguous_dna_letters = data.dna_extended_letters, | |
35 ambiguous_rna_letters = data.rna_extended_letters, | |
36 ambiguous_dna_values = data.dna_ambiguity, | |
37 ambiguous_rna_values = data.rna_ambiguity, | |
38 protein_letters = data.amino_acid_letters, | |
39 unambiguous_dna_letters = data.dna_letters, | |
40 unambiguous_rna_letters = data.rna_letters, | |
41 ) | |
42 | |
43 IUPAC = Struct( | |
44 ambiguous_dna = Alphabet(IUPACData.ambiguous_dna_letters+'-?'), | |
45 ambiguous_rna = Alphabet(IUPACData.ambiguous_rna_letters+'-?'), | |
46 protein = protein_alphabet #Alphabet(IUPACData.protein_letters+'-?') | |
47 ) | |
48 | |
49 | |
50 | |
51 # End Changes | |
52 | |
53 | |
54 from Trees import Tree,NodeData | |
55 | |
56 C = False | |
57 | |
58 #try: | |
59 # import cnexus | |
60 #except ImportError: | |
61 # C=False | |
62 #else: | |
63 # C=True | |
64 | |
65 INTERLEAVE=70 | |
66 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\ | |
67 'matrix','tree', 'utree','translate'] | |
68 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets'] | |
69 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' | |
70 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' | |
71 WHITESPACE=' \t\n' | |
72 #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments | |
73 SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored | |
74 CHARSET='chars' | |
75 TAXSET='taxa' | |
76 | |
77 class NexusError(Exception): pass | |
78 | |
79 class CharBuffer: | |
80 """Helps reading NEXUS-words and characters from a buffer.""" | |
81 def __init__(self,string): | |
82 if string: | |
83 self.buffer=list(string) | |
84 else: | |
85 self.buffer=[] | |
86 | |
87 def peek(self): | |
88 if self.buffer: | |
89 return self.buffer[0] | |
90 else: | |
91 return None | |
92 | |
93 def peek_nonwhitespace(self): | |
94 b=''.join(self.buffer).strip() | |
95 if b: | |
96 return b[0] | |
97 else: | |
98 return None | |
99 | |
100 def next(self): | |
101 if self.buffer: | |
102 return self.buffer.pop(0) | |
103 else: | |
104 return None | |
105 | |
106 def next_nonwhitespace(self): | |
107 while True: | |
108 p=self.next() | |
109 if p is None: | |
110 break | |
111 if p not in WHITESPACE: | |
112 return p | |
113 return None | |
114 | |
115 def skip_whitespace(self): | |
116 while self.buffer[0] in WHITESPACE: | |
117 self.buffer=self.buffer[1:] | |
118 | |
119 def next_until(self,target): | |
120 for t in target: | |
121 try: | |
122 pos=self.buffer.index(t) | |
123 except ValueError: | |
124 pass | |
125 else: | |
126 found=''.join(self.buffer[:pos]) | |
127 self.buffer=self.buffer[pos:] | |
128 return found | |
129 else: | |
130 return None | |
131 | |
132 def peek_word(self,word): | |
133 return ''.join(self.buffer[:len(word)])==word | |
134 | |
135 def next_word(self): | |
136 """Return the next NEXUS word from a string, dealing with single and double quotes, | |
137 whitespace and punctuation. | |
138 """ | |
139 | |
140 word=[] | |
141 quoted=False | |
142 first=self.next_nonwhitespace() # get first character | |
143 if not first: # return empty if only whitespace left | |
144 return None | |
145 word.append(first) | |
146 if first=="'": # word starts with a quote | |
147 quoted=True | |
148 elif first in PUNCTUATION: # if it's punctuation, return immediately | |
149 return first | |
150 while True: | |
151 c=self.peek() | |
152 if c=="'": # a quote? | |
153 word.append(self.next()) # store quote | |
154 if self.peek()=="'": # double quote | |
155 skip=self.next() # skip second quote | |
156 elif quoted: # second single quote ends word | |
157 break | |
158 elif quoted: | |
159 word.append(self.next()) # if quoted, then add anything | |
160 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop | |
161 break | |
162 else: | |
163 word.append(self.next()) # standard character | |
164 return ''.join(word) | |
165 | |
166 def rest(self): | |
167 """Return the rest of the string without parsing.""" | |
168 return ''.join(self.buffer) | |
169 | |
170 class StepMatrix: | |
171 """Calculate a stepmatrix for weighted parsimony. | |
172 See Wheeler (1990), Cladistics 6:269-275. | |
173 """ | |
174 | |
175 def __init__(self,symbols,gap): | |
176 self.data={} | |
177 self.symbols=[s for s in symbols] | |
178 self.symbols.sort() | |
179 if gap: | |
180 self.symbols.append(gap) | |
181 for x in self.symbols: | |
182 for y in [s for s in self.symbols if s!=x]: | |
183 self.set(x,y,0) | |
184 | |
185 def set(self,x,y,value): | |
186 if x>y: | |
187 x,y=y,x | |
188 self.data[x+y]=value | |
189 | |
190 def add(self,x,y,value): | |
191 if x>y: | |
192 x,y=y,x | |
193 self.data[x+y]+=value | |
194 | |
195 def sum(self): | |
196 return reduce(lambda x,y:x+y,self.data.values()) | |
197 | |
198 def transformation(self): | |
199 total=self.sum() | |
200 if total!=0: | |
201 for k in self.data: | |
202 self.data[k]=self.data[k]/float(total) | |
203 return self | |
204 | |
205 def weighting(self): | |
206 for k in self.data: | |
207 if self.data[k]!=0: | |
208 self.data[k]=-math.log(self.data[k]) | |
209 return self | |
210 | |
211 def smprint(self,name='your_name_here'): | |
212 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) | |
213 matrix+=' %s\n' % ' '.join(self.symbols) | |
214 for x in self.symbols: | |
215 matrix+='[%s]'.ljust(8) % x | |
216 for y in self.symbols: | |
217 if x==y: | |
218 matrix+=' . ' | |
219 else: | |
220 if x>y: | |
221 x1,y1=y,x | |
222 else: | |
223 x1,y1=x,y | |
224 if self.data[x1+y1]==0: | |
225 matrix+='inf. ' | |
226 else: | |
227 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) | |
228 matrix+='\n' | |
229 matrix+=';\n' | |
230 return matrix | |
231 | |
232 def safename(name,mrbayes=False): | |
233 """Return a taxon identifier according to NEXUS standard. | |
234 Wrap quotes around names with punctuation or whitespace, and double single quotes. | |
235 mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes. | |
236 """ | |
237 if mrbayes: | |
238 safe=name.replace(' ','_') | |
239 safe=''.join([c for c in safe if c in MRBAYESSAFE]) | |
240 else: | |
241 safe=name.replace("'","''") | |
242 if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)): | |
243 safe="'"+safe+"'" | |
244 return safe | |
245 | |
246 def quotestrip(word): | |
247 """Remove quotes and/or double quotes around identifiers.""" | |
248 if not word: | |
249 return None | |
250 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): | |
251 word=word[1:-1] | |
252 return word | |
253 | |
254 def get_start_end(sequence, skiplist=['-','?']): | |
255 """Return position of first and last character which is not in skiplist (defaults to ['-','?']).""" | |
256 | |
257 length=len(sequence) | |
258 if length==0: | |
259 return None,None | |
260 end=length-1 | |
261 while end>=0 and (sequence[end] in skiplist): | |
262 end-=1 | |
263 start=0 | |
264 while start<length and (sequence[start] in skiplist): | |
265 start+=1 | |
266 return start,end | |
267 | |
268 def _sort_keys_by_values(p): | |
269 """Returns a sorted list of keys of p sorted by values of p.""" | |
270 startpos=[(p[pn],pn) for pn in p if p[pn]] | |
271 startpos.sort() | |
272 return zip(*startpos)[1] | |
273 | |
274 def _make_unique(l): | |
275 """Check that all values in list are unique and return a pruned and sorted list.""" | |
276 l=list(sets.Set(l)) | |
277 l.sort() | |
278 return l | |
279 | |
280 def _seqmatrix2strmatrix(matrix): | |
281 """Converts a Seq-object matrix to a plain sequence-string matrix.""" | |
282 return dict([(t,matrix[t].tostring()) for t in matrix]) | |
283 | |
284 def _compact4nexus(orig_list): | |
285 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) | |
286 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). | |
287 """ | |
288 | |
289 if not orig_list: | |
290 return '' | |
291 orig_list=list(sets.Set(orig_list)) | |
292 orig_list.sort() | |
293 shortlist=[] | |
294 clist=orig_list[:] | |
295 clist.append(clist[-1]+.5) # dummy value makes it easier | |
296 while len(clist)>1: | |
297 step=1 | |
298 for i,x in enumerate(clist): | |
299 if x==clist[0]+i*step: # are we still in the right step? | |
300 continue | |
301 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: | |
302 # second element, and possibly at least 3 elements to link, | |
303 # and the next one is in the right step | |
304 step=x-clist[0] | |
305 else: # pattern broke, add all values before current position to new list | |
306 sub=clist[:i] | |
307 if len(sub)==1: | |
308 shortlist.append(str(sub[0]+1)) | |
309 else: | |
310 if step==1: | |
311 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) | |
312 else: | |
313 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) | |
314 clist=clist[i:] | |
315 break | |
316 return ' '.join(shortlist) | |
317 | |
318 def combine(matrices): | |
319 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. | |
320 | |
321 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] | |
322 Character sets, character partitions and taxon sets are prefixed, readjusted and present in | |
323 the combined matrix. | |
324 """ | |
325 | |
326 if not matrices: | |
327 return None | |
328 name=matrices[0][0] | |
329 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix | |
330 mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1) | |
331 if mixed_datatypes: | |
332 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! | |
333 # raise NexusError, 'Matrices must be of same datatype' | |
334 combined.charlabels=None | |
335 combined.statelabels=None | |
336 combined.interleave=False | |
337 combined.translate=None | |
338 | |
339 # rename taxon sets and character sets and name them with prefix | |
340 for cn,cs in combined.charsets.items(): | |
341 combined.charsets['%s.%s' % (name,cn)]=cs | |
342 del combined.charsets[cn] | |
343 for tn,ts in combined.taxsets.items(): | |
344 combined.taxsets['%s.%s' % (name,tn)]=ts | |
345 del combined.taxsets[tn] | |
346 # previous partitions usually don't make much sense in combined matrix | |
347 # just initiate one new partition parted by single matrices | |
348 combined.charpartitions={'combined':{name:range(combined.nchar)}} | |
349 for n,m in matrices[1:]: # add all other matrices | |
350 both=[t for t in combined.taxlabels if t in m.taxlabels] | |
351 combined_only=[t for t in combined.taxlabels if t not in both] | |
352 m_only=[t for t in m.taxlabels if t not in both] | |
353 for t in both: | |
354 # concatenate sequences and unify gap and missing character symbols | |
355 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) | |
356 # replace date of missing taxa with symbol for missing data | |
357 for t in combined_only: | |
358 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) | |
359 for t in m_only: | |
360 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ | |
361 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) | |
362 combined.taxlabels.extend(m_only) # new taxon list | |
363 for cn,cs in m.charsets.items(): # adjust character sets for new matrix | |
364 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] | |
365 if m.taxsets: | |
366 if not combined.taxsets: | |
367 combined.taxsets={} | |
368 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets | |
369 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition | |
370 # update charlabels | |
371 if m.charlabels: | |
372 if not combined.charlabels: | |
373 combined.charlabels={} | |
374 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()])) | |
375 combined.nchar+=m.nchar # update nchar and ntax | |
376 combined.ntax+=len(m_only) | |
377 | |
378 return combined | |
379 | |
380 def _kill_comments_and_break_lines(text): | |
381 """Delete []-delimited comments out of a file and break into lines separated by ';'. | |
382 | |
383 stripped_text=_kill_comments_and_break_lines(text): | |
384 Nested and multiline comments are allowed. [ and ] symbols within single | |
385 or double quotes are ignored, newline ends a quote, all symbols with quotes are | |
386 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) | |
387 Special [&...] and [\...] comments remain untouched, if not inside standard comment. | |
388 Quotes inside special [& and [\ are treated as normal characters, | |
389 but no nesting inside these special comments allowed (like [& [\ ]]). | |
390 ';' ist deleted from end of line. | |
391 | |
392 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus | |
393 """ | |
394 contents=CharBuffer(text) | |
395 newtext=[] | |
396 newline=[] | |
397 quotelevel='' | |
398 speciallevel=False | |
399 commlevel=0 | |
400 while True: | |
401 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character | |
402 #if not plain: | |
403 # newline.append(contents.rest) # not found, just add the rest | |
404 # break | |
405 #newline.append(plain) # add intermediate text | |
406 t=contents.next() # and get special character | |
407 if t is None: | |
408 break | |
409 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation | |
410 quotelevel='' | |
411 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation | |
412 quotelevel=t | |
413 elif not quotelevel and t=='[': # opening bracket outside a quote | |
414 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel: | |
415 speciallevel=True | |
416 else: | |
417 commlevel+=1 | |
418 elif not quotelevel and t==']': # closing bracket ioutside a quote | |
419 if speciallevel: | |
420 speciallevel=False | |
421 else: | |
422 commlevel-=1 | |
423 if commlevel<0: | |
424 raise NexusError, 'Nexus formatting error: unmatched ]' | |
425 continue | |
426 if commlevel==0: # copy if we're not in comment | |
427 if t==';' and not quotelevel: | |
428 newtext.append(''.join(newline)) | |
429 newline=[] | |
430 else: | |
431 newline.append(t) | |
432 #level of comments should be 0 at the end of the file | |
433 if newline: | |
434 newtext.append('\n'.join(newline)) | |
435 if commlevel>0: | |
436 raise NexusError, 'Nexus formatting error: unmatched [' | |
437 return newtext | |
438 | |
439 | |
440 def _adjust_lines(lines): | |
441 """Adjust linebreaks to match ';', strip leading/trailing whitespace | |
442 | |
443 list_of_commandlines=_adjust_lines(input_text) | |
444 Lines are adjusted so that no linebreaks occur within a commandline | |
445 (except matrix command line) | |
446 """ | |
447 formatted_lines=[] | |
448 for l in lines: | |
449 #Convert line endings | |
450 l=l.replace('\r\n','\n').replace('\r','\n').strip() | |
451 if l.lower().startswith('matrix'): | |
452 formatted_lines.append(l) | |
453 else: | |
454 l=l.replace('\n',' ') | |
455 if l: | |
456 formatted_lines.append(l) | |
457 return formatted_lines | |
458 | |
459 def _replace_parenthesized_ambigs(seq,rev_ambig_values): | |
460 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" | |
461 | |
462 opening=seq.find('(') | |
463 while opening>-1: | |
464 closing=seq.find(')') | |
465 if closing<0: | |
466 raise NexusError, 'Missing closing parenthesis in: '+seq | |
467 elif closing<opening: | |
468 raise NexusError, 'Missing opening parenthesis in: '+seq | |
469 ambig=[x for x in seq[opening+1:closing]] | |
470 ambig.sort() | |
471 ambig=''.join(ambig) | |
472 ambig_code=rev_ambig_values[ambig.upper()] | |
473 if ambig!=ambig.upper(): | |
474 ambig_code=ambig_code.lower() | |
475 seq=seq[:opening]+ambig_code+seq[closing+1:] | |
476 opening=seq.find('(') | |
477 return seq | |
478 | |
479 | |
480 class Commandline: | |
481 """Represent a commandline as command and options.""" | |
482 | |
483 def __init__(self, line, title): | |
484 self.options={} | |
485 options=[] | |
486 self.command=None | |
487 try: | |
488 #Assume matrix (all other command lines have been stripped of \n) | |
489 self.command, options = line.strip().split('\n', 1) | |
490 except ValueError: #Not matrix | |
491 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) | |
492 self.command=line.split()[0] | |
493 options=' '.join(line.split()[1:]) | |
494 self.command = self.command.strip().lower() | |
495 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved | |
496 self.options=options.strip() | |
497 else: | |
498 if len(options) > 0: | |
499 try: | |
500 options = options.replace('=', ' = ').split() | |
501 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] | |
502 indices = [] | |
503 for sl in valued_indices: | |
504 indices.extend(sl) | |
505 token_indices = [n for n in range(len(options)) if n not in indices] | |
506 for opt in valued_indices: | |
507 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() | |
508 self.options[options[opt[0]].lower()] = options[opt[2]] | |
509 for token in token_indices: | |
510 self.options[options[token].lower()] = None | |
511 except ValueError: | |
512 raise NexusError, 'Incorrect formatting in line: %s' % line | |
513 | |
514 class Block: | |
515 """Represent a NEXUS block with block name and list of commandlines .""" | |
516 def __init__(self,title=None): | |
517 self.title=title | |
518 self.commandlines=[] | |
519 | |
520 class Nexus(object): | |
521 | |
522 __slots__=['original_taxon_order','__dict__'] | |
523 | |
524 def __init__(self, input=None): | |
525 self.ntax=0 # number of taxa | |
526 self.nchar=0 # number of characters | |
527 self.taxlabels=[] # labels for taxa, ordered by their id | |
528 self.charlabels=None # ... and for characters | |
529 self.statelabels=None # ... and for states | |
530 self.datatype='dna' # (standard), dna, rna, nucleotide, protein | |
531 self.respectcase=False # case sensitivity | |
532 self.missing='?' # symbol for missing characters | |
533 self.gap='-' # symbol for gap | |
534 self.symbols=None # set of symbols | |
535 self.equate=None # set of symbol synonyms | |
536 self.matchchar=None # matching char for matrix representation | |
537 self.labels=None # left, right, no | |
538 self.transpose=False # whether matrix is transposed | |
539 self.interleave=False # whether matrix is interleaved | |
540 self.tokens=False # unsupported | |
541 self.eliminate=None # unsupported | |
542 self.matrix=None # ... | |
543 self.unknown_blocks=[] # blocks we don't care about | |
544 self.taxsets={} | |
545 self.charsets={} | |
546 self.charpartitions={} | |
547 self.taxpartitions={} | |
548 self.trees=[] # list of Trees (instances of tree class) | |
549 self.translate=None # Dict to translate taxon <-> taxon numbers | |
550 self.structured=[] # structured input representation | |
551 self.set={} # dict of the set command to set various options | |
552 self.options={} # dict of the options command in the data block | |
553 | |
554 # some defaults | |
555 self.options['gapmode']='missing' | |
556 | |
557 if input: | |
558 self.read(input) | |
559 | |
560 def get_original_taxon_order(self): | |
561 """Included for backwards compatibility.""" | |
562 return self.taxlabels | |
563 def set_original_taxon_order(self,value): | |
564 """Included for backwards compatibility.""" | |
565 self.taxlabels=value | |
566 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) | |
567 | |
568 def read(self,input): | |
569 """Read and parse NEXUS imput (filename, file-handle, string.""" | |
570 | |
571 # 1. Assume we have the name of a file in the execution dir | |
572 # Note we need to add parsing of the path to dir/filename | |
573 try: | |
574 file_contents = open(os.path.expanduser(input),'rU').read() | |
575 self.filename=input | |
576 except (TypeError,IOError,AttributeError): | |
577 #2 Assume we have a string from a fh.read() | |
578 #if isinstance(input, str): | |
579 # file_contents = input | |
580 # self.filename='input_string' | |
581 #3 Assume we have a file object | |
582 if hasattr(input,'read'): # file objects or StringIO objects | |
583 file_contents=input.read() | |
584 # GEC : Change next line so that StringIO objects work | |
585 #if input.name: | |
586 if hasattr(input, 'name'): | |
587 self.filename=input.name | |
588 else: | |
589 self.filename='Unknown_nexus_file' | |
590 else: | |
591 print input.strip()[:6] | |
592 raise NexusError, 'Unrecognized input: %s ...' % input[:100] | |
593 if C: | |
594 decommented=cnexus.scanfile(file_contents) | |
595 #check for unmatched parentheses | |
596 if decommented=='[' or decommented==']': | |
597 raise NexusError, 'Unmatched %s' % decommented | |
598 # cnexus can't return lists, so in analogy we separate commandlines with chr(7) | |
599 # (a character that shoudn't be part of a nexus file under normal circumstances) | |
600 commandlines=_adjust_lines(decommented.split(chr(7))) | |
601 else: | |
602 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents)) | |
603 # get rid of stupid 'NEXUS token' | |
604 try: | |
605 if commandlines[0][:6].upper()=='#NEXUS': | |
606 commandlines[0]=commandlines[0][6:].strip() | |
607 except: | |
608 pass | |
609 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands | |
610 nexus_block_gen = self._get_nexus_block(commandlines) | |
611 while 1: | |
612 try: | |
613 title, contents = nexus_block_gen.next() | |
614 except StopIteration: | |
615 break | |
616 if title in KNOWN_NEXUS_BLOCKS: | |
617 self._parse_nexus_block(title, contents) | |
618 else: | |
619 self._unknown_nexus_block(title, contents) | |
620 | |
621 def _get_nexus_block(self,file_contents): | |
622 """Generator for looping through Nexus blocks.""" | |
623 inblock=False | |
624 blocklines=[] | |
625 while file_contents: | |
626 cl=file_contents.pop(0) | |
627 if cl.lower().startswith('begin'): | |
628 if not inblock: | |
629 inblock=True | |
630 title=cl.split()[1].lower() | |
631 else: | |
632 raise NexusError('Illegal block nesting in block %s' % title) | |
633 elif cl.lower().startswith('end'): | |
634 if inblock: | |
635 inblock=False | |
636 yield title,blocklines | |
637 blocklines=[] | |
638 else: | |
639 raise NexusError('Unmatched \'end\'.') | |
640 elif inblock: | |
641 blocklines.append(cl) | |
642 | |
643 def _unknown_nexus_block(self,title, contents): | |
644 block = Block() | |
645 block.commandlines.append(contents) | |
646 block.title = title | |
647 self.unknown_blocks.append(block) | |
648 | |
649 def _parse_nexus_block(self,title, contents): | |
650 """Parse a known Nexus Block """ | |
651 # attached the structered block representation | |
652 self._apply_block_structure(title, contents) | |
653 #now check for taxa,characters,data blocks. If this stuff is defined more than once | |
654 #the later occurences will override the previous ones. | |
655 block=self.structured[-1] | |
656 for line in block.commandlines: | |
657 try: | |
658 getattr(self,'_'+line.command)(line.options) | |
659 except AttributeError: | |
660 raise | |
661 raise NexusError, 'Unknown command: %s ' % line.command | |
662 | |
663 def _dimensions(self,options): | |
664 if options.has_key('ntax'): | |
665 self.ntax=eval(options['ntax']) | |
666 if options.has_key('nchar'): | |
667 self.nchar=eval(options['nchar']) | |
668 | |
669 def _format(self,options): | |
670 # print options | |
671 # we first need to test respectcase, then symbols (which depends on respectcase) | |
672 # then datatype (which, if standard, depends on symbols and respectcase in order to generate | |
673 # dicts for ambiguous values and alphabet | |
674 if options.has_key('respectcase'): | |
675 self.respectcase=True | |
676 # adjust symbols to for respectcase | |
677 if options.has_key('symbols'): | |
678 self.symbols=options['symbols'] | |
679 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ | |
680 (self.symbold.startswith("'") and self.symbols.endswith("'")): | |
681 self.symbols=self.symbols[1:-1].replace(' ','') | |
682 if not self.respectcase: | |
683 self.symbols=self.symbols.lower()+self.symbols.upper() | |
684 self.symbols=list(sets.Set(self.symbols)) | |
685 if options.has_key('datatype'): | |
686 self.datatype=options['datatype'].lower() | |
687 if self.datatype=='dna' or self.datatype=='nucleotide': | |
688 self.alphabet=IUPAC.ambiguous_dna | |
689 self.ambiguous_values=IUPACData.ambiguous_dna_values | |
690 self.unambiguous_letters=IUPACData.unambiguous_dna_letters | |
691 elif self.datatype=='rna': | |
692 self.alphabet=IUPAC.ambiguous_rna | |
693 self.ambiguous_values=IUPACData.ambiguous_rna_values | |
694 self.unambiguous_letters=IUPACData.unambiguous_rna_letters | |
695 elif self.datatype=='protein': | |
696 self.alphabet=IUPAC.protein | |
697 self.ambiguous_values={'B':'DN','Z':'EQ','X':IUPACData.protein_letters} # that's how PAUP handles it | |
698 self.unambiguous_letters=IUPACData.protein_letters+'*' # stop-codon | |
699 elif self.datatype=='standard': | |
700 raise NexusError('Datatype standard is not yet supported.') | |
701 #self.alphabet=None | |
702 #self.ambiguous_values={} | |
703 #if not self.symbols: | |
704 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states | |
705 #self.unambiguous_letters=self.symbols | |
706 else: | |
707 raise NexusError, 'Unsupported datatype: '+self.datatype | |
708 self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters | |
709 if not self.respectcase: | |
710 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() | |
711 #we have to sort the reverse ambig coding dict key characters: | |
712 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' | |
713 rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X']) | |
714 self.rev_ambiguous_values={} | |
715 for (k,v) in rev.items(): | |
716 key=[c for c in k] | |
717 key.sort() | |
718 self.rev_ambiguous_values[''.join(key)]=v | |
719 #overwrite symbols for datype rna,dna,nucleotide | |
720 if self.datatype in ['dna','rna','nucleotide']: | |
721 self.symbols=self.alphabet.letters | |
722 if self.missing not in self.ambiguous_values: | |
723 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap | |
724 self.ambiguous_values[self.gap]=self.gap | |
725 elif self.datatype=='standard': | |
726 if not self.symbols: | |
727 self.symbols=['1','0'] | |
728 if options.has_key('missing'): | |
729 self.missing=options['missing'][0] | |
730 if options.has_key('gap'): | |
731 self.gap=options['gap'][0] | |
732 if options.has_key('equate'): | |
733 self.equate=options['equate'] | |
734 if options.has_key('matchchar'): | |
735 self.matchchar=options['matchchar'][0] | |
736 if options.has_key('labels'): | |
737 self.labels=options['labels'] | |
738 if options.has_key('transpose'): | |
739 raise NexusError, 'TRANSPOSE is not supported!' | |
740 self.transpose=True | |
741 if options.has_key('interleave'): | |
742 if options['interleave']==None or options['interleave'].lower()=='yes': | |
743 self.interleave=True | |
744 if options.has_key('tokens'): | |
745 self.tokens=True | |
746 if options.has_key('notokens'): | |
747 self.tokens=False | |
748 | |
749 | |
750 def _set(self,options): | |
751 self.set=options; | |
752 | |
753 def _options(self,options): | |
754 self.options=options; | |
755 | |
756 def _eliminate(self,options): | |
757 self.eliminate=options | |
758 | |
759 def _taxlabels(self,options): | |
760 """Get taxon labels.""" | |
761 self.taxlabels=[] | |
762 opts=CharBuffer(options) | |
763 while True: | |
764 taxon=quotestrip(opts.next_word()) | |
765 if not taxon: | |
766 break | |
767 self.taxlabels.append(taxon) | |
768 | |
769 def _check_taxlabels(self,taxon): | |
770 """Check for presence of taxon in self.taxlabels.""" | |
771 # According to NEXUS standard, underscores shall be treated as spaces..., | |
772 # so checking for identity is more difficult | |
773 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) | |
774 nexid=taxon.replace(' ','_') | |
775 return nextaxa.get(nexid) | |
776 | |
777 def _charlabels(self,options): | |
778 self.charlabels={} | |
779 opts=CharBuffer(options) | |
780 while True: | |
781 try: | |
782 # get id and state | |
783 w=opts.next_word() | |
784 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! | |
785 break | |
786 identifier=self._resolve(w,set_type=CHARSET) | |
787 state=quotestrip(opts.next_word()) | |
788 self.charlabels[identifier]=state | |
789 # check for comma or end of command | |
790 c=opts.next_nonwhitespace() | |
791 if c is None: | |
792 break | |
793 elif c!=',': | |
794 raise NexusError,'Missing \',\' in line %s.' % options | |
795 except NexusError: | |
796 raise | |
797 except: | |
798 raise NexusError,'Format error in line %s.' % options | |
799 | |
800 def _charstatelabels(self,options): | |
801 # warning: charstatelabels supports only charlabels-syntax! | |
802 self._charlabels(options) | |
803 | |
804 def _statelabels(self,options): | |
805 #self.charlabels=options | |
806 #print 'Command statelabels is not supported and will be ignored.' | |
807 pass | |
808 | |
809 def _matrix(self,options): | |
810 if not self.ntax or not self.nchar: | |
811 raise NexusError,'Dimensions must be specified before matrix!' | |
812 taxlabels_present=(self.taxlabels!=[]) | |
813 self.matrix={} | |
814 taxcount=0 | |
815 block_interleave=0 | |
816 #eliminate empty lines and leading/trailing whitespace | |
817 lines=[l.strip() for l in options.split('\n') if l.strip()<>''] | |
818 lineiter=iter(lines) | |
819 while 1: | |
820 try: | |
821 l=lineiter.next() | |
822 except StopIteration: | |
823 if taxcount<self.ntax: | |
824 raise NexusError, 'Not enough taxa in matrix.' | |
825 elif taxcount>self.ntax: | |
826 raise NexusError, 'Too many taxa in matrix.' | |
827 else: | |
828 break | |
829 # count the taxa and check for interleaved matrix | |
830 taxcount+=1 | |
831 ##print taxcount | |
832 if taxcount>self.ntax: | |
833 if not self.interleave: | |
834 raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?' | |
835 else: | |
836 taxcount=1 | |
837 block_interleave=1 | |
838 #get taxon name and sequence | |
839 linechars=CharBuffer(l) | |
840 id=quotestrip(linechars.next_word()) | |
841 l=linechars.rest().strip() | |
842 if taxlabels_present and not self._check_taxlabels(id): | |
843 raise NexusError,'Taxon '+id+' not found in taxlabels.' | |
844 chars='' | |
845 if self.interleave: | |
846 #interleaved matrix | |
847 #print 'In interleave' | |
848 if l: | |
849 chars=''.join(l.split()) | |
850 else: | |
851 chars=''.join(lineiter.next().split()) | |
852 else: | |
853 #non-interleaved matrix | |
854 chars=''.join(l.split()) | |
855 while len(chars)<self.nchar: | |
856 l=lineiter.next() | |
857 chars+=''.join(l.split()) | |
858 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) | |
859 #first taxon has the reference sequence if matchhar is used | |
860 if taxcount==1: | |
861 refseq=iupac_seq | |
862 else: | |
863 if self.matchchar: | |
864 while 1: | |
865 p=iupac_seq.tostring().find(self.matchchar) | |
866 if p==-1: | |
867 break | |
868 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet) | |
869 #check for invalid characters | |
870 for i,c in enumerate(iupac_seq.tostring()): | |
871 if c not in self.valid_characters and c!=self.gap and c!=self.missing: | |
872 raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\ | |
873 % (id,c,l[i-10:i+10]) | |
874 #add sequence to matrix | |
875 if block_interleave==0: | |
876 while self.matrix.has_key(id): | |
877 if id.split('.')[-1].startswith('copy'): | |
878 id='.'.join(id.split('.')[:-1])+'.copy'+str(eval('0'+id.split('.')[-1][4:])+1) | |
879 else: | |
880 id+='.copy' | |
881 #raise NexusError, id+' already in matrix!\nError in: '+l | |
882 self.matrix[id]=iupac_seq | |
883 # add taxon name only if taxlabels is not alredy present | |
884 if not taxlabels_present: | |
885 self.taxlabels.append(id) | |
886 else: | |
887 taxon_present=self._check_taxlabels(id) | |
888 if taxon_present: | |
889 self.matrix[taxon_present]+=iupac_seq | |
890 else: | |
891 raise NexusError, 'Taxon %s not in first block of interleaved matrix.' % id | |
892 #check all sequences for length according to nchar | |
893 for taxon in self.matrix: | |
894 if len(self.matrix[taxon])!=self.nchar: | |
895 raise NexusError,'Nchar ('+str(self.nchar)+') does not match data for taxon '+taxon | |
896 | |
897 def _translate(self,options): | |
898 self.translate={} | |
899 opts=CharBuffer(options) | |
900 while True: | |
901 try: | |
902 # get id and state | |
903 identifier=int(opts.next_word()) | |
904 label=quotestrip(opts.next_word()) | |
905 self.translate[identifier]=label | |
906 # check for comma or end of command | |
907 c=opts.next_nonwhitespace() | |
908 if c is None: | |
909 break | |
910 elif c!=',': | |
911 raise NexusError,'Missing \',\' in line %s.' % options | |
912 except NexusError: | |
913 raise | |
914 except: | |
915 raise NexusError,'Format error in line %s.' % options | |
916 | |
917 def _utree(self,options): | |
918 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" | |
919 self._tree(options) | |
920 | |
921 def _tree(self,options): | |
922 opts=CharBuffer(options) | |
923 name=opts.next_word() | |
924 if opts.next_nonwhitespace()!='=': | |
925 raise NexusError,'Syntax error in tree description: %s' % options[:50] | |
926 rooted=False | |
927 weight=1.0 | |
928 while opts.peek_nonwhitespace()=='[': | |
929 open=opts.next_nonwhitespace() | |
930 symbol=opts.next() | |
931 if symbol!='&': | |
932 raise NexusError,'Illegal special comment [%s...] in tree description: %s' % (symbol, options[:50]) | |
933 special=opts.next() | |
934 value=opts.next_until(']') | |
935 closing=opts.next() | |
936 if special=='R': | |
937 rooted=True | |
938 elif special=='U': | |
939 rooted=False | |
940 elif special=='W': | |
941 weight=float(value) | |
942 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) | |
943 # if there's an active translation table, translate | |
944 if self.translate: | |
945 for n in tree.get_terminals(): | |
946 try: | |
947 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) | |
948 except (ValueError,KeyError): | |
949 raise NexusError,'Unable to substitue %s using \'translate\' data.' % tree.node(n).data.taxon | |
950 self.trees.append(tree) | |
951 | |
952 def _apply_block_structure(self,title,lines): | |
953 block=Block('') | |
954 block.title = title | |
955 for line in lines: | |
956 block.commandlines.append(Commandline(line, title)) | |
957 self.structured.append(block) | |
958 | |
959 def _taxset(self, options): | |
960 name,taxa=self._get_indices(options,set_type=TAXSET) | |
961 self.taxsets[name]=_make_unique(taxa) | |
962 | |
963 def _charset(self, options): | |
964 name,sites=self._get_indices(options,set_type=CHARSET) | |
965 self.charsets[name]=_make_unique(sites) | |
966 | |
967 def _taxpartition(self, options): | |
968 taxpartition={} | |
969 quotelevel=False | |
970 opts=CharBuffer(options) | |
971 name=self._name_n_vector(opts) | |
972 if not name: | |
973 raise NexusError, 'Formatting error in taxpartition: %s ' % options | |
974 # now collect thesubbpartitions and parse them | |
975 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... | |
976 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words | |
977 sub='' | |
978 while True: | |
979 w=opts.next() | |
980 if w is None or (w==',' and not quotelevel): | |
981 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') | |
982 taxpartition[subname]=_make_unique(subindices) | |
983 sub='' | |
984 if w is None: | |
985 break | |
986 else: | |
987 if w=="'": | |
988 quotelevel=not quotelevel | |
989 sub+=w | |
990 self.taxpartitions[name]=taxpartition | |
991 | |
992 def _charpartition(self, options): | |
993 charpartition={} | |
994 quotelevel=False | |
995 opts=CharBuffer(options) | |
996 name=self._name_n_vector(opts) | |
997 if not name: | |
998 raise NexusError, 'Formatting error in charpartition: %s ' % options | |
999 # now collect thesubbpartitions and parse them | |
1000 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... | |
1001 sub='' | |
1002 while True: | |
1003 w=opts.next() | |
1004 if w is None or (w==',' and not quotelevel): | |
1005 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') | |
1006 charpartition[subname]=_make_unique(subindices) | |
1007 sub='' | |
1008 if w is None: | |
1009 break | |
1010 else: | |
1011 if w=="'": | |
1012 quotelevel=not quotelevel | |
1013 sub+=w | |
1014 self.charpartitions[name]=charpartition | |
1015 | |
1016 def _get_indices(self,options,set_type=CHARSET,separator='='): | |
1017 """Parse the taxset/charset specification | |
1018 '1 2 3 - 5 dog cat 10- 20 \\ 3' --> [0,1,2,3,4,'dog','cat',10,13,16,19] | |
1019 """ | |
1020 opts=CharBuffer(options) | |
1021 name=self._name_n_vector(opts,separator=separator) | |
1022 indices=self._parse_list(opts,set_type=set_type) | |
1023 if indices is None: | |
1024 raise NexusError, 'Formatting error in line: %s ' % options | |
1025 return name,indices | |
1026 | |
1027 def _name_n_vector(self,opts,separator='='): | |
1028 """Extract name and check that it's not in vector format.""" | |
1029 rest=opts.rest() | |
1030 name=opts.next_word() | |
1031 if not name: | |
1032 raise NexusError, 'Formatting error in line: %s ' % rest | |
1033 name=quotestrip(name) | |
1034 if opts.peek_nonwhitespace=='(': | |
1035 open=opts.next_nonwhitespace() | |
1036 qualifier=open.next_word() | |
1037 close=opts.next_nonwhitespace() | |
1038 if qualifier.lower()=='vector': | |
1039 raise NexusError, 'Unsupported VECTOR format in line %s' % (options) | |
1040 elif qualifier.lower()!='standard': | |
1041 raise NexusError, 'Unknown qualifier %s in line %s' % (qualifier,options) | |
1042 if opts.next_nonwhitespace()!=separator: | |
1043 raise NexusError, 'Formatting error in line: %s ' % rest | |
1044 return name | |
1045 | |
1046 def _parse_list(self,options_buffer,set_type): | |
1047 """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17-21], | |
1048 (assuming dog is taxon no. 17 and cat is taxon no. 21). | |
1049 """ | |
1050 plain_list=[] | |
1051 if options_buffer.peek_nonwhitespace(): | |
1052 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError | |
1053 while True: | |
1054 identifier=options_buffer.next_word() # next list element | |
1055 if not identifier: # end of list? | |
1056 break | |
1057 start=self._resolve(identifier,set_type=set_type) | |
1058 if options_buffer.peek_nonwhitespace()=='-': # followd by - | |
1059 end=start | |
1060 step=1 | |
1061 # get hyphen and end of range | |
1062 hyphen=options_buffer.next_nonwhitespace() | |
1063 end=self._resolve(options_buffer.next_word(),set_type=set_type) | |
1064 if set_type==CHARSET: | |
1065 if options_buffer.peek_nonwhitespace()=='\\': # followd by \ | |
1066 backslash=options_buffer.next_nonwhitespace() | |
1067 step=int(options_buffer.next_word()) # get backslash and step | |
1068 plain_list.extend(range(start,end+1,step)) | |
1069 else: | |
1070 if type(start)==list or type(end)==list: | |
1071 raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier | |
1072 start=self.taxlabels.index(start) | |
1073 end=self.taxlabels.index(end) | |
1074 taxrange=self.taxlabels[start:end+1] | |
1075 plain_list.extend(taxrange) | |
1076 else: | |
1077 if type(start)==list: # start was the name of charset or taxset | |
1078 plain_list.extend(start) | |
1079 else: # start was an ordinary identifier | |
1080 plain_list.append(start) | |
1081 except NexusError: | |
1082 raise | |
1083 except: | |
1084 return None | |
1085 return plain_list | |
1086 | |
1087 def _resolve(self,identifier,set_type=None): | |
1088 """Translate identifier in list into character/taxon index. | |
1089 Characters (which are referred to by their index in Nexus.py): | |
1090 Plain numbers are returned minus 1 (Nexus indices to python indices) | |
1091 Text identifiers are translaterd into their indices (if plain character indentifiers), | |
1092 the first hit in charlabels is returned (charlabels don't need to be unique) | |
1093 or the range of indices is returned (if names of character sets). | |
1094 Taxa (which are referred to by their unique name in Nexus.py): | |
1095 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. | |
1096 Names are returned unchanged (if plain taxon identifiers), or the names in | |
1097 the corresponding taxon set is returned | |
1098 """ | |
1099 identifier=quotestrip(identifier) | |
1100 if not set_type: | |
1101 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') | |
1102 if set_type==CHARSET: | |
1103 try: | |
1104 n=int(identifier) | |
1105 except ValueError: | |
1106 if self.charlabels and identifier in self.charlabels.values(): | |
1107 for k in self.charlabels: | |
1108 if self.charlabels[k]==identifier: | |
1109 return k | |
1110 elif self.charsets and identifier in self.charsets: | |
1111 return self.charsets[identifier] | |
1112 else: | |
1113 raise NexusError, 'Unknown character identifier: %s' % identifier | |
1114 else: | |
1115 if n<=self.nchar: | |
1116 return n-1 | |
1117 else: | |
1118 raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar) | |
1119 elif set_type==TAXSET: | |
1120 try: | |
1121 n=int(identifier) | |
1122 except ValueError: | |
1123 taxlabels_id=self._check_taxlabels(identifier) | |
1124 if taxlabels_id: | |
1125 return taxlabels_id | |
1126 elif self.taxsets and identifier in self.taxsets: | |
1127 return self.taxsets[identifier] | |
1128 else: | |
1129 raise NexusError, 'Unknown taxon identifier: %s' % identifier | |
1130 else: | |
1131 if n>0 and n<=self.ntax: | |
1132 return self.taxlabels[n-1] | |
1133 else: | |
1134 raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax) | |
1135 else: | |
1136 raise NexusError('Unknown set specification: %s.'% set_type) | |
1137 | |
1138 def _stateset(self, options): | |
1139 #Not implemented | |
1140 pass | |
1141 | |
1142 def _changeset(self, options): | |
1143 #Not implemented | |
1144 pass | |
1145 | |
1146 def _treeset(self, options): | |
1147 #Not implemented | |
1148 pass | |
1149 | |
1150 def _treepartition(self, options): | |
1151 #Not implemented | |
1152 pass | |
1153 | |
1154 def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, | |
1155 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False): | |
1156 """Writes a nexus file for each partition in charpartition. | |
1157 Only non-excluded characters and non-deleted taxa are included, just the data block is written. | |
1158 """ | |
1159 | |
1160 if not matrix: | |
1161 matrix=self.matrix | |
1162 if not matrix: | |
1163 return | |
1164 if not filename: | |
1165 filename=self.filename | |
1166 if charpartition: | |
1167 pfilenames={} | |
1168 for p in charpartition: | |
1169 total_exclude=[]+exclude | |
1170 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) | |
1171 total_exclude=_make_unique(total_exclude) | |
1172 pcomment=comment+'\nPartition: '+p+'\n' | |
1173 dot=filename.rfind('.') | |
1174 if dot>0: | |
1175 pfilename=filename[:dot]+'_'+p+'.data' | |
1176 else: | |
1177 pfilename=filename+'_'+p | |
1178 pfilenames[p]=pfilename | |
1179 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, | |
1180 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, | |
1181 mrbayes=mrbayes) | |
1182 return pfilenames | |
1183 else: | |
1184 fn=self.filename+'.data' | |
1185 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, | |
1186 exclude=exclude,delete=delete,comment=comment,append_sets=False, | |
1187 mrbayes=mrbayes) | |
1188 return fn | |
1189 | |
1190 def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\ | |
1191 blocksize=None, interleave=False, interleave_by_partition=False,\ | |
1192 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False): | |
1193 """ Writes a nexus file with data and sets block. Character sets and partitions | |
1194 are appended by default, and are adjusted according | |
1195 to excluded characters (i.e. character sets still point to the same sites (not necessarily same positions), | |
1196 without including the deleted characters. | |
1197 """ | |
1198 if not matrix: | |
1199 matrix=self.matrix | |
1200 if not matrix: | |
1201 return | |
1202 if not filename: | |
1203 filename=self.filename | |
1204 if [t for t in delete if not self._check_taxlabels(t)]: | |
1205 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels))) | |
1206 if interleave_by_partition: | |
1207 if not interleave_by_partition in self.charpartitions: | |
1208 raise NexusError, 'Unknown partition: '+interleave_by_partition | |
1209 else: | |
1210 partition=self.charpartitions[interleave_by_partition] | |
1211 # we need to sort the partition names by starting position before we exclude characters | |
1212 names=_sort_keys_by_values(partition) | |
1213 newpartition={} | |
1214 for p in partition: | |
1215 newpartition[p]=[c for c in partition[p] if c not in exclude] | |
1216 # how many taxa and how many characters are left? | |
1217 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] | |
1218 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) | |
1219 ntax_adjusted=len(undelete) | |
1220 nchar_adjusted=len(cropped_matrix[undelete[0]]) | |
1221 if not undelete or (undelete and undelete[0]==''): | |
1222 return | |
1223 if isinstance(filename,str): | |
1224 try: | |
1225 fh=open(filename,'w') | |
1226 except IOError: | |
1227 raise NexusError, 'Could not open %s for writing.' % filename | |
1228 elif isinstance(filename,file): | |
1229 fh=filename | |
1230 if not omit_NEXUS: | |
1231 fh.write('#NEXUS\n') | |
1232 if comment: | |
1233 fh.write('['+comment+']\n') | |
1234 fh.write('begin data;\n') | |
1235 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) | |
1236 fh.write('\tformat datatype='+self.datatype) | |
1237 if self.respectcase: | |
1238 fh.write(' respectcase') | |
1239 if self.missing: | |
1240 fh.write(' missing='+self.missing) | |
1241 if self.gap: | |
1242 fh.write(' gap='+self.gap) | |
1243 if self.matchchar: | |
1244 fh.write(' matchchar='+self.matchchar) | |
1245 if self.labels: | |
1246 fh.write(' labels='+self.labels) | |
1247 if self.equate: | |
1248 fh.write(' equate='+self.equate) | |
1249 if interleave or interleave_by_partition: | |
1250 fh.write(' interleave') | |
1251 fh.write(';\n') | |
1252 #if self.taxlabels: | |
1253 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') | |
1254 if self.charlabels: | |
1255 newcharlabels=self._adjust_charlabels(exclude=exclude) | |
1256 clkeys=newcharlabels.keys() | |
1257 clkeys.sort() | |
1258 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') | |
1259 fh.write('matrix\n') | |
1260 if not blocksize: | |
1261 if interleave: | |
1262 blocksize=70 | |
1263 else: | |
1264 blocksize=self.nchar | |
1265 # delete deleted taxa and ecxclude excluded characters... | |
1266 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) | |
1267 if interleave_by_partition: | |
1268 # interleave by partitions, but adjust partitions with regard to excluded characters | |
1269 seek=0 | |
1270 for p in names: | |
1271 fh.write('[%s: %s]\n' % (interleave_by_partition,p)) | |
1272 if len(newpartition[p])>0: | |
1273 for taxon in undelete: | |
1274 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) | |
1275 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') | |
1276 fh.write('\n') | |
1277 else: | |
1278 fh.write('[empty]\n\n') | |
1279 seek+=len(newpartition[p]) | |
1280 elif interleave: | |
1281 for seek in range(0,nchar_adjusted,blocksize): | |
1282 for taxon in undelete: | |
1283 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) | |
1284 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') | |
1285 fh.write('\n') | |
1286 else: | |
1287 for taxon in undelete: | |
1288 if blocksize<nchar_adjusted: | |
1289 fh.write(safename(taxon,mrbayes=mrbayes)+'\n') | |
1290 else: | |
1291 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) | |
1292 for seek in range(0,nchar_adjusted,blocksize): | |
1293 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') | |
1294 fh.write(';\nend;\n') | |
1295 if append_sets: | |
1296 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) | |
1297 fh.close() | |
1298 return filename | |
1299 | |
1300 def append_sets(self,exclude=[],delete=[],mrbayes=False): | |
1301 """Appends a sets block to <filename>.""" | |
1302 if not self.charsets and not self.taxsets and not self.charpartitions: | |
1303 return '' | |
1304 sets=['\nbegin sets'] | |
1305 # - now if characters have been excluded, the character sets need to be adjusted, | |
1306 # so that they still point to the right character positions | |
1307 # calculate a list of offsets: for each deleted character, the following character position | |
1308 # in the new file will have an additional offset of -1 | |
1309 offset=0 | |
1310 offlist=[] | |
1311 for c in range(self.nchar): | |
1312 if c in exclude: | |
1313 offset+=1 | |
1314 offlist.append(-1) # dummy value as these character positions are excluded | |
1315 else: | |
1316 offlist.append(c-offset) | |
1317 # now adjust each of the character sets | |
1318 for n,ns in self.charsets.items(): | |
1319 cset=[offlist[c] for c in ns if c not in exclude] | |
1320 if cset: | |
1321 sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) | |
1322 for n,s in self.taxsets.items(): | |
1323 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] | |
1324 if tset: | |
1325 sets.append('taxset %s = %s' % (safename(n),' '.join(tset))) | |
1326 for n,p in self.charpartitions.items(): | |
1327 # as characters have been excluded, the partitions must be adjusted | |
1328 # if a partition is empty, it will be omitted from the charpartition command | |
1329 # (although paup allows charpartition part=t1:,t2:,t3:1-100) | |
1330 names=_sort_keys_by_values(p) | |
1331 newpartition={} | |
1332 for sn in names: | |
1333 nsp=[offlist[c] for c in p[sn] if c not in exclude] | |
1334 if nsp: | |
1335 newpartition[sn]=nsp | |
1336 if newpartition: | |
1337 sets.append('charpartition %s = %s' % (safename(n),\ | |
1338 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) | |
1339 # now write charpartititions, much easier than charpartitions | |
1340 for n,p in self.taxpartitions.items(): | |
1341 names=_sort_keys_by_values(p) | |
1342 newpartition={} | |
1343 for sn in names: | |
1344 nsp=[t for t in p[sn] if t not in delete] | |
1345 if nsp: | |
1346 newpartition[sn]=nsp | |
1347 if newpartition: | |
1348 sets.append('taxpartition %s = %s' % (safename(n),\ | |
1349 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) | |
1350 # add 'end' and return everything | |
1351 sets.append('end;\n') | |
1352 return ';\n'.join(sets) | |
1353 f.close() | |
1354 | |
1355 def export_fasta(self, filename=None, width=70): | |
1356 """Writes matrix into a fasta file: (self, filename=None, width=70).""" | |
1357 if not filename: | |
1358 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: | |
1359 filename='.'.join(self.filename.split('.')[:-1])+'.fas' | |
1360 else: | |
1361 filename=self.filename+'.fas' | |
1362 fh=open(filename,'w') | |
1363 for taxon in self.taxlabels: | |
1364 fh.write('>'+safename(taxon)+'\n') | |
1365 for i in range(0, len(self.matrix[taxon].tostring()), width): | |
1366 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n') | |
1367 fh.close() | |
1368 | |
1369 def constant(self,matrix=None,delete=[],exclude=[]): | |
1370 """Return a list with all constant characters.""" | |
1371 if not matrix: | |
1372 matrix=self.matrix | |
1373 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] | |
1374 if not undelete: | |
1375 return None | |
1376 elif len(undelete)==1: | |
1377 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] | |
1378 # get the first sequence and expand all ambiguous values | |
1379 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for | |
1380 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude] | |
1381 for taxon in undelete[1:]: | |
1382 newconstant=[] | |
1383 for site in constant: | |
1384 #print '%d (paup=%d)' % (site[0],site[0]+1), | |
1385 seqsite=matrix[taxon][site[0]].upper() | |
1386 #print seqsite,'checked against',site[1],'\t', | |
1387 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: | |
1388 # missing or same as before -> ok | |
1389 newconstant.append(site) | |
1390 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): | |
1391 # subset of an ambig or only missing in previous -> take subset | |
1392 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) | |
1393 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values | |
1394 intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1])) | |
1395 if intersect: | |
1396 newconstant.append((site[0],''.join(intersect))) | |
1397 # print 'ok' | |
1398 #else: | |
1399 # print 'failed' | |
1400 #else: | |
1401 # print 'failed' | |
1402 constant=newconstant | |
1403 cpos=[s[0] for s in constant] | |
1404 return constant | |
1405 # return [x[0] for x in constant] | |
1406 | |
1407 def cstatus(self,site,delete=[],narrow=True): | |
1408 """Summarize character. | |
1409 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) | |
1410 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) | |
1411 """ | |
1412 undelete=[t for t in self.taxlabels if t not in delete] | |
1413 if not undelete: | |
1414 return None | |
1415 cstatus=[] | |
1416 for t in undelete: | |
1417 c=self.matrix[t][site].upper() | |
1418 if self.options.get('gapmode')=='missing' and c==self.gap: | |
1419 c=self.missing | |
1420 if narrow and c==self.missing: | |
1421 if c not in cstatus: | |
1422 cstatus.append(c) | |
1423 else: | |
1424 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) | |
1425 if self.missing in cstatus and narrow and len(cstatus)>1: | |
1426 cstatus=[c for c in cstatus if c!=self.missing] | |
1427 cstatus.sort() | |
1428 return cstatus | |
1429 | |
1430 def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]): | |
1431 """Calculates a stepmatrix for weighted parsimony. | |
1432 See Wheeler (1990), Cladistics 6:269-275 and | |
1433 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 | |
1434 """ | |
1435 m=StepMatrix(self.unambiguous_letters,self.gap) | |
1436 for site in [s for s in range(self.nchar) if s not in exclude]: | |
1437 cstatus=self.cstatus(site,delete) | |
1438 for i,b1 in enumerate(cstatus[:-1]): | |
1439 for b2 in cstatus[i+1:]: | |
1440 m.add(b1.upper(),b2.upper(),1) | |
1441 return m.transformation().weighting().smprint(name=name) | |
1442 | |
1443 def crop_matrix(self,matrix=None, delete=[], exclude=[]): | |
1444 """Return a matrix without deleted taxa and excluded characters.""" | |
1445 if not matrix: | |
1446 matrix=self.matrix | |
1447 if [t for t in delete if not self._check_taxlabels(t)]: | |
1448 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels)) | |
1449 if exclude!=[]: | |
1450 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] | |
1451 if not undelete: | |
1452 return {} | |
1453 m=[matrix[k].tostring() for k in undelete] | |
1454 zipped_m=zip(*m) | |
1455 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude] | |
1456 if sitesm==[]: | |
1457 return dict([(t,Seq('',self.alphabet)) for t in undelete]) | |
1458 else: | |
1459 zipped_sitesm=zip(*sitesm) | |
1460 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)] | |
1461 return dict(zip(undelete,m)) | |
1462 else: | |
1463 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete]) | |
1464 | |
1465 def bootstrap(self,matrix=None,delete=[],exclude=[]): | |
1466 """Return a bootstrapped matrix.""" | |
1467 if not matrix: | |
1468 matrix=self.matrix | |
1469 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects | |
1470 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out | |
1471 if not cm: # everything deleted? | |
1472 return {} | |
1473 elif len(cm[cm.keys()[0]])==0: # everything excluded? | |
1474 return cm | |
1475 undelete=[t for t in self.taxlabels if t in cm] | |
1476 if seqobjects: | |
1477 sitesm=zip(*[cm[t].tostring() for t in undelete]) | |
1478 alphabet=matrix[matrix.keys()[0]].alphabet | |
1479 else: | |
1480 sitesm=zip(*[cm[t] for t in undelete]) | |
1481 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))] | |
1482 bootstrapseqs=map(''.join,zip(*bootstrapsitesm)) | |
1483 if seqobjects: | |
1484 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs] | |
1485 return dict(zip(undelete,bootstrapseqs)) | |
1486 | |
1487 def add_sequence(self,name,sequence): | |
1488 """Adds a sequence to the matrix.""" | |
1489 if not name: | |
1490 raise NexusError, 'New sequence must have a name' | |
1491 diff=self.nchar-len(sequence) | |
1492 if diff<0: | |
1493 self.insert_gap(self.nchar,-diff) | |
1494 elif diff>0: | |
1495 sequence+=self.missing*diff | |
1496 | |
1497 self.matrix[name]=Seq(sequence,self.alphabet) | |
1498 self.ntax+=1 | |
1499 self.taxlabels.append(name) | |
1500 #taxlabels? | |
1501 | |
1502 def insert_gap(self,pos,n=1,leftgreedy=False): | |
1503 """Add a gap into the matrix and adjust charsets and partitions. | |
1504 | |
1505 pos=0: first position | |
1506 pos=nchar: last position | |
1507 """ | |
1508 | |
1509 def _adjust(set,x,d,leftgreedy=False): | |
1510 """Adjusts chartacter sets if gaps are inserted, taking care of | |
1511 new gaps within a coherent character set.""" | |
1512 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 | |
1513 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 | |
1514 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 | |
1515 set.sort() | |
1516 addpos=0 | |
1517 for i,c in enumerate(set): | |
1518 if c>=x: | |
1519 set[i]=c+d | |
1520 # if we add gaps within a group of characters, we want the gap position included in this group | |
1521 if c==x: | |
1522 if leftgreedy or (i>0 and set[i-1]==c-1): | |
1523 addpos=i | |
1524 if addpos>0: | |
1525 set[addpos:addpos]=range(x,x+d) | |
1526 return set | |
1527 | |
1528 if pos<0 or pos>self.nchar: | |
1529 raise NexusError('Illegal gap position: %d' % pos) | |
1530 if n==0: | |
1531 return | |
1532 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) | |
1533 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n | |
1534 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\ | |
1535 # i,taxon in enumerate(self.taxlabels)]) | |
1536 zipped=zip(*sitesm) | |
1537 mapped=map(''.join,zipped) | |
1538 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)] | |
1539 self.matrix=dict(listed) | |
1540 self.nchar+=n | |
1541 # now adjust character sets | |
1542 for i,s in self.charsets.items(): | |
1543 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy) | |
1544 for p in self.charpartitions: | |
1545 for sp,s in self.charpartitions[p].items(): | |
1546 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy) | |
1547 # now adjust character state labels | |
1548 self.charlabels=self._adjust_charlabels(insert=[pos]*n) | |
1549 return self.charlabels | |
1550 | |
1551 def _adjust_charlabels(self,exclude=None,insert=None): | |
1552 """Return adjusted indices of self.charlabels if characters are excluded or inserted.""" | |
1553 if exclude and insert: | |
1554 raise NexusError, 'Can\'t exclude and insert at the same time' | |
1555 if not self.charlabels: | |
1556 return None | |
1557 labels=self.charlabels.keys() | |
1558 labels.sort() | |
1559 newcharlabels={} | |
1560 if exclude: | |
1561 exclude.sort() | |
1562 exclude.append(sys.maxint) | |
1563 excount=0 | |
1564 for c in labels: | |
1565 if not c in exclude: | |
1566 while c>exclude[excount]: | |
1567 excount+=1 | |
1568 newcharlabels[c-excount]=self.charlabels[c] | |
1569 elif insert: | |
1570 insert.sort() | |
1571 insert.append(sys.maxint) | |
1572 icount=0 | |
1573 for c in labels: | |
1574 while c>=insert[icount]: | |
1575 icount+=1 | |
1576 newcharlabels[c+icount]=self.charlabels[c] | |
1577 else: | |
1578 return self.charlabels | |
1579 return newcharlabels | |
1580 | |
1581 def invert(self,charlist): | |
1582 """Returns all character indices that are not in charlist.""" | |
1583 return [c for c in range(self.nchar) if c not in charlist] | |
1584 | |
1585 def gaponly(self,include_missing=False): | |
1586 """Return gap-only sites.""" | |
1587 gap=sets.Set(self.gap) | |
1588 if include_missing: | |
1589 gap.add(self.missing) | |
1590 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) | |
1591 gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)] | |
1592 return gaponly | |
1593 | |
1594 def terminal_gap_to_missing(self,missing=None,skip_n=True): | |
1595 """Replaces all terminal gaps with missing character. | |
1596 | |
1597 Mixtures like ???------??------- are properly resolved.""" | |
1598 | |
1599 if not missing: | |
1600 missing=self.missing | |
1601 replace=[self.missing,self.gap] | |
1602 if not skip_n: | |
1603 replace.extend(['n','N']) | |
1604 for taxon in self.taxlabels: | |
1605 sequence=self.matrix[taxon].tostring() | |
1606 length=len(sequence) | |
1607 start,end=get_start_end(sequence,skiplist=replace) | |
1608 sequence=sequence[:end+1]+missing*(length-end-1) | |
1609 sequence=start*missing+sequence[start:] | |
1610 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon | |
1611 self.matrix[taxon]=Seq(sequence,self.alphabet) | |
1612 |