comparison corebio/seq_io/_nexus/__init__.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1 # Nexus.py - a NEXUS parser
2 #
3 # Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved.
4 # This code is part of the Biopython distribution and governed by its
5 # license. Please see the LICENSE file that should have been included
6 # as part of this package.
7 #
8 # Bug reports welcome: fkauff@duke.edu
9 #
10
11 """Parse the contents of a nexus file.
12 Based upon 'NEXUS: An extensible file format for systematic information'
13 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
14
15 Authors: Frank Kauff and Cymon J. Cox
16 """
17
18 import os,sys, math, random, copy
19 import sets
20
21 # --- Changes from Bio.Nexus ---
22 # Renamed Nexus.py to __init__.py. Helps with api documentation.
23 # One further change in file, tagged with 'GEC'
24 #from Bio.Alphabet import IUPAC
25 #from Bio.Data import IUPACData
26 #from Bio.Seq import Seq
27
28
29 from corebio.seq import Seq, Alphabet, protein_alphabet
30 import corebio.data as data
31 from corebio.utils import Struct
32
33 IUPACData = Struct(
34 ambiguous_dna_letters = data.dna_extended_letters,
35 ambiguous_rna_letters = data.rna_extended_letters,
36 ambiguous_dna_values = data.dna_ambiguity,
37 ambiguous_rna_values = data.rna_ambiguity,
38 protein_letters = data.amino_acid_letters,
39 unambiguous_dna_letters = data.dna_letters,
40 unambiguous_rna_letters = data.rna_letters,
41 )
42
43 IUPAC = Struct(
44 ambiguous_dna = Alphabet(IUPACData.ambiguous_dna_letters+'-?'),
45 ambiguous_rna = Alphabet(IUPACData.ambiguous_rna_letters+'-?'),
46 protein = protein_alphabet #Alphabet(IUPACData.protein_letters+'-?')
47 )
48
49
50
51 # End Changes
52
53
54 from Trees import Tree,NodeData
55
56 C = False
57
58 #try:
59 # import cnexus
60 #except ImportError:
61 # C=False
62 #else:
63 # C=True
64
65 INTERLEAVE=70
66 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
67 'matrix','tree', 'utree','translate']
68 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets']
69 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
70 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
71 WHITESPACE=' \t\n'
72 #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments
73 SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored
74 CHARSET='chars'
75 TAXSET='taxa'
76
77 class NexusError(Exception): pass
78
79 class CharBuffer:
80 """Helps reading NEXUS-words and characters from a buffer."""
81 def __init__(self,string):
82 if string:
83 self.buffer=list(string)
84 else:
85 self.buffer=[]
86
87 def peek(self):
88 if self.buffer:
89 return self.buffer[0]
90 else:
91 return None
92
93 def peek_nonwhitespace(self):
94 b=''.join(self.buffer).strip()
95 if b:
96 return b[0]
97 else:
98 return None
99
100 def next(self):
101 if self.buffer:
102 return self.buffer.pop(0)
103 else:
104 return None
105
106 def next_nonwhitespace(self):
107 while True:
108 p=self.next()
109 if p is None:
110 break
111 if p not in WHITESPACE:
112 return p
113 return None
114
115 def skip_whitespace(self):
116 while self.buffer[0] in WHITESPACE:
117 self.buffer=self.buffer[1:]
118
119 def next_until(self,target):
120 for t in target:
121 try:
122 pos=self.buffer.index(t)
123 except ValueError:
124 pass
125 else:
126 found=''.join(self.buffer[:pos])
127 self.buffer=self.buffer[pos:]
128 return found
129 else:
130 return None
131
132 def peek_word(self,word):
133 return ''.join(self.buffer[:len(word)])==word
134
135 def next_word(self):
136 """Return the next NEXUS word from a string, dealing with single and double quotes,
137 whitespace and punctuation.
138 """
139
140 word=[]
141 quoted=False
142 first=self.next_nonwhitespace() # get first character
143 if not first: # return empty if only whitespace left
144 return None
145 word.append(first)
146 if first=="'": # word starts with a quote
147 quoted=True
148 elif first in PUNCTUATION: # if it's punctuation, return immediately
149 return first
150 while True:
151 c=self.peek()
152 if c=="'": # a quote?
153 word.append(self.next()) # store quote
154 if self.peek()=="'": # double quote
155 skip=self.next() # skip second quote
156 elif quoted: # second single quote ends word
157 break
158 elif quoted:
159 word.append(self.next()) # if quoted, then add anything
160 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop
161 break
162 else:
163 word.append(self.next()) # standard character
164 return ''.join(word)
165
166 def rest(self):
167 """Return the rest of the string without parsing."""
168 return ''.join(self.buffer)
169
170 class StepMatrix:
171 """Calculate a stepmatrix for weighted parsimony.
172 See Wheeler (1990), Cladistics 6:269-275.
173 """
174
175 def __init__(self,symbols,gap):
176 self.data={}
177 self.symbols=[s for s in symbols]
178 self.symbols.sort()
179 if gap:
180 self.symbols.append(gap)
181 for x in self.symbols:
182 for y in [s for s in self.symbols if s!=x]:
183 self.set(x,y,0)
184
185 def set(self,x,y,value):
186 if x>y:
187 x,y=y,x
188 self.data[x+y]=value
189
190 def add(self,x,y,value):
191 if x>y:
192 x,y=y,x
193 self.data[x+y]+=value
194
195 def sum(self):
196 return reduce(lambda x,y:x+y,self.data.values())
197
198 def transformation(self):
199 total=self.sum()
200 if total!=0:
201 for k in self.data:
202 self.data[k]=self.data[k]/float(total)
203 return self
204
205 def weighting(self):
206 for k in self.data:
207 if self.data[k]!=0:
208 self.data[k]=-math.log(self.data[k])
209 return self
210
211 def smprint(self,name='your_name_here'):
212 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
213 matrix+=' %s\n' % ' '.join(self.symbols)
214 for x in self.symbols:
215 matrix+='[%s]'.ljust(8) % x
216 for y in self.symbols:
217 if x==y:
218 matrix+=' . '
219 else:
220 if x>y:
221 x1,y1=y,x
222 else:
223 x1,y1=x,y
224 if self.data[x1+y1]==0:
225 matrix+='inf. '
226 else:
227 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
228 matrix+='\n'
229 matrix+=';\n'
230 return matrix
231
232 def safename(name,mrbayes=False):
233 """Return a taxon identifier according to NEXUS standard.
234 Wrap quotes around names with punctuation or whitespace, and double single quotes.
235 mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes.
236 """
237 if mrbayes:
238 safe=name.replace(' ','_')
239 safe=''.join([c for c in safe if c in MRBAYESSAFE])
240 else:
241 safe=name.replace("'","''")
242 if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)):
243 safe="'"+safe+"'"
244 return safe
245
246 def quotestrip(word):
247 """Remove quotes and/or double quotes around identifiers."""
248 if not word:
249 return None
250 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
251 word=word[1:-1]
252 return word
253
254 def get_start_end(sequence, skiplist=['-','?']):
255 """Return position of first and last character which is not in skiplist (defaults to ['-','?'])."""
256
257 length=len(sequence)
258 if length==0:
259 return None,None
260 end=length-1
261 while end>=0 and (sequence[end] in skiplist):
262 end-=1
263 start=0
264 while start<length and (sequence[start] in skiplist):
265 start+=1
266 return start,end
267
268 def _sort_keys_by_values(p):
269 """Returns a sorted list of keys of p sorted by values of p."""
270 startpos=[(p[pn],pn) for pn in p if p[pn]]
271 startpos.sort()
272 return zip(*startpos)[1]
273
274 def _make_unique(l):
275 """Check that all values in list are unique and return a pruned and sorted list."""
276 l=list(sets.Set(l))
277 l.sort()
278 return l
279
280 def _seqmatrix2strmatrix(matrix):
281 """Converts a Seq-object matrix to a plain sequence-string matrix."""
282 return dict([(t,matrix[t].tostring()) for t in matrix])
283
284 def _compact4nexus(orig_list):
285 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
286 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
287 """
288
289 if not orig_list:
290 return ''
291 orig_list=list(sets.Set(orig_list))
292 orig_list.sort()
293 shortlist=[]
294 clist=orig_list[:]
295 clist.append(clist[-1]+.5) # dummy value makes it easier
296 while len(clist)>1:
297 step=1
298 for i,x in enumerate(clist):
299 if x==clist[0]+i*step: # are we still in the right step?
300 continue
301 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
302 # second element, and possibly at least 3 elements to link,
303 # and the next one is in the right step
304 step=x-clist[0]
305 else: # pattern broke, add all values before current position to new list
306 sub=clist[:i]
307 if len(sub)==1:
308 shortlist.append(str(sub[0]+1))
309 else:
310 if step==1:
311 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
312 else:
313 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
314 clist=clist[i:]
315 break
316 return ' '.join(shortlist)
317
318 def combine(matrices):
319 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
320
321 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
322 Character sets, character partitions and taxon sets are prefixed, readjusted and present in
323 the combined matrix.
324 """
325
326 if not matrices:
327 return None
328 name=matrices[0][0]
329 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix
330 mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1)
331 if mixed_datatypes:
332 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself!
333 # raise NexusError, 'Matrices must be of same datatype'
334 combined.charlabels=None
335 combined.statelabels=None
336 combined.interleave=False
337 combined.translate=None
338
339 # rename taxon sets and character sets and name them with prefix
340 for cn,cs in combined.charsets.items():
341 combined.charsets['%s.%s' % (name,cn)]=cs
342 del combined.charsets[cn]
343 for tn,ts in combined.taxsets.items():
344 combined.taxsets['%s.%s' % (name,tn)]=ts
345 del combined.taxsets[tn]
346 # previous partitions usually don't make much sense in combined matrix
347 # just initiate one new partition parted by single matrices
348 combined.charpartitions={'combined':{name:range(combined.nchar)}}
349 for n,m in matrices[1:]: # add all other matrices
350 both=[t for t in combined.taxlabels if t in m.taxlabels]
351 combined_only=[t for t in combined.taxlabels if t not in both]
352 m_only=[t for t in m.taxlabels if t not in both]
353 for t in both:
354 # concatenate sequences and unify gap and missing character symbols
355 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
356 # replace date of missing taxa with symbol for missing data
357 for t in combined_only:
358 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
359 for t in m_only:
360 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
361 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
362 combined.taxlabels.extend(m_only) # new taxon list
363 for cn,cs in m.charsets.items(): # adjust character sets for new matrix
364 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
365 if m.taxsets:
366 if not combined.taxsets:
367 combined.taxsets={}
368 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets
369 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition
370 # update charlabels
371 if m.charlabels:
372 if not combined.charlabels:
373 combined.charlabels={}
374 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()]))
375 combined.nchar+=m.nchar # update nchar and ntax
376 combined.ntax+=len(m_only)
377
378 return combined
379
380 def _kill_comments_and_break_lines(text):
381 """Delete []-delimited comments out of a file and break into lines separated by ';'.
382
383 stripped_text=_kill_comments_and_break_lines(text):
384 Nested and multiline comments are allowed. [ and ] symbols within single
385 or double quotes are ignored, newline ends a quote, all symbols with quotes are
386 treated the same (thus not quoting inside comments like [this character ']' ends a comment])
387 Special [&...] and [\...] comments remain untouched, if not inside standard comment.
388 Quotes inside special [& and [\ are treated as normal characters,
389 but no nesting inside these special comments allowed (like [& [\ ]]).
390 ';' ist deleted from end of line.
391
392 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus
393 """
394 contents=CharBuffer(text)
395 newtext=[]
396 newline=[]
397 quotelevel=''
398 speciallevel=False
399 commlevel=0
400 while True:
401 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character
402 #if not plain:
403 # newline.append(contents.rest) # not found, just add the rest
404 # break
405 #newline.append(plain) # add intermediate text
406 t=contents.next() # and get special character
407 if t is None:
408 break
409 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation
410 quotelevel=''
411 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation
412 quotelevel=t
413 elif not quotelevel and t=='[': # opening bracket outside a quote
414 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel:
415 speciallevel=True
416 else:
417 commlevel+=1
418 elif not quotelevel and t==']': # closing bracket ioutside a quote
419 if speciallevel:
420 speciallevel=False
421 else:
422 commlevel-=1
423 if commlevel<0:
424 raise NexusError, 'Nexus formatting error: unmatched ]'
425 continue
426 if commlevel==0: # copy if we're not in comment
427 if t==';' and not quotelevel:
428 newtext.append(''.join(newline))
429 newline=[]
430 else:
431 newline.append(t)
432 #level of comments should be 0 at the end of the file
433 if newline:
434 newtext.append('\n'.join(newline))
435 if commlevel>0:
436 raise NexusError, 'Nexus formatting error: unmatched ['
437 return newtext
438
439
440 def _adjust_lines(lines):
441 """Adjust linebreaks to match ';', strip leading/trailing whitespace
442
443 list_of_commandlines=_adjust_lines(input_text)
444 Lines are adjusted so that no linebreaks occur within a commandline
445 (except matrix command line)
446 """
447 formatted_lines=[]
448 for l in lines:
449 #Convert line endings
450 l=l.replace('\r\n','\n').replace('\r','\n').strip()
451 if l.lower().startswith('matrix'):
452 formatted_lines.append(l)
453 else:
454 l=l.replace('\n',' ')
455 if l:
456 formatted_lines.append(l)
457 return formatted_lines
458
459 def _replace_parenthesized_ambigs(seq,rev_ambig_values):
460 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
461
462 opening=seq.find('(')
463 while opening>-1:
464 closing=seq.find(')')
465 if closing<0:
466 raise NexusError, 'Missing closing parenthesis in: '+seq
467 elif closing<opening:
468 raise NexusError, 'Missing opening parenthesis in: '+seq
469 ambig=[x for x in seq[opening+1:closing]]
470 ambig.sort()
471 ambig=''.join(ambig)
472 ambig_code=rev_ambig_values[ambig.upper()]
473 if ambig!=ambig.upper():
474 ambig_code=ambig_code.lower()
475 seq=seq[:opening]+ambig_code+seq[closing+1:]
476 opening=seq.find('(')
477 return seq
478
479
480 class Commandline:
481 """Represent a commandline as command and options."""
482
483 def __init__(self, line, title):
484 self.options={}
485 options=[]
486 self.command=None
487 try:
488 #Assume matrix (all other command lines have been stripped of \n)
489 self.command, options = line.strip().split('\n', 1)
490 except ValueError: #Not matrix
491 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...)
492 self.command=line.split()[0]
493 options=' '.join(line.split()[1:])
494 self.command = self.command.strip().lower()
495 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved
496 self.options=options.strip()
497 else:
498 if len(options) > 0:
499 try:
500 options = options.replace('=', ' = ').split()
501 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
502 indices = []
503 for sl in valued_indices:
504 indices.extend(sl)
505 token_indices = [n for n in range(len(options)) if n not in indices]
506 for opt in valued_indices:
507 #self.options[options[opt[0]].lower()] = options[opt[2]].lower()
508 self.options[options[opt[0]].lower()] = options[opt[2]]
509 for token in token_indices:
510 self.options[options[token].lower()] = None
511 except ValueError:
512 raise NexusError, 'Incorrect formatting in line: %s' % line
513
514 class Block:
515 """Represent a NEXUS block with block name and list of commandlines ."""
516 def __init__(self,title=None):
517 self.title=title
518 self.commandlines=[]
519
520 class Nexus(object):
521
522 __slots__=['original_taxon_order','__dict__']
523
524 def __init__(self, input=None):
525 self.ntax=0 # number of taxa
526 self.nchar=0 # number of characters
527 self.taxlabels=[] # labels for taxa, ordered by their id
528 self.charlabels=None # ... and for characters
529 self.statelabels=None # ... and for states
530 self.datatype='dna' # (standard), dna, rna, nucleotide, protein
531 self.respectcase=False # case sensitivity
532 self.missing='?' # symbol for missing characters
533 self.gap='-' # symbol for gap
534 self.symbols=None # set of symbols
535 self.equate=None # set of symbol synonyms
536 self.matchchar=None # matching char for matrix representation
537 self.labels=None # left, right, no
538 self.transpose=False # whether matrix is transposed
539 self.interleave=False # whether matrix is interleaved
540 self.tokens=False # unsupported
541 self.eliminate=None # unsupported
542 self.matrix=None # ...
543 self.unknown_blocks=[] # blocks we don't care about
544 self.taxsets={}
545 self.charsets={}
546 self.charpartitions={}
547 self.taxpartitions={}
548 self.trees=[] # list of Trees (instances of tree class)
549 self.translate=None # Dict to translate taxon <-> taxon numbers
550 self.structured=[] # structured input representation
551 self.set={} # dict of the set command to set various options
552 self.options={} # dict of the options command in the data block
553
554 # some defaults
555 self.options['gapmode']='missing'
556
557 if input:
558 self.read(input)
559
560 def get_original_taxon_order(self):
561 """Included for backwards compatibility."""
562 return self.taxlabels
563 def set_original_taxon_order(self,value):
564 """Included for backwards compatibility."""
565 self.taxlabels=value
566 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
567
568 def read(self,input):
569 """Read and parse NEXUS imput (filename, file-handle, string."""
570
571 # 1. Assume we have the name of a file in the execution dir
572 # Note we need to add parsing of the path to dir/filename
573 try:
574 file_contents = open(os.path.expanduser(input),'rU').read()
575 self.filename=input
576 except (TypeError,IOError,AttributeError):
577 #2 Assume we have a string from a fh.read()
578 #if isinstance(input, str):
579 # file_contents = input
580 # self.filename='input_string'
581 #3 Assume we have a file object
582 if hasattr(input,'read'): # file objects or StringIO objects
583 file_contents=input.read()
584 # GEC : Change next line so that StringIO objects work
585 #if input.name:
586 if hasattr(input, 'name'):
587 self.filename=input.name
588 else:
589 self.filename='Unknown_nexus_file'
590 else:
591 print input.strip()[:6]
592 raise NexusError, 'Unrecognized input: %s ...' % input[:100]
593 if C:
594 decommented=cnexus.scanfile(file_contents)
595 #check for unmatched parentheses
596 if decommented=='[' or decommented==']':
597 raise NexusError, 'Unmatched %s' % decommented
598 # cnexus can't return lists, so in analogy we separate commandlines with chr(7)
599 # (a character that shoudn't be part of a nexus file under normal circumstances)
600 commandlines=_adjust_lines(decommented.split(chr(7)))
601 else:
602 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
603 # get rid of stupid 'NEXUS token'
604 try:
605 if commandlines[0][:6].upper()=='#NEXUS':
606 commandlines[0]=commandlines[0][6:].strip()
607 except:
608 pass
609 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands
610 nexus_block_gen = self._get_nexus_block(commandlines)
611 while 1:
612 try:
613 title, contents = nexus_block_gen.next()
614 except StopIteration:
615 break
616 if title in KNOWN_NEXUS_BLOCKS:
617 self._parse_nexus_block(title, contents)
618 else:
619 self._unknown_nexus_block(title, contents)
620
621 def _get_nexus_block(self,file_contents):
622 """Generator for looping through Nexus blocks."""
623 inblock=False
624 blocklines=[]
625 while file_contents:
626 cl=file_contents.pop(0)
627 if cl.lower().startswith('begin'):
628 if not inblock:
629 inblock=True
630 title=cl.split()[1].lower()
631 else:
632 raise NexusError('Illegal block nesting in block %s' % title)
633 elif cl.lower().startswith('end'):
634 if inblock:
635 inblock=False
636 yield title,blocklines
637 blocklines=[]
638 else:
639 raise NexusError('Unmatched \'end\'.')
640 elif inblock:
641 blocklines.append(cl)
642
643 def _unknown_nexus_block(self,title, contents):
644 block = Block()
645 block.commandlines.append(contents)
646 block.title = title
647 self.unknown_blocks.append(block)
648
649 def _parse_nexus_block(self,title, contents):
650 """Parse a known Nexus Block """
651 # attached the structered block representation
652 self._apply_block_structure(title, contents)
653 #now check for taxa,characters,data blocks. If this stuff is defined more than once
654 #the later occurences will override the previous ones.
655 block=self.structured[-1]
656 for line in block.commandlines:
657 try:
658 getattr(self,'_'+line.command)(line.options)
659 except AttributeError:
660 raise
661 raise NexusError, 'Unknown command: %s ' % line.command
662
663 def _dimensions(self,options):
664 if options.has_key('ntax'):
665 self.ntax=eval(options['ntax'])
666 if options.has_key('nchar'):
667 self.nchar=eval(options['nchar'])
668
669 def _format(self,options):
670 # print options
671 # we first need to test respectcase, then symbols (which depends on respectcase)
672 # then datatype (which, if standard, depends on symbols and respectcase in order to generate
673 # dicts for ambiguous values and alphabet
674 if options.has_key('respectcase'):
675 self.respectcase=True
676 # adjust symbols to for respectcase
677 if options.has_key('symbols'):
678 self.symbols=options['symbols']
679 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\
680 (self.symbold.startswith("'") and self.symbols.endswith("'")):
681 self.symbols=self.symbols[1:-1].replace(' ','')
682 if not self.respectcase:
683 self.symbols=self.symbols.lower()+self.symbols.upper()
684 self.symbols=list(sets.Set(self.symbols))
685 if options.has_key('datatype'):
686 self.datatype=options['datatype'].lower()
687 if self.datatype=='dna' or self.datatype=='nucleotide':
688 self.alphabet=IUPAC.ambiguous_dna
689 self.ambiguous_values=IUPACData.ambiguous_dna_values
690 self.unambiguous_letters=IUPACData.unambiguous_dna_letters
691 elif self.datatype=='rna':
692 self.alphabet=IUPAC.ambiguous_rna
693 self.ambiguous_values=IUPACData.ambiguous_rna_values
694 self.unambiguous_letters=IUPACData.unambiguous_rna_letters
695 elif self.datatype=='protein':
696 self.alphabet=IUPAC.protein
697 self.ambiguous_values={'B':'DN','Z':'EQ','X':IUPACData.protein_letters} # that's how PAUP handles it
698 self.unambiguous_letters=IUPACData.protein_letters+'*' # stop-codon
699 elif self.datatype=='standard':
700 raise NexusError('Datatype standard is not yet supported.')
701 #self.alphabet=None
702 #self.ambiguous_values={}
703 #if not self.symbols:
704 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states
705 #self.unambiguous_letters=self.symbols
706 else:
707 raise NexusError, 'Unsupported datatype: '+self.datatype
708 self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters
709 if not self.respectcase:
710 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper()
711 #we have to sort the reverse ambig coding dict key characters:
712 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N'
713 rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X'])
714 self.rev_ambiguous_values={}
715 for (k,v) in rev.items():
716 key=[c for c in k]
717 key.sort()
718 self.rev_ambiguous_values[''.join(key)]=v
719 #overwrite symbols for datype rna,dna,nucleotide
720 if self.datatype in ['dna','rna','nucleotide']:
721 self.symbols=self.alphabet.letters
722 if self.missing not in self.ambiguous_values:
723 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap
724 self.ambiguous_values[self.gap]=self.gap
725 elif self.datatype=='standard':
726 if not self.symbols:
727 self.symbols=['1','0']
728 if options.has_key('missing'):
729 self.missing=options['missing'][0]
730 if options.has_key('gap'):
731 self.gap=options['gap'][0]
732 if options.has_key('equate'):
733 self.equate=options['equate']
734 if options.has_key('matchchar'):
735 self.matchchar=options['matchchar'][0]
736 if options.has_key('labels'):
737 self.labels=options['labels']
738 if options.has_key('transpose'):
739 raise NexusError, 'TRANSPOSE is not supported!'
740 self.transpose=True
741 if options.has_key('interleave'):
742 if options['interleave']==None or options['interleave'].lower()=='yes':
743 self.interleave=True
744 if options.has_key('tokens'):
745 self.tokens=True
746 if options.has_key('notokens'):
747 self.tokens=False
748
749
750 def _set(self,options):
751 self.set=options;
752
753 def _options(self,options):
754 self.options=options;
755
756 def _eliminate(self,options):
757 self.eliminate=options
758
759 def _taxlabels(self,options):
760 """Get taxon labels."""
761 self.taxlabels=[]
762 opts=CharBuffer(options)
763 while True:
764 taxon=quotestrip(opts.next_word())
765 if not taxon:
766 break
767 self.taxlabels.append(taxon)
768
769 def _check_taxlabels(self,taxon):
770 """Check for presence of taxon in self.taxlabels."""
771 # According to NEXUS standard, underscores shall be treated as spaces...,
772 # so checking for identity is more difficult
773 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
774 nexid=taxon.replace(' ','_')
775 return nextaxa.get(nexid)
776
777 def _charlabels(self,options):
778 self.charlabels={}
779 opts=CharBuffer(options)
780 while True:
781 try:
782 # get id and state
783 w=opts.next_word()
784 if w is None: # McClade saves and reads charlabel-lists with terminal comma?!
785 break
786 identifier=self._resolve(w,set_type=CHARSET)
787 state=quotestrip(opts.next_word())
788 self.charlabels[identifier]=state
789 # check for comma or end of command
790 c=opts.next_nonwhitespace()
791 if c is None:
792 break
793 elif c!=',':
794 raise NexusError,'Missing \',\' in line %s.' % options
795 except NexusError:
796 raise
797 except:
798 raise NexusError,'Format error in line %s.' % options
799
800 def _charstatelabels(self,options):
801 # warning: charstatelabels supports only charlabels-syntax!
802 self._charlabels(options)
803
804 def _statelabels(self,options):
805 #self.charlabels=options
806 #print 'Command statelabels is not supported and will be ignored.'
807 pass
808
809 def _matrix(self,options):
810 if not self.ntax or not self.nchar:
811 raise NexusError,'Dimensions must be specified before matrix!'
812 taxlabels_present=(self.taxlabels!=[])
813 self.matrix={}
814 taxcount=0
815 block_interleave=0
816 #eliminate empty lines and leading/trailing whitespace
817 lines=[l.strip() for l in options.split('\n') if l.strip()<>'']
818 lineiter=iter(lines)
819 while 1:
820 try:
821 l=lineiter.next()
822 except StopIteration:
823 if taxcount<self.ntax:
824 raise NexusError, 'Not enough taxa in matrix.'
825 elif taxcount>self.ntax:
826 raise NexusError, 'Too many taxa in matrix.'
827 else:
828 break
829 # count the taxa and check for interleaved matrix
830 taxcount+=1
831 ##print taxcount
832 if taxcount>self.ntax:
833 if not self.interleave:
834 raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?'
835 else:
836 taxcount=1
837 block_interleave=1
838 #get taxon name and sequence
839 linechars=CharBuffer(l)
840 id=quotestrip(linechars.next_word())
841 l=linechars.rest().strip()
842 if taxlabels_present and not self._check_taxlabels(id):
843 raise NexusError,'Taxon '+id+' not found in taxlabels.'
844 chars=''
845 if self.interleave:
846 #interleaved matrix
847 #print 'In interleave'
848 if l:
849 chars=''.join(l.split())
850 else:
851 chars=''.join(lineiter.next().split())
852 else:
853 #non-interleaved matrix
854 chars=''.join(l.split())
855 while len(chars)<self.nchar:
856 l=lineiter.next()
857 chars+=''.join(l.split())
858 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
859 #first taxon has the reference sequence if matchhar is used
860 if taxcount==1:
861 refseq=iupac_seq
862 else:
863 if self.matchchar:
864 while 1:
865 p=iupac_seq.tostring().find(self.matchchar)
866 if p==-1:
867 break
868 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
869 #check for invalid characters
870 for i,c in enumerate(iupac_seq.tostring()):
871 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
872 raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\
873 % (id,c,l[i-10:i+10])
874 #add sequence to matrix
875 if block_interleave==0:
876 while self.matrix.has_key(id):
877 if id.split('.')[-1].startswith('copy'):
878 id='.'.join(id.split('.')[:-1])+'.copy'+str(eval('0'+id.split('.')[-1][4:])+1)
879 else:
880 id+='.copy'
881 #raise NexusError, id+' already in matrix!\nError in: '+l
882 self.matrix[id]=iupac_seq
883 # add taxon name only if taxlabels is not alredy present
884 if not taxlabels_present:
885 self.taxlabels.append(id)
886 else:
887 taxon_present=self._check_taxlabels(id)
888 if taxon_present:
889 self.matrix[taxon_present]+=iupac_seq
890 else:
891 raise NexusError, 'Taxon %s not in first block of interleaved matrix.' % id
892 #check all sequences for length according to nchar
893 for taxon in self.matrix:
894 if len(self.matrix[taxon])!=self.nchar:
895 raise NexusError,'Nchar ('+str(self.nchar)+') does not match data for taxon '+taxon
896
897 def _translate(self,options):
898 self.translate={}
899 opts=CharBuffer(options)
900 while True:
901 try:
902 # get id and state
903 identifier=int(opts.next_word())
904 label=quotestrip(opts.next_word())
905 self.translate[identifier]=label
906 # check for comma or end of command
907 c=opts.next_nonwhitespace()
908 if c is None:
909 break
910 elif c!=',':
911 raise NexusError,'Missing \',\' in line %s.' % options
912 except NexusError:
913 raise
914 except:
915 raise NexusError,'Format error in line %s.' % options
916
917 def _utree(self,options):
918 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
919 self._tree(options)
920
921 def _tree(self,options):
922 opts=CharBuffer(options)
923 name=opts.next_word()
924 if opts.next_nonwhitespace()!='=':
925 raise NexusError,'Syntax error in tree description: %s' % options[:50]
926 rooted=False
927 weight=1.0
928 while opts.peek_nonwhitespace()=='[':
929 open=opts.next_nonwhitespace()
930 symbol=opts.next()
931 if symbol!='&':
932 raise NexusError,'Illegal special comment [%s...] in tree description: %s' % (symbol, options[:50])
933 special=opts.next()
934 value=opts.next_until(']')
935 closing=opts.next()
936 if special=='R':
937 rooted=True
938 elif special=='U':
939 rooted=False
940 elif special=='W':
941 weight=float(value)
942 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip())
943 # if there's an active translation table, translate
944 if self.translate:
945 for n in tree.get_terminals():
946 try:
947 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)])
948 except (ValueError,KeyError):
949 raise NexusError,'Unable to substitue %s using \'translate\' data.' % tree.node(n).data.taxon
950 self.trees.append(tree)
951
952 def _apply_block_structure(self,title,lines):
953 block=Block('')
954 block.title = title
955 for line in lines:
956 block.commandlines.append(Commandline(line, title))
957 self.structured.append(block)
958
959 def _taxset(self, options):
960 name,taxa=self._get_indices(options,set_type=TAXSET)
961 self.taxsets[name]=_make_unique(taxa)
962
963 def _charset(self, options):
964 name,sites=self._get_indices(options,set_type=CHARSET)
965 self.charsets[name]=_make_unique(sites)
966
967 def _taxpartition(self, options):
968 taxpartition={}
969 quotelevel=False
970 opts=CharBuffer(options)
971 name=self._name_n_vector(opts)
972 if not name:
973 raise NexusError, 'Formatting error in taxpartition: %s ' % options
974 # now collect thesubbpartitions and parse them
975 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
976 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words
977 sub=''
978 while True:
979 w=opts.next()
980 if w is None or (w==',' and not quotelevel):
981 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
982 taxpartition[subname]=_make_unique(subindices)
983 sub=''
984 if w is None:
985 break
986 else:
987 if w=="'":
988 quotelevel=not quotelevel
989 sub+=w
990 self.taxpartitions[name]=taxpartition
991
992 def _charpartition(self, options):
993 charpartition={}
994 quotelevel=False
995 opts=CharBuffer(options)
996 name=self._name_n_vector(opts)
997 if not name:
998 raise NexusError, 'Formatting error in charpartition: %s ' % options
999 # now collect thesubbpartitions and parse them
1000 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
1001 sub=''
1002 while True:
1003 w=opts.next()
1004 if w is None or (w==',' and not quotelevel):
1005 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
1006 charpartition[subname]=_make_unique(subindices)
1007 sub=''
1008 if w is None:
1009 break
1010 else:
1011 if w=="'":
1012 quotelevel=not quotelevel
1013 sub+=w
1014 self.charpartitions[name]=charpartition
1015
1016 def _get_indices(self,options,set_type=CHARSET,separator='='):
1017 """Parse the taxset/charset specification
1018 '1 2 3 - 5 dog cat 10- 20 \\ 3' --> [0,1,2,3,4,'dog','cat',10,13,16,19]
1019 """
1020 opts=CharBuffer(options)
1021 name=self._name_n_vector(opts,separator=separator)
1022 indices=self._parse_list(opts,set_type=set_type)
1023 if indices is None:
1024 raise NexusError, 'Formatting error in line: %s ' % options
1025 return name,indices
1026
1027 def _name_n_vector(self,opts,separator='='):
1028 """Extract name and check that it's not in vector format."""
1029 rest=opts.rest()
1030 name=opts.next_word()
1031 if not name:
1032 raise NexusError, 'Formatting error in line: %s ' % rest
1033 name=quotestrip(name)
1034 if opts.peek_nonwhitespace=='(':
1035 open=opts.next_nonwhitespace()
1036 qualifier=open.next_word()
1037 close=opts.next_nonwhitespace()
1038 if qualifier.lower()=='vector':
1039 raise NexusError, 'Unsupported VECTOR format in line %s' % (options)
1040 elif qualifier.lower()!='standard':
1041 raise NexusError, 'Unknown qualifier %s in line %s' % (qualifier,options)
1042 if opts.next_nonwhitespace()!=separator:
1043 raise NexusError, 'Formatting error in line: %s ' % rest
1044 return name
1045
1046 def _parse_list(self,options_buffer,set_type):
1047 """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17-21],
1048 (assuming dog is taxon no. 17 and cat is taxon no. 21).
1049 """
1050 plain_list=[]
1051 if options_buffer.peek_nonwhitespace():
1052 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError
1053 while True:
1054 identifier=options_buffer.next_word() # next list element
1055 if not identifier: # end of list?
1056 break
1057 start=self._resolve(identifier,set_type=set_type)
1058 if options_buffer.peek_nonwhitespace()=='-': # followd by -
1059 end=start
1060 step=1
1061 # get hyphen and end of range
1062 hyphen=options_buffer.next_nonwhitespace()
1063 end=self._resolve(options_buffer.next_word(),set_type=set_type)
1064 if set_type==CHARSET:
1065 if options_buffer.peek_nonwhitespace()=='\\': # followd by \
1066 backslash=options_buffer.next_nonwhitespace()
1067 step=int(options_buffer.next_word()) # get backslash and step
1068 plain_list.extend(range(start,end+1,step))
1069 else:
1070 if type(start)==list or type(end)==list:
1071 raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier
1072 start=self.taxlabels.index(start)
1073 end=self.taxlabels.index(end)
1074 taxrange=self.taxlabels[start:end+1]
1075 plain_list.extend(taxrange)
1076 else:
1077 if type(start)==list: # start was the name of charset or taxset
1078 plain_list.extend(start)
1079 else: # start was an ordinary identifier
1080 plain_list.append(start)
1081 except NexusError:
1082 raise
1083 except:
1084 return None
1085 return plain_list
1086
1087 def _resolve(self,identifier,set_type=None):
1088 """Translate identifier in list into character/taxon index.
1089 Characters (which are referred to by their index in Nexus.py):
1090 Plain numbers are returned minus 1 (Nexus indices to python indices)
1091 Text identifiers are translaterd into their indices (if plain character indentifiers),
1092 the first hit in charlabels is returned (charlabels don't need to be unique)
1093 or the range of indices is returned (if names of character sets).
1094 Taxa (which are referred to by their unique name in Nexus.py):
1095 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1096 Names are returned unchanged (if plain taxon identifiers), or the names in
1097 the corresponding taxon set is returned
1098 """
1099 identifier=quotestrip(identifier)
1100 if not set_type:
1101 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
1102 if set_type==CHARSET:
1103 try:
1104 n=int(identifier)
1105 except ValueError:
1106 if self.charlabels and identifier in self.charlabels.values():
1107 for k in self.charlabels:
1108 if self.charlabels[k]==identifier:
1109 return k
1110 elif self.charsets and identifier in self.charsets:
1111 return self.charsets[identifier]
1112 else:
1113 raise NexusError, 'Unknown character identifier: %s' % identifier
1114 else:
1115 if n<=self.nchar:
1116 return n-1
1117 else:
1118 raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar)
1119 elif set_type==TAXSET:
1120 try:
1121 n=int(identifier)
1122 except ValueError:
1123 taxlabels_id=self._check_taxlabels(identifier)
1124 if taxlabels_id:
1125 return taxlabels_id
1126 elif self.taxsets and identifier in self.taxsets:
1127 return self.taxsets[identifier]
1128 else:
1129 raise NexusError, 'Unknown taxon identifier: %s' % identifier
1130 else:
1131 if n>0 and n<=self.ntax:
1132 return self.taxlabels[n-1]
1133 else:
1134 raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax)
1135 else:
1136 raise NexusError('Unknown set specification: %s.'% set_type)
1137
1138 def _stateset(self, options):
1139 #Not implemented
1140 pass
1141
1142 def _changeset(self, options):
1143 #Not implemented
1144 pass
1145
1146 def _treeset(self, options):
1147 #Not implemented
1148 pass
1149
1150 def _treepartition(self, options):
1151 #Not implemented
1152 pass
1153
1154 def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
1155 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1156 """Writes a nexus file for each partition in charpartition.
1157 Only non-excluded characters and non-deleted taxa are included, just the data block is written.
1158 """
1159
1160 if not matrix:
1161 matrix=self.matrix
1162 if not matrix:
1163 return
1164 if not filename:
1165 filename=self.filename
1166 if charpartition:
1167 pfilenames={}
1168 for p in charpartition:
1169 total_exclude=[]+exclude
1170 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
1171 total_exclude=_make_unique(total_exclude)
1172 pcomment=comment+'\nPartition: '+p+'\n'
1173 dot=filename.rfind('.')
1174 if dot>0:
1175 pfilename=filename[:dot]+'_'+p+'.data'
1176 else:
1177 pfilename=filename+'_'+p
1178 pfilenames[p]=pfilename
1179 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
1180 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
1181 mrbayes=mrbayes)
1182 return pfilenames
1183 else:
1184 fn=self.filename+'.data'
1185 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
1186 exclude=exclude,delete=delete,comment=comment,append_sets=False,
1187 mrbayes=mrbayes)
1188 return fn
1189
1190 def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
1191 blocksize=None, interleave=False, interleave_by_partition=False,\
1192 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False):
1193 """ Writes a nexus file with data and sets block. Character sets and partitions
1194 are appended by default, and are adjusted according
1195 to excluded characters (i.e. character sets still point to the same sites (not necessarily same positions),
1196 without including the deleted characters.
1197 """
1198 if not matrix:
1199 matrix=self.matrix
1200 if not matrix:
1201 return
1202 if not filename:
1203 filename=self.filename
1204 if [t for t in delete if not self._check_taxlabels(t)]:
1205 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels)))
1206 if interleave_by_partition:
1207 if not interleave_by_partition in self.charpartitions:
1208 raise NexusError, 'Unknown partition: '+interleave_by_partition
1209 else:
1210 partition=self.charpartitions[interleave_by_partition]
1211 # we need to sort the partition names by starting position before we exclude characters
1212 names=_sort_keys_by_values(partition)
1213 newpartition={}
1214 for p in partition:
1215 newpartition[p]=[c for c in partition[p] if c not in exclude]
1216 # how many taxa and how many characters are left?
1217 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
1218 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
1219 ntax_adjusted=len(undelete)
1220 nchar_adjusted=len(cropped_matrix[undelete[0]])
1221 if not undelete or (undelete and undelete[0]==''):
1222 return
1223 if isinstance(filename,str):
1224 try:
1225 fh=open(filename,'w')
1226 except IOError:
1227 raise NexusError, 'Could not open %s for writing.' % filename
1228 elif isinstance(filename,file):
1229 fh=filename
1230 if not omit_NEXUS:
1231 fh.write('#NEXUS\n')
1232 if comment:
1233 fh.write('['+comment+']\n')
1234 fh.write('begin data;\n')
1235 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
1236 fh.write('\tformat datatype='+self.datatype)
1237 if self.respectcase:
1238 fh.write(' respectcase')
1239 if self.missing:
1240 fh.write(' missing='+self.missing)
1241 if self.gap:
1242 fh.write(' gap='+self.gap)
1243 if self.matchchar:
1244 fh.write(' matchchar='+self.matchchar)
1245 if self.labels:
1246 fh.write(' labels='+self.labels)
1247 if self.equate:
1248 fh.write(' equate='+self.equate)
1249 if interleave or interleave_by_partition:
1250 fh.write(' interleave')
1251 fh.write(';\n')
1252 #if self.taxlabels:
1253 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n')
1254 if self.charlabels:
1255 newcharlabels=self._adjust_charlabels(exclude=exclude)
1256 clkeys=newcharlabels.keys()
1257 clkeys.sort()
1258 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
1259 fh.write('matrix\n')
1260 if not blocksize:
1261 if interleave:
1262 blocksize=70
1263 else:
1264 blocksize=self.nchar
1265 # delete deleted taxa and ecxclude excluded characters...
1266 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
1267 if interleave_by_partition:
1268 # interleave by partitions, but adjust partitions with regard to excluded characters
1269 seek=0
1270 for p in names:
1271 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
1272 if len(newpartition[p])>0:
1273 for taxon in undelete:
1274 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1275 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
1276 fh.write('\n')
1277 else:
1278 fh.write('[empty]\n\n')
1279 seek+=len(newpartition[p])
1280 elif interleave:
1281 for seek in range(0,nchar_adjusted,blocksize):
1282 for taxon in undelete:
1283 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1284 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1285 fh.write('\n')
1286 else:
1287 for taxon in undelete:
1288 if blocksize<nchar_adjusted:
1289 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
1290 else:
1291 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1292 for seek in range(0,nchar_adjusted,blocksize):
1293 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1294 fh.write(';\nend;\n')
1295 if append_sets:
1296 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
1297 fh.close()
1298 return filename
1299
1300 def append_sets(self,exclude=[],delete=[],mrbayes=False):
1301 """Appends a sets block to <filename>."""
1302 if not self.charsets and not self.taxsets and not self.charpartitions:
1303 return ''
1304 sets=['\nbegin sets']
1305 # - now if characters have been excluded, the character sets need to be adjusted,
1306 # so that they still point to the right character positions
1307 # calculate a list of offsets: for each deleted character, the following character position
1308 # in the new file will have an additional offset of -1
1309 offset=0
1310 offlist=[]
1311 for c in range(self.nchar):
1312 if c in exclude:
1313 offset+=1
1314 offlist.append(-1) # dummy value as these character positions are excluded
1315 else:
1316 offlist.append(c-offset)
1317 # now adjust each of the character sets
1318 for n,ns in self.charsets.items():
1319 cset=[offlist[c] for c in ns if c not in exclude]
1320 if cset:
1321 sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
1322 for n,s in self.taxsets.items():
1323 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
1324 if tset:
1325 sets.append('taxset %s = %s' % (safename(n),' '.join(tset)))
1326 for n,p in self.charpartitions.items():
1327 # as characters have been excluded, the partitions must be adjusted
1328 # if a partition is empty, it will be omitted from the charpartition command
1329 # (although paup allows charpartition part=t1:,t2:,t3:1-100)
1330 names=_sort_keys_by_values(p)
1331 newpartition={}
1332 for sn in names:
1333 nsp=[offlist[c] for c in p[sn] if c not in exclude]
1334 if nsp:
1335 newpartition[sn]=nsp
1336 if newpartition:
1337 sets.append('charpartition %s = %s' % (safename(n),\
1338 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
1339 # now write charpartititions, much easier than charpartitions
1340 for n,p in self.taxpartitions.items():
1341 names=_sort_keys_by_values(p)
1342 newpartition={}
1343 for sn in names:
1344 nsp=[t for t in p[sn] if t not in delete]
1345 if nsp:
1346 newpartition[sn]=nsp
1347 if newpartition:
1348 sets.append('taxpartition %s = %s' % (safename(n),\
1349 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
1350 # add 'end' and return everything
1351 sets.append('end;\n')
1352 return ';\n'.join(sets)
1353 f.close()
1354
1355 def export_fasta(self, filename=None, width=70):
1356 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1357 if not filename:
1358 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1359 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
1360 else:
1361 filename=self.filename+'.fas'
1362 fh=open(filename,'w')
1363 for taxon in self.taxlabels:
1364 fh.write('>'+safename(taxon)+'\n')
1365 for i in range(0, len(self.matrix[taxon].tostring()), width):
1366 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
1367 fh.close()
1368
1369 def constant(self,matrix=None,delete=[],exclude=[]):
1370 """Return a list with all constant characters."""
1371 if not matrix:
1372 matrix=self.matrix
1373 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1374 if not undelete:
1375 return None
1376 elif len(undelete)==1:
1377 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1378 # get the first sequence and expand all ambiguous values
1379 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
1380 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
1381 for taxon in undelete[1:]:
1382 newconstant=[]
1383 for site in constant:
1384 #print '%d (paup=%d)' % (site[0],site[0]+1),
1385 seqsite=matrix[taxon][site[0]].upper()
1386 #print seqsite,'checked against',site[1],'\t',
1387 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
1388 # missing or same as before -> ok
1389 newconstant.append(site)
1390 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
1391 # subset of an ambig or only missing in previous -> take subset
1392 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
1393 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values
1394 intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1]))
1395 if intersect:
1396 newconstant.append((site[0],''.join(intersect)))
1397 # print 'ok'
1398 #else:
1399 # print 'failed'
1400 #else:
1401 # print 'failed'
1402 constant=newconstant
1403 cpos=[s[0] for s in constant]
1404 return constant
1405 # return [x[0] for x in constant]
1406
1407 def cstatus(self,site,delete=[],narrow=True):
1408 """Summarize character.
1409 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
1410 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
1411 """
1412 undelete=[t for t in self.taxlabels if t not in delete]
1413 if not undelete:
1414 return None
1415 cstatus=[]
1416 for t in undelete:
1417 c=self.matrix[t][site].upper()
1418 if self.options.get('gapmode')=='missing' and c==self.gap:
1419 c=self.missing
1420 if narrow and c==self.missing:
1421 if c not in cstatus:
1422 cstatus.append(c)
1423 else:
1424 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
1425 if self.missing in cstatus and narrow and len(cstatus)>1:
1426 cstatus=[c for c in cstatus if c!=self.missing]
1427 cstatus.sort()
1428 return cstatus
1429
1430 def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
1431 """Calculates a stepmatrix for weighted parsimony.
1432 See Wheeler (1990), Cladistics 6:269-275 and
1433 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
1434 """
1435 m=StepMatrix(self.unambiguous_letters,self.gap)
1436 for site in [s for s in range(self.nchar) if s not in exclude]:
1437 cstatus=self.cstatus(site,delete)
1438 for i,b1 in enumerate(cstatus[:-1]):
1439 for b2 in cstatus[i+1:]:
1440 m.add(b1.upper(),b2.upper(),1)
1441 return m.transformation().weighting().smprint(name=name)
1442
1443 def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1444 """Return a matrix without deleted taxa and excluded characters."""
1445 if not matrix:
1446 matrix=self.matrix
1447 if [t for t in delete if not self._check_taxlabels(t)]:
1448 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels))
1449 if exclude!=[]:
1450 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1451 if not undelete:
1452 return {}
1453 m=[matrix[k].tostring() for k in undelete]
1454 zipped_m=zip(*m)
1455 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
1456 if sitesm==[]:
1457 return dict([(t,Seq('',self.alphabet)) for t in undelete])
1458 else:
1459 zipped_sitesm=zip(*sitesm)
1460 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
1461 return dict(zip(undelete,m))
1462 else:
1463 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1464
1465 def bootstrap(self,matrix=None,delete=[],exclude=[]):
1466 """Return a bootstrapped matrix."""
1467 if not matrix:
1468 matrix=self.matrix
1469 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects
1470 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out
1471 if not cm: # everything deleted?
1472 return {}
1473 elif len(cm[cm.keys()[0]])==0: # everything excluded?
1474 return cm
1475 undelete=[t for t in self.taxlabels if t in cm]
1476 if seqobjects:
1477 sitesm=zip(*[cm[t].tostring() for t in undelete])
1478 alphabet=matrix[matrix.keys()[0]].alphabet
1479 else:
1480 sitesm=zip(*[cm[t] for t in undelete])
1481 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
1482 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
1483 if seqobjects:
1484 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
1485 return dict(zip(undelete,bootstrapseqs))
1486
1487 def add_sequence(self,name,sequence):
1488 """Adds a sequence to the matrix."""
1489 if not name:
1490 raise NexusError, 'New sequence must have a name'
1491 diff=self.nchar-len(sequence)
1492 if diff<0:
1493 self.insert_gap(self.nchar,-diff)
1494 elif diff>0:
1495 sequence+=self.missing*diff
1496
1497 self.matrix[name]=Seq(sequence,self.alphabet)
1498 self.ntax+=1
1499 self.taxlabels.append(name)
1500 #taxlabels?
1501
1502 def insert_gap(self,pos,n=1,leftgreedy=False):
1503 """Add a gap into the matrix and adjust charsets and partitions.
1504
1505 pos=0: first position
1506 pos=nchar: last position
1507 """
1508
1509 def _adjust(set,x,d,leftgreedy=False):
1510 """Adjusts chartacter sets if gaps are inserted, taking care of
1511 new gaps within a coherent character set."""
1512 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15
1513 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18
1514 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18
1515 set.sort()
1516 addpos=0
1517 for i,c in enumerate(set):
1518 if c>=x:
1519 set[i]=c+d
1520 # if we add gaps within a group of characters, we want the gap position included in this group
1521 if c==x:
1522 if leftgreedy or (i>0 and set[i-1]==c-1):
1523 addpos=i
1524 if addpos>0:
1525 set[addpos:addpos]=range(x,x+d)
1526 return set
1527
1528 if pos<0 or pos>self.nchar:
1529 raise NexusError('Illegal gap position: %d' % pos)
1530 if n==0:
1531 return
1532 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1533 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
1534 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\
1535 # i,taxon in enumerate(self.taxlabels)])
1536 zipped=zip(*sitesm)
1537 mapped=map(''.join,zipped)
1538 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
1539 self.matrix=dict(listed)
1540 self.nchar+=n
1541 # now adjust character sets
1542 for i,s in self.charsets.items():
1543 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1544 for p in self.charpartitions:
1545 for sp,s in self.charpartitions[p].items():
1546 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1547 # now adjust character state labels
1548 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
1549 return self.charlabels
1550
1551 def _adjust_charlabels(self,exclude=None,insert=None):
1552 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
1553 if exclude and insert:
1554 raise NexusError, 'Can\'t exclude and insert at the same time'
1555 if not self.charlabels:
1556 return None
1557 labels=self.charlabels.keys()
1558 labels.sort()
1559 newcharlabels={}
1560 if exclude:
1561 exclude.sort()
1562 exclude.append(sys.maxint)
1563 excount=0
1564 for c in labels:
1565 if not c in exclude:
1566 while c>exclude[excount]:
1567 excount+=1
1568 newcharlabels[c-excount]=self.charlabels[c]
1569 elif insert:
1570 insert.sort()
1571 insert.append(sys.maxint)
1572 icount=0
1573 for c in labels:
1574 while c>=insert[icount]:
1575 icount+=1
1576 newcharlabels[c+icount]=self.charlabels[c]
1577 else:
1578 return self.charlabels
1579 return newcharlabels
1580
1581 def invert(self,charlist):
1582 """Returns all character indices that are not in charlist."""
1583 return [c for c in range(self.nchar) if c not in charlist]
1584
1585 def gaponly(self,include_missing=False):
1586 """Return gap-only sites."""
1587 gap=sets.Set(self.gap)
1588 if include_missing:
1589 gap.add(self.missing)
1590 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1591 gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)]
1592 return gaponly
1593
1594 def terminal_gap_to_missing(self,missing=None,skip_n=True):
1595 """Replaces all terminal gaps with missing character.
1596
1597 Mixtures like ???------??------- are properly resolved."""
1598
1599 if not missing:
1600 missing=self.missing
1601 replace=[self.missing,self.gap]
1602 if not skip_n:
1603 replace.extend(['n','N'])
1604 for taxon in self.taxlabels:
1605 sequence=self.matrix[taxon].tostring()
1606 length=len(sequence)
1607 start,end=get_start_end(sequence,skiplist=replace)
1608 sequence=sequence[:end+1]+missing*(length-end-1)
1609 sequence=start*missing+sequence[start:]
1610 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon
1611 self.matrix[taxon]=Seq(sequence,self.alphabet)
1612