Mercurial > repos > davidmurphy > codonlogo

diff corebio/seq_io/_nexus/init.py @ 0:c55bdc2fb9fa
Uploaded
author: davidmurphy
date: Thu, 27 Oct 2011 12:09:09 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/corebio/seq_io/_nexus/__init__.py	Thu Oct 27 12:09:09 2011 -0400
@@ -0,0 +1,1612 @@
+# Nexus.py - a NEXUS parser
+#
+# Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+# 
+# Bug reports welcome: fkauff@duke.edu
+#
+
+"""Parse the contents of a nexus file.
+Based upon 'NEXUS: An extensible file format for systematic information'
+Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
+
+Authors: Frank Kauff and Cymon J. Cox
+"""
+
+import os,sys, math, random, copy
+import sets
+
+# --- Changes from Bio.Nexus ---
+# Renamed Nexus.py to __init__.py. Helps with api documentation.
+# One further change in file, tagged with 'GEC'
+#from Bio.Alphabet import IUPAC
+#from Bio.Data import IUPACData
+#from Bio.Seq import Seq
+
+
+from corebio.seq import Seq, Alphabet, protein_alphabet
+import corebio.data as data
+from corebio.utils import Struct
+
+IUPACData = Struct( 
+    ambiguous_dna_letters = data.dna_extended_letters,
+    ambiguous_rna_letters = data.rna_extended_letters,
+    ambiguous_dna_values = data.dna_ambiguity,
+    ambiguous_rna_values = data.rna_ambiguity,
+    protein_letters = data.amino_acid_letters, 
+    unambiguous_dna_letters = data.dna_letters,
+    unambiguous_rna_letters = data.rna_letters,
+    )
+    
+IUPAC = Struct(
+    ambiguous_dna = Alphabet(IUPACData.ambiguous_dna_letters+'-?'),
+    ambiguous_rna = Alphabet(IUPACData.ambiguous_rna_letters+'-?'),
+    protein = protein_alphabet #Alphabet(IUPACData.protein_letters+'-?')
+    )
+
+
+
+# End Changes
+
+
+from Trees import Tree,NodeData
+
+C = False
+
+#try:
+#    import cnexus
+#except ImportError:
+#    C=False
+#else:
+#    C=True
+
+INTERLEAVE=70
+SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
+        'matrix','tree', 'utree','translate']
+KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets']
+PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
+MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
+WHITESPACE=' \t\n'
+#SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments
+SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored
+CHARSET='chars'
+TAXSET='taxa'
+
+class NexusError(Exception): pass
+
+class CharBuffer:
+    """Helps reading NEXUS-words and characters from a buffer."""
+    def __init__(self,string):
+        if string:
+            self.buffer=list(string)
+        else:
+            self.buffer=[]
+    
+    def peek(self):
+        if self.buffer:
+            return self.buffer[0]
+        else:
+            return None
+
+    def peek_nonwhitespace(self):
+        b=''.join(self.buffer).strip()
+        if b:
+            return b[0]
+        else:
+            return None
+                
+    def next(self):
+        if self.buffer:
+            return self.buffer.pop(0)
+        else:
+            return None
+
+    def next_nonwhitespace(self):
+        while True:
+            p=self.next()
+            if p is None:
+                break
+            if p not in WHITESPACE: 
+                return p
+        return None
+
+    def skip_whitespace(self):
+        while self.buffer[0] in WHITESPACE:
+            self.buffer=self.buffer[1:]
+    
+    def next_until(self,target):
+        for t in target:
+            try:
+                pos=self.buffer.index(t)
+            except ValueError:
+                pass
+            else:
+                found=''.join(self.buffer[:pos])
+                self.buffer=self.buffer[pos:]
+                return found
+        else:
+            return None
+
+    def peek_word(self,word):
+        return ''.join(self.buffer[:len(word)])==word
+    
+    def next_word(self):
+        """Return the next NEXUS word from a string, dealing with single and double quotes,
+        whitespace and punctuation.
+        """
+
+        word=[]
+        quoted=False
+        first=self.next_nonwhitespace()                                   # get first character
+        if not first:                                       # return empty if only whitespace left
+            return None
+        word.append(first)
+        if first=="'":                                      # word starts with a quote
+            quoted=True
+        elif first in PUNCTUATION:                          # if it's punctuation, return immediately
+            return first
+        while True:             
+            c=self.peek()
+            if c=="'":                                      # a quote?
+                word.append(self.next())                    # store quote 
+                if self.peek()=="'":                        # double quote
+                    skip=self.next()                        # skip second quote 
+                elif quoted:                                # second single quote ends word
+                    break
+            elif quoted:
+                word.append(self.next())                              # if quoted, then add anything
+            elif not c or c in PUNCTUATION or c in WHITESPACE:        # if not quoted and special character, stop 
+                break
+            else:
+                word.append(self.next())                    # standard character
+        return ''.join(word)
+                
+    def rest(self):
+        """Return the rest of the string without parsing."""
+        return ''.join(self.buffer)
+
+class StepMatrix:
+    """Calculate a stepmatrix for weighted parsimony.
+    See Wheeler (1990), Cladistics 6:269-275.
+    """
+    
+    def __init__(self,symbols,gap):
+        self.data={}
+        self.symbols=[s for s in symbols]
+        self.symbols.sort()
+        if gap:
+            self.symbols.append(gap)
+        for x in self.symbols:
+            for y in [s for s in self.symbols if s!=x]:
+                self.set(x,y,0)
+
+    def set(self,x,y,value):
+        if x>y:
+            x,y=y,x
+        self.data[x+y]=value
+
+    def add(self,x,y,value):
+        if x>y:
+            x,y=y,x
+        self.data[x+y]+=value
+
+    def sum(self):
+        return reduce(lambda x,y:x+y,self.data.values()) 
+
+    def transformation(self):
+        total=self.sum()
+        if total!=0:
+            for k in self.data:
+                self.data[k]=self.data[k]/float(total)
+        return self
+    
+    def weighting(self):
+        for k in self.data:
+            if self.data[k]!=0:
+                self.data[k]=-math.log(self.data[k])
+        return self
+
+    def smprint(self,name='your_name_here'):
+        matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
+        matrix+='        %s\n' % '        '.join(self.symbols)
+        for x in self.symbols:
+            matrix+='[%s]'.ljust(8) % x
+            for y in self.symbols:
+                if x==y:
+                    matrix+=' .       '
+                else:
+                    if x>y:
+                        x1,y1=y,x
+                    else:
+                        x1,y1=x,y
+                    if self.data[x1+y1]==0:
+                        matrix+='inf.     '
+                    else:
+                        matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
+            matrix+='\n'
+        matrix+=';\n'
+        return matrix
+    
+def safename(name,mrbayes=False):
+    """Return a taxon identifier according to NEXUS standard.
+    Wrap quotes around names with punctuation or whitespace, and double single quotes.
+    mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes.
+    """
+    if mrbayes:
+        safe=name.replace(' ','_')
+        safe=''.join([c for c in safe if c in MRBAYESSAFE])
+    else:
+        safe=name.replace("'","''")
+        if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)):
+            safe="'"+safe+"'"
+    return safe
+
+def quotestrip(word):
+    """Remove quotes and/or double quotes around identifiers."""
+    if not word:
+        return None
+    while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
+        word=word[1:-1]
+    return word
+
+def get_start_end(sequence, skiplist=['-','?']):
+    """Return position of first and last character which is not in skiplist (defaults to ['-','?'])."""
+
+    length=len(sequence)
+    if length==0:
+        return None,None
+    end=length-1
+    while end>=0 and (sequence[end] in skiplist):
+        end-=1
+    start=0
+    while start<length and (sequence[start] in skiplist):
+        start+=1
+    return start,end 
+    
+def _sort_keys_by_values(p):
+    """Returns a sorted list of keys of p sorted by values of p."""     
+    startpos=[(p[pn],pn) for pn in p if p[pn]]
+    startpos.sort()
+    return zip(*startpos)[1]
+    
+def _make_unique(l):
+    """Check that all values in list are unique and return a pruned and sorted list."""
+    l=list(sets.Set(l))
+    l.sort()
+    return l
+
+def _seqmatrix2strmatrix(matrix):
+    """Converts a Seq-object matrix to a plain sequence-string matrix."""
+    return dict([(t,matrix[t].tostring()) for t in matrix])
+
+def _compact4nexus(orig_list):
+    """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
+    into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
+    """
+    
+    if not orig_list:
+        return ''
+    orig_list=list(sets.Set(orig_list))
+    orig_list.sort()
+    shortlist=[]
+    clist=orig_list[:]
+    clist.append(clist[-1]+.5) # dummy value makes it easier 
+    while len(clist)>1:
+        step=1
+        for i,x in enumerate(clist):
+            if x==clist[0]+i*step:   # are we still in the right step?
+                continue
+            elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
+                # second element, and possibly at least 3 elements to link,
+                # and the next one is in the right step 
+                step=x-clist[0]
+            else:   # pattern broke, add all values before current position to new list
+                sub=clist[:i]
+                if len(sub)==1:
+                    shortlist.append(str(sub[0]+1))
+                else:
+                    if step==1:
+                        shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
+                    else:
+                        shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
+                clist=clist[i:]
+                break
+    return ' '.join(shortlist)
+
+def combine(matrices):
+    """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
+
+    combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
+    Character sets, character partitions and taxon sets are prefixed, readjusted and present in
+    the combined matrix. 
+    """
+    
+    if not matrices:
+        return None
+    name=matrices[0][0]
+    combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix
+    mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1)
+    if mixed_datatypes:
+        combined.datatype='None'    # dealing with mixed matrices is application specific. You take care of that yourself!
+    #    raise NexusError, 'Matrices must be of same datatype' 
+    combined.charlabels=None
+    combined.statelabels=None
+    combined.interleave=False
+    combined.translate=None
+
+    # rename taxon sets and character sets and name them with prefix
+    for cn,cs in combined.charsets.items():
+        combined.charsets['%s.%s' % (name,cn)]=cs
+        del combined.charsets[cn]
+    for tn,ts in combined.taxsets.items():
+        combined.taxsets['%s.%s' % (name,tn)]=ts
+        del combined.taxsets[tn]
+    # previous partitions usually don't make much sense in combined matrix
+    # just initiate one new partition parted by single matrices
+    combined.charpartitions={'combined':{name:range(combined.nchar)}}
+    for n,m in matrices[1:]:    # add all other matrices
+        both=[t for t in combined.taxlabels if t in m.taxlabels]
+        combined_only=[t for t in combined.taxlabels if t not in both]
+        m_only=[t for t in m.taxlabels if t not in both]
+        for t in both:
+            # concatenate sequences and unify gap and missing character symbols
+            combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
+        # replace date of missing taxa with symbol for missing data
+        for t in combined_only:
+            combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
+        for t in m_only:
+            combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
+                Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
+        combined.taxlabels.extend(m_only)    # new taxon list
+        for cn,cs in m.charsets.items():                # adjust character sets for new matrix
+            combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
+        if m.taxsets:
+            if not combined.taxsets:
+                combined.taxsets={}
+            combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()]))   # update taxon sets
+        combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)     # update new charpartition
+        # update charlabels
+        if m.charlabels:
+            if not combined.charlabels:
+                combined.charlabels={}
+            combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()]))
+        combined.nchar+=m.nchar # update nchar and ntax
+        combined.ntax+=len(m_only)
+        
+    return combined
+
+def _kill_comments_and_break_lines(text):
+    """Delete []-delimited comments out of a file and break into lines separated by ';'.
+    
+    stripped_text=_kill_comments_and_break_lines(text):
+    Nested and multiline comments are allowed. [ and ] symbols within single
+    or double quotes are ignored, newline ends a quote, all symbols with quotes are
+    treated the same (thus not quoting inside comments like [this character ']' ends a comment])
+    Special [&...] and [\...] comments remain untouched, if not inside standard comment.
+    Quotes inside special [& and [\ are treated as normal characters,
+    but no nesting inside these special comments allowed (like [&   [\   ]]).
+    ';' ist deleted from end of line.
+   
+    NOTE: this function is very slow for large files, and obsolete when using C extension cnexus
+    """
+    contents=CharBuffer(text)
+    newtext=[] 
+    newline=[]
+    quotelevel=''
+    speciallevel=False
+    commlevel=0
+    while True:
+        #plain=contents.next_until(["'",'"','[',']','\n',';'])   # search for next special character
+        #if not plain:
+        #    newline.append(contents.rest)                       # not found, just add the rest
+        #    break
+        #newline.append(plain)                                   # add intermediate text
+        t=contents.next()                                       # and get special character
+        if t is None:
+            break
+        if t==quotelevel and not (commlevel or speciallevel):            # matching quote ends quotation
+            quotelevel=''
+        elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation
+            quotelevel=t
+        elif not quotelevel and t=='[':                             # opening bracket outside a quote
+            if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel:
+                speciallevel=True
+            else:
+                commlevel+=1
+        elif not quotelevel and t==']':                             # closing bracket ioutside a quote 
+            if speciallevel:
+                speciallevel=False
+            else:
+                commlevel-=1
+                if commlevel<0:
+                    raise NexusError, 'Nexus formatting error: unmatched ]'
+                continue 
+        if commlevel==0:                        # copy if we're not in comment
+            if t==';' and not quotelevel:
+                newtext.append(''.join(newline))
+                newline=[]
+            else:
+                newline.append(t)
+    #level of comments should be 0 at the end of the file
+    if newline:
+        newtext.append('\n'.join(newline))
+    if commlevel>0:
+        raise NexusError, 'Nexus formatting error: unmatched ['
+    return newtext
+
+
+def _adjust_lines(lines):
+    """Adjust linebreaks to match ';', strip leading/trailing whitespace 
+
+    list_of_commandlines=_adjust_lines(input_text)
+    Lines are adjusted so that no linebreaks occur within a commandline 
+    (except matrix command line)
+    """
+    formatted_lines=[] 
+    for l in lines:
+        #Convert line endings
+        l=l.replace('\r\n','\n').replace('\r','\n').strip()
+        if l.lower().startswith('matrix'):
+            formatted_lines.append(l)
+        else:
+            l=l.replace('\n',' ')
+            if l:
+                formatted_lines.append(l)
+    return formatted_lines
+    
+def _replace_parenthesized_ambigs(seq,rev_ambig_values):
+    """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
+
+    opening=seq.find('(')
+    while opening>-1:
+        closing=seq.find(')')
+        if closing<0:
+            raise NexusError, 'Missing closing parenthesis in: '+seq
+        elif closing<opening:
+            raise NexusError, 'Missing opening parenthesis in: '+seq
+        ambig=[x for x in seq[opening+1:closing]]
+        ambig.sort()
+        ambig=''.join(ambig)
+        ambig_code=rev_ambig_values[ambig.upper()]
+        if ambig!=ambig.upper():
+            ambig_code=ambig_code.lower()
+        seq=seq[:opening]+ambig_code+seq[closing+1:]        
+        opening=seq.find('(')
+    return seq
+
+
+class Commandline:
+    """Represent a commandline as command and options."""
+    
+    def __init__(self, line, title):
+        self.options={}
+        options=[]
+        self.command=None
+        try:
+            #Assume matrix (all other command lines have been stripped of \n)
+            self.command, options = line.strip().split('\n', 1)
+        except ValueError: #Not matrix
+            #self.command,options=line.split(' ',1)  #no: could be tab or spaces (translate...)
+            self.command=line.split()[0]
+            options=' '.join(line.split()[1:])
+        self.command = self.command.strip().lower()
+        if self.command in SPECIAL_COMMANDS:   # special command that need newlines and order of options preserved
+            self.options=options.strip()
+        else:    
+            if len(options) > 0:
+                try: 
+                    options = options.replace('=', ' = ').split()
+                    valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
+                    indices = []
+                    for sl in valued_indices:
+                        indices.extend(sl)
+                    token_indices = [n for n in range(len(options)) if n not in indices]
+                    for opt in valued_indices:
+                        #self.options[options[opt[0]].lower()] = options[opt[2]].lower()
+                        self.options[options[opt[0]].lower()] = options[opt[2]]
+                    for token in token_indices:
+                        self.options[options[token].lower()] = None
+                except ValueError:
+                    raise NexusError, 'Incorrect formatting in line: %s' % line
+                
+class Block:
+    """Represent a NEXUS block with block name and list of commandlines ."""
+    def __init__(self,title=None):
+        self.title=title
+        self.commandlines=[]
+
+class Nexus(object):
+
+    __slots__=['original_taxon_order','__dict__']
+
+    def __init__(self, input=None):
+        self.ntax=0                     # number of taxa
+        self.nchar=0                    # number of characters
+        self.taxlabels=[]             # labels for taxa, ordered by their id
+        self.charlabels=None            # ... and for characters
+        self.statelabels=None           # ... and for states
+        self.datatype='dna'             # (standard), dna, rna, nucleotide, protein
+        self.respectcase=False          # case sensitivity
+        self.missing='?'                # symbol for missing characters
+        self.gap='-'                    # symbol for gap
+        self.symbols=None               # set of symbols
+        self.equate=None                # set of symbol synonyms
+        self.matchchar=None             # matching char for matrix representation
+        self.labels=None                # left, right, no
+        self.transpose=False            # whether matrix is transposed
+        self.interleave=False           # whether matrix is interleaved
+        self.tokens=False               # unsupported          
+        self.eliminate=None             # unsupported 
+        self.matrix=None                # ...
+        self.unknown_blocks=[]          # blocks we don't care about
+        self.taxsets={}
+        self.charsets={}
+        self.charpartitions={}
+        self.taxpartitions={}
+        self.trees=[]                   # list of Trees (instances of tree class)
+        self.translate=None             # Dict to translate taxon <-> taxon numbers
+        self.structured=[]              # structured input representation
+        self.set={}                     # dict of the set command to set various options 
+        self.options={}                 # dict of the options command in the data block
+   
+        # some defaults
+        self.options['gapmode']='missing'
+        
+        if input:
+            self.read(input)
+
+    def get_original_taxon_order(self):
+        """Included for backwards compatibility."""
+        return self.taxlabels
+    def set_original_taxon_order(self,value):
+        """Included for backwards compatibility."""
+        self.taxlabels=value
+    original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
+    
+    def read(self,input):
+        """Read and parse NEXUS imput (filename, file-handle, string."""
+        
+        # 1. Assume we have the name of a file in the execution dir
+        # Note we need to add parsing of the path to dir/filename
+        try:
+            file_contents = open(os.path.expanduser(input),'rU').read()
+            self.filename=input
+        except (TypeError,IOError,AttributeError):
+            #2 Assume we have a string from a fh.read()
+            #if isinstance(input, str):
+            #    file_contents = input
+            #    self.filename='input_string'
+            #3 Assume we have a file object
+            if hasattr(input,'read'): # file objects or StringIO objects
+                file_contents=input.read()
+                # GEC : Change next line so that StringIO objects work
+                #if input.name:
+                if hasattr(input, 'name'): 
+                    self.filename=input.name
+                else:
+                    self.filename='Unknown_nexus_file'
+            else:
+                print input.strip()[:6]
+                raise NexusError, 'Unrecognized input: %s ...' % input[:100]
+        if C:
+            decommented=cnexus.scanfile(file_contents)
+            #check for unmatched parentheses
+            if decommented=='[' or decommented==']':
+                raise NexusError, 'Unmatched %s' % decommented
+            # cnexus can't return lists, so in analogy we separate commandlines with chr(7)
+            # (a character that shoudn't be part of a nexus file under normal circumstances)
+            commandlines=_adjust_lines(decommented.split(chr(7)))
+        else:
+            commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
+        # get rid of stupid 'NEXUS token'
+        try:
+            if commandlines[0][:6].upper()=='#NEXUS':
+                commandlines[0]=commandlines[0][6:].strip()
+        except:
+            pass
+        # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands
+        nexus_block_gen = self._get_nexus_block(commandlines)
+        while 1:
+            try:
+                title, contents = nexus_block_gen.next()
+            except StopIteration:
+                break
+            if title in KNOWN_NEXUS_BLOCKS:
+                self._parse_nexus_block(title, contents)
+            else:
+                self._unknown_nexus_block(title, contents)
+
+    def _get_nexus_block(self,file_contents):
+        """Generator for looping through Nexus blocks."""
+        inblock=False
+        blocklines=[]
+        while file_contents:
+            cl=file_contents.pop(0)
+            if cl.lower().startswith('begin'):
+                if not inblock:
+                    inblock=True
+                    title=cl.split()[1].lower()
+                else:
+                    raise NexusError('Illegal block nesting in block %s' % title)
+            elif cl.lower().startswith('end'):
+                if inblock:
+                    inblock=False
+                    yield title,blocklines
+                    blocklines=[]
+                else:
+                    raise NexusError('Unmatched \'end\'.')
+            elif inblock:
+                blocklines.append(cl)
+
+    def _unknown_nexus_block(self,title, contents):
+        block = Block()
+        block.commandlines.append(contents)
+        block.title = title
+        self.unknown_blocks.append(block)        
+
+    def _parse_nexus_block(self,title, contents):
+        """Parse a known Nexus Block """
+        # attached the structered block representation
+        self._apply_block_structure(title, contents)
+        #now check for taxa,characters,data blocks. If this stuff is defined more than once
+        #the later occurences will override the previous ones.
+        block=self.structured[-1] 
+        for line in block.commandlines:
+            try:
+                getattr(self,'_'+line.command)(line.options)
+            except AttributeError:
+                raise
+                raise NexusError, 'Unknown command: %s ' % line.command
+    
+    def _dimensions(self,options):
+        if options.has_key('ntax'):
+            self.ntax=eval(options['ntax'])
+        if options.has_key('nchar'):
+            self.nchar=eval(options['nchar'])
+
+    def _format(self,options):
+        # print options
+        # we first need to test respectcase, then symbols (which depends on respectcase)
+        # then datatype (which, if standard, depends on symbols and respectcase in order to generate
+        # dicts for ambiguous values and alphabet
+        if options.has_key('respectcase'):
+            self.respectcase=True
+        # adjust symbols to for respectcase
+        if options.has_key('symbols'):
+            self.symbols=options['symbols']
+            if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\
+            (self.symbold.startswith("'") and self.symbols.endswith("'")):
+                self.symbols=self.symbols[1:-1].replace(' ','')
+            if not self.respectcase:
+                self.symbols=self.symbols.lower()+self.symbols.upper()
+                self.symbols=list(sets.Set(self.symbols))
+        if options.has_key('datatype'):
+            self.datatype=options['datatype'].lower()
+            if self.datatype=='dna' or self.datatype=='nucleotide':
+                self.alphabet=IUPAC.ambiguous_dna
+                self.ambiguous_values=IUPACData.ambiguous_dna_values
+                self.unambiguous_letters=IUPACData.unambiguous_dna_letters
+            elif self.datatype=='rna':
+                self.alphabet=IUPAC.ambiguous_rna
+                self.ambiguous_values=IUPACData.ambiguous_rna_values
+                self.unambiguous_letters=IUPACData.unambiguous_rna_letters
+            elif self.datatype=='protein':
+                self.alphabet=IUPAC.protein
+                self.ambiguous_values={'B':'DN','Z':'EQ','X':IUPACData.protein_letters} # that's how PAUP handles it
+                self.unambiguous_letters=IUPACData.protein_letters+'*' # stop-codon
+            elif self.datatype=='standard':
+                raise NexusError('Datatype standard is not yet supported.')
+                #self.alphabet=None
+                #self.ambiguous_values={}
+                #if not self.symbols:
+                #    self.symbols='01' # if nothing else defined, then 0 and 1 are the default states
+                #self.unambiguous_letters=self.symbols
+            else:
+                raise NexusError, 'Unsupported datatype: '+self.datatype
+            self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters
+            if not self.respectcase:
+                self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper()
+            #we have to sort the reverse ambig coding dict key characters:
+            #to be sure that it's 'ACGT':'N' and not 'GTCA':'N'
+            rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X'])
+            self.rev_ambiguous_values={}
+            for (k,v) in rev.items():
+                key=[c for c in k]
+                key.sort()
+                self.rev_ambiguous_values[''.join(key)]=v
+        #overwrite symbols for datype rna,dna,nucleotide
+        if self.datatype in ['dna','rna','nucleotide']:
+            self.symbols=self.alphabet.letters
+            if self.missing not in self.ambiguous_values:
+                self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap
+            self.ambiguous_values[self.gap]=self.gap
+        elif self.datatype=='standard':
+            if not self.symbols:
+                self.symbols=['1','0']
+        if options.has_key('missing'):
+            self.missing=options['missing'][0]
+        if options.has_key('gap'):
+            self.gap=options['gap'][0]
+        if options.has_key('equate'):
+            self.equate=options['equate']
+        if options.has_key('matchchar'):
+            self.matchchar=options['matchchar'][0]
+        if options.has_key('labels'):
+            self.labels=options['labels']
+        if options.has_key('transpose'):
+            raise NexusError, 'TRANSPOSE is not supported!'
+            self.transpose=True
+        if options.has_key('interleave'):
+            if options['interleave']==None or options['interleave'].lower()=='yes':
+                self.interleave=True
+        if options.has_key('tokens'):
+            self.tokens=True
+        if options.has_key('notokens'):
+            self.tokens=False
+
+
+    def _set(self,options):
+        self.set=options;
+
+    def _options(self,options):
+        self.options=options;
+
+    def _eliminate(self,options):
+        self.eliminate=options
+
+    def _taxlabels(self,options):
+        """Get taxon labels."""
+        self.taxlabels=[]
+        opts=CharBuffer(options)
+        while True:
+            taxon=quotestrip(opts.next_word())
+            if not taxon:
+                break
+            self.taxlabels.append(taxon)
+
+    def _check_taxlabels(self,taxon): 
+        """Check for presence of taxon in self.taxlabels."""
+        # According to NEXUS standard, underscores shall be treated as spaces...,
+        # so checking for identity is more difficult
+        nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
+        nexid=taxon.replace(' ','_')
+        return nextaxa.get(nexid)
+
+    def _charlabels(self,options):
+        self.charlabels={}
+        opts=CharBuffer(options)
+        while True:
+            try:
+                # get id and state
+                w=opts.next_word()
+                if w is None: # McClade saves and reads charlabel-lists with terminal comma?!
+                    break
+                identifier=self._resolve(w,set_type=CHARSET) 
+                state=quotestrip(opts.next_word())
+                self.charlabels[identifier]=state
+                # check for comma or end of command
+                c=opts.next_nonwhitespace()
+                if c is None:
+                    break
+                elif c!=',':
+                    raise NexusError,'Missing \',\' in line %s.' % options
+            except NexusError:
+                raise
+            except:
+                raise NexusError,'Format error in line %s.' % options
+
+    def _charstatelabels(self,options):
+        # warning: charstatelabels supports only charlabels-syntax!
+        self._charlabels(options)
+
+    def _statelabels(self,options):
+        #self.charlabels=options
+        #print 'Command statelabels is not supported and will be ignored.'
+        pass
+
+    def _matrix(self,options):
+        if not self.ntax or not self.nchar:
+            raise NexusError,'Dimensions must be specified before matrix!'
+        taxlabels_present=(self.taxlabels!=[])
+        self.matrix={}
+        taxcount=0
+        block_interleave=0
+        #eliminate empty lines and leading/trailing whitespace 
+        lines=[l.strip() for l in options.split('\n') if l.strip()<>'']
+        lineiter=iter(lines)
+        while 1:
+            try:
+                l=lineiter.next()
+            except StopIteration:
+                if taxcount<self.ntax:
+                    raise NexusError, 'Not enough taxa in matrix.'
+                elif taxcount>self.ntax:
+                    raise NexusError, 'Too many taxa in matrix.'
+                else:
+                    break
+            # count the taxa and check for interleaved matrix
+            taxcount+=1
+            ##print taxcount
+            if taxcount>self.ntax:
+                if not self.interleave:
+                    raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?'
+                else:
+                    taxcount=1
+                    block_interleave=1
+            #get taxon name and sequence
+            linechars=CharBuffer(l)
+            id=quotestrip(linechars.next_word())
+            l=linechars.rest().strip()
+            if taxlabels_present and not self._check_taxlabels(id):
+                raise NexusError,'Taxon '+id+' not found in taxlabels.'
+            chars=''
+            if self.interleave:
+                #interleaved matrix
+                #print 'In interleave'
+                if l:
+                    chars=''.join(l.split())
+                else:
+                    chars=''.join(lineiter.next().split())
+            else:
+                #non-interleaved matrix
+                chars=''.join(l.split())
+                while len(chars)<self.nchar:
+                    l=lineiter.next()
+                    chars+=''.join(l.split())
+            iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
+            #first taxon has the reference sequence if matchhar is used
+            if taxcount==1:
+                refseq=iupac_seq
+            else:
+                if self.matchchar:
+                    while 1:
+                        p=iupac_seq.tostring().find(self.matchchar)
+                        if p==-1:
+                            break
+                        iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
+            #check for invalid characters
+            for i,c in enumerate(iupac_seq.tostring()):
+                if c not in self.valid_characters and c!=self.gap and c!=self.missing:
+                    raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\
+                            % (id,c,l[i-10:i+10])
+            #add sequence to matrix
+            if block_interleave==0:
+                while self.matrix.has_key(id):
+                    if id.split('.')[-1].startswith('copy'):
+                        id='.'.join(id.split('.')[:-1])+'.copy'+str(eval('0'+id.split('.')[-1][4:])+1)
+                    else:
+                        id+='.copy'
+                    #raise NexusError, id+' already in matrix!\nError in: '+l
+                self.matrix[id]=iupac_seq
+                # add taxon name only if taxlabels is not alredy present
+                if not taxlabels_present:
+                    self.taxlabels.append(id)
+            else:
+                taxon_present=self._check_taxlabels(id)
+                if taxon_present:
+                    self.matrix[taxon_present]+=iupac_seq
+                else:
+                    raise NexusError, 'Taxon %s not in first block of interleaved matrix.' % id
+        #check all sequences for length according to nchar
+        for taxon in self.matrix:
+            if len(self.matrix[taxon])!=self.nchar:
+                raise NexusError,'Nchar ('+str(self.nchar)+') does not match data for taxon '+taxon
+
+    def _translate(self,options):
+        self.translate={}
+        opts=CharBuffer(options)
+        while True:
+            try:
+                # get id and state
+                identifier=int(opts.next_word()) 
+                label=quotestrip(opts.next_word())
+                self.translate[identifier]=label
+                # check for comma or end of command
+                c=opts.next_nonwhitespace()
+                if c is None:
+                    break
+                elif c!=',':
+                    raise NexusError,'Missing \',\' in line %s.' % options
+            except NexusError:
+                raise
+            except:
+                raise NexusError,'Format error in line %s.' % options
+
+    def _utree(self,options):
+        """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
+        self._tree(options)
+        
+    def _tree(self,options):
+        opts=CharBuffer(options)
+        name=opts.next_word()
+        if opts.next_nonwhitespace()!='=':
+            raise NexusError,'Syntax error in tree description: %s' % options[:50]
+        rooted=False
+        weight=1.0
+        while opts.peek_nonwhitespace()=='[':
+            open=opts.next_nonwhitespace()
+            symbol=opts.next()
+            if symbol!='&':
+                raise NexusError,'Illegal special comment [%s...] in tree description: %s' % (symbol, options[:50])
+            special=opts.next()
+            value=opts.next_until(']')
+            closing=opts.next()
+            if special=='R':
+                rooted=True
+            elif special=='U':
+                rooted=False
+            elif special=='W':
+                weight=float(value)
+        tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip())
+        # if there's an active translation table, translate
+        if self.translate:
+            for n in tree.get_terminals():
+                try:
+                    tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)])
+                except (ValueError,KeyError):
+                    raise NexusError,'Unable to substitue %s using \'translate\' data.' % tree.node(n).data.taxon
+        self.trees.append(tree)
+
+    def _apply_block_structure(self,title,lines):
+        block=Block('')
+        block.title = title            
+        for line in lines:
+            block.commandlines.append(Commandline(line, title))
+        self.structured.append(block)
+       
+    def _taxset(self, options):
+        name,taxa=self._get_indices(options,set_type=TAXSET)
+        self.taxsets[name]=_make_unique(taxa)
+                
+    def _charset(self, options):
+        name,sites=self._get_indices(options,set_type=CHARSET)
+        self.charsets[name]=_make_unique(sites)
+        
+    def _taxpartition(self, options):
+        taxpartition={}
+        quotelevel=False
+        opts=CharBuffer(options)
+        name=self._name_n_vector(opts)
+        if not name:
+            raise NexusError, 'Formatting error in taxpartition: %s ' % options
+        # now collect thesubbpartitions and parse them
+        # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
+        # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words
+        sub=''
+        while True:
+            w=opts.next()
+            if w is None or (w==',' and not quotelevel):
+                subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
+                taxpartition[subname]=_make_unique(subindices)
+                sub=''
+                if w is None:
+                    break
+            else:
+                if w=="'":
+                    quotelevel=not quotelevel
+                sub+=w
+        self.taxpartitions[name]=taxpartition
+
+    def _charpartition(self, options):
+        charpartition={}
+        quotelevel=False
+        opts=CharBuffer(options)
+        name=self._name_n_vector(opts)
+        if not name:
+            raise NexusError, 'Formatting error in charpartition: %s ' % options
+        # now collect thesubbpartitions and parse them
+        # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
+        sub=''
+        while True:
+            w=opts.next()
+            if w is None or (w==',' and not quotelevel):
+                subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
+                charpartition[subname]=_make_unique(subindices)
+                sub=''
+                if w is None:
+                    break
+            else:
+                if w=="'":
+                    quotelevel=not quotelevel
+                sub+=w
+        self.charpartitions[name]=charpartition
+
+    def _get_indices(self,options,set_type=CHARSET,separator='='):
+        """Parse the taxset/charset specification 
+            '1 2   3 - 5 dog cat   10- 20 \\ 3' --> [0,1,2,3,4,'dog','cat',10,13,16,19]
+        """
+        opts=CharBuffer(options)
+        name=self._name_n_vector(opts,separator=separator)
+        indices=self._parse_list(opts,set_type=set_type)
+        if indices is None:
+            raise NexusError, 'Formatting error in line: %s ' % options
+        return name,indices
+
+    def _name_n_vector(self,opts,separator='='):
+        """Extract name and check that it's not in vector format."""
+        rest=opts.rest()
+        name=opts.next_word()
+        if not name:
+            raise NexusError, 'Formatting error in line: %s ' % rest
+        name=quotestrip(name)
+        if opts.peek_nonwhitespace=='(':
+            open=opts.next_nonwhitespace()
+            qualifier=open.next_word()
+            close=opts.next_nonwhitespace()
+            if  qualifier.lower()=='vector':
+                raise NexusError, 'Unsupported VECTOR format in line %s' % (options)
+            elif qualifier.lower()!='standard':
+                raise NexusError, 'Unknown qualifier %s in line %s' % (qualifier,options)
+        if opts.next_nonwhitespace()!=separator:
+            raise NexusError, 'Formatting error in line: %s ' % rest
+        return name
+    
+    def _parse_list(self,options_buffer,set_type):
+        """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17-21],
+        (assuming dog is taxon no. 17 and cat is taxon no. 21).
+        """
+        plain_list=[]
+        if options_buffer.peek_nonwhitespace():
+            try:    # capture all possible exceptions and treat them as formatting erros, if they are not NexusError
+                while True:
+                    identifier=options_buffer.next_word()                                     # next list element
+                    if not identifier:                                              # end of list?
+                        break
+                    start=self._resolve(identifier,set_type=set_type)
+                    if options_buffer.peek_nonwhitespace()=='-':                            # followd by -
+                        end=start
+                        step=1
+                        # get hyphen and end of range
+                        hyphen=options_buffer.next_nonwhitespace()
+                        end=self._resolve(options_buffer.next_word(),set_type=set_type)
+                        if set_type==CHARSET:
+                            if options_buffer.peek_nonwhitespace()=='\\':                           # followd by \
+                                backslash=options_buffer.next_nonwhitespace()
+                                step=int(options_buffer.next_word())         # get backslash and step
+                            plain_list.extend(range(start,end+1,step)) 
+                        else:
+                            if type(start)==list or type(end)==list:
+                                raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier
+                            start=self.taxlabels.index(start)
+                            end=self.taxlabels.index(end)
+                            taxrange=self.taxlabels[start:end+1]
+                            plain_list.extend(taxrange)
+                    else:
+                        if type(start)==list:           # start was the name of charset or taxset
+                            plain_list.extend(start)
+                        else:                           # start was an ordinary identifier
+                            plain_list.append(start)
+            except NexusError:
+                raise
+            except:
+                return None
+        return plain_list
+        
+    def _resolve(self,identifier,set_type=None):
+        """Translate identifier in list into character/taxon index.
+        Characters (which are referred to by their index in Nexus.py):
+            Plain numbers are returned minus 1 (Nexus indices to python indices)
+            Text identifiers are translaterd into their indices (if plain character indentifiers),
+            the first hit in charlabels is returned (charlabels don't need to be unique)
+            or the range of indices is returned (if names of character sets).
+        Taxa (which are referred to by their unique name in Nexus.py):
+            Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
+            Names are returned unchanged (if plain taxon identifiers), or the names in
+            the corresponding taxon set is returned
+        """
+        identifier=quotestrip(identifier)
+        if not set_type:
+            raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
+        if set_type==CHARSET:
+            try:
+                n=int(identifier)
+            except ValueError:
+                if self.charlabels and identifier in self.charlabels.values():
+                    for k in self.charlabels:
+                        if self.charlabels[k]==identifier:
+                            return k
+                elif self.charsets and identifier in self.charsets:
+                    return self.charsets[identifier]
+                else:
+                    raise NexusError, 'Unknown character identifier: %s' % identifier
+            else:
+                if n<=self.nchar:
+                    return n-1
+                else:
+                    raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar)
+        elif set_type==TAXSET:
+            try:
+                n=int(identifier)
+            except ValueError:
+                taxlabels_id=self._check_taxlabels(identifier)
+                if taxlabels_id:
+                    return taxlabels_id
+                elif self.taxsets and identifier in self.taxsets:
+                    return self.taxsets[identifier]
+                else:
+                    raise NexusError, 'Unknown taxon identifier: %s' % identifier
+            else:
+                if n>0 and n<=self.ntax:
+                    return self.taxlabels[n-1]
+                else:
+                    raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax)
+        else:
+            raise NexusError('Unknown set specification: %s.'% set_type)
+
+    def _stateset(self, options):
+        #Not implemented
+        pass
+
+    def _changeset(self, options):
+        #Not implemented
+        pass
+
+    def _treeset(self, options):
+        #Not implemented
+        pass
+
+    def _treepartition(self, options):
+        #Not implemented
+        pass
+
+    def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
+            exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
+        """Writes a nexus file for each partition in charpartition. 
+           Only non-excluded characters and non-deleted taxa are included, just the data block is written.
+        """
+
+        if not matrix:
+            matrix=self.matrix
+        if not matrix:
+            return
+        if not filename:
+            filename=self.filename
+        if charpartition:
+            pfilenames={}
+            for p in charpartition:
+                total_exclude=[]+exclude
+                total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
+                total_exclude=_make_unique(total_exclude)
+                pcomment=comment+'\nPartition: '+p+'\n'
+                dot=filename.rfind('.')
+                if dot>0:
+                    pfilename=filename[:dot]+'_'+p+'.data'
+                else:
+                    pfilename=filename+'_'+p
+                pfilenames[p]=pfilename
+                self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
+                        interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
+                        mrbayes=mrbayes)
+            return pfilenames
+        else:
+            fn=self.filename+'.data'
+            self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
+                    exclude=exclude,delete=delete,comment=comment,append_sets=False,
+                    mrbayes=mrbayes)
+            return fn
+    
+    def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
+            blocksize=None, interleave=False, interleave_by_partition=False,\
+            comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False):
+        """ Writes a nexus file with data and sets block. Character sets and partitions 
+            are appended by default, and are adjusted according
+            to excluded characters (i.e. character sets still point to the same sites (not necessarily same positions),
+            without including the deleted characters. 
+        """
+        if not matrix:
+            matrix=self.matrix
+        if not matrix:
+            return
+        if not filename:
+            filename=self.filename
+        if [t for t in delete if not self._check_taxlabels(t)]:
+            raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels)))
+        if interleave_by_partition:
+            if not interleave_by_partition in self.charpartitions:
+                raise NexusError, 'Unknown partition: '+interleave_by_partition
+            else:
+                partition=self.charpartitions[interleave_by_partition]
+                # we need to sort the partition names by starting position before we exclude characters
+                names=_sort_keys_by_values(partition)
+                newpartition={}
+                for p in partition:
+                    newpartition[p]=[c for c in partition[p] if c not in exclude]
+        # how many taxa and how many characters are left?
+        undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
+        cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
+        ntax_adjusted=len(undelete)
+        nchar_adjusted=len(cropped_matrix[undelete[0]])
+        if not undelete or (undelete and undelete[0]==''):
+            return
+        if isinstance(filename,str):
+            try:
+                fh=open(filename,'w')
+            except IOError:
+                raise NexusError, 'Could not open %s for writing.' % filename
+        elif isinstance(filename,file):
+            fh=filename
+        if not omit_NEXUS:
+            fh.write('#NEXUS\n')
+        if comment:
+            fh.write('['+comment+']\n')
+        fh.write('begin data;\n')
+        fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
+        fh.write('\tformat datatype='+self.datatype)
+        if self.respectcase:
+            fh.write(' respectcase')
+        if self.missing:
+            fh.write(' missing='+self.missing)
+        if self.gap:
+            fh.write(' gap='+self.gap)
+        if self.matchchar:
+            fh.write(' matchchar='+self.matchchar)
+        if self.labels:
+            fh.write(' labels='+self.labels)
+        if self.equate:
+            fh.write(' equate='+self.equate)
+        if interleave or interleave_by_partition:
+            fh.write(' interleave')
+        fh.write(';\n')
+        #if self.taxlabels:
+        #    fh.write('taxlabels '+' '.join(self.taxlabels)+';\n')
+        if self.charlabels:
+            newcharlabels=self._adjust_charlabels(exclude=exclude)
+            clkeys=newcharlabels.keys()
+            clkeys.sort()
+            fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
+        fh.write('matrix\n')
+        if not blocksize:
+            if interleave:
+                blocksize=70
+            else:
+                blocksize=self.nchar
+        # delete deleted taxa and ecxclude excluded characters...
+        namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
+        if interleave_by_partition:
+            # interleave by partitions, but adjust partitions with regard to excluded characters
+            seek=0
+            for p in names:
+                fh.write('[%s: %s]\n' % (interleave_by_partition,p))
+                if len(newpartition[p])>0:
+                    for taxon in undelete:
+                        fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
+                        fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
+                    fh.write('\n')
+                else:
+                    fh.write('[empty]\n\n')
+                seek+=len(newpartition[p])
+        elif interleave:
+            for seek in range(0,nchar_adjusted,blocksize):
+                for taxon in undelete:
+                    fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
+                    fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
+                fh.write('\n')
+        else:
+            for taxon in undelete:
+                if blocksize<nchar_adjusted:
+                    fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
+                else:
+                    fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
+                for seek in range(0,nchar_adjusted,blocksize):
+                    fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
+        fh.write(';\nend;\n')
+        if append_sets:
+            fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
+        fh.close()
+        return filename
+
+    def append_sets(self,exclude=[],delete=[],mrbayes=False):
+        """Appends a sets block to <filename>."""
+        if not self.charsets and not self.taxsets and not self.charpartitions:
+            return ''
+        sets=['\nbegin sets']
+        # - now if characters have been excluded, the character sets need to be adjusted,
+        #   so that they still point to the right character positions
+        # calculate a list of offsets: for each deleted character, the following character position
+        # in the new file will have an additional offset of -1
+        offset=0
+        offlist=[]
+        for c in range(self.nchar):
+            if c in exclude:
+                offset+=1
+                offlist.append(-1)  # dummy value as these character positions are excluded
+            else:
+                offlist.append(c-offset)
+        # now adjust each of the character sets
+        for n,ns in self.charsets.items():
+            cset=[offlist[c] for c in ns if c not in exclude]
+            if cset: 
+                sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 
+        for n,s in self.taxsets.items():
+            tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
+            if tset:
+                sets.append('taxset %s = %s' % (safename(n),' '.join(tset))) 
+        for n,p in self.charpartitions.items():
+            # as characters have been excluded, the partitions must be adjusted
+            # if a partition is empty, it will be omitted from the charpartition command
+            # (although paup allows charpartition part=t1:,t2:,t3:1-100)
+            names=_sort_keys_by_values(p)
+            newpartition={}
+            for sn in names:
+                nsp=[offlist[c] for c in p[sn] if c not in exclude]
+                if nsp:
+                    newpartition[sn]=nsp
+            if newpartition:
+                sets.append('charpartition %s = %s' % (safename(n),\
+                ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
+        # now write charpartititions, much easier than charpartitions
+        for n,p in self.taxpartitions.items():
+            names=_sort_keys_by_values(p)
+            newpartition={}
+            for sn in names:
+                nsp=[t for t in p[sn] if t not in delete]
+                if nsp:
+                    newpartition[sn]=nsp
+            if newpartition:
+                sets.append('taxpartition %s = %s' % (safename(n),\
+                ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
+        # add 'end' and return everything
+        sets.append('end;\n')
+        return ';\n'.join(sets)
+        f.close()
+    
+    def export_fasta(self, filename=None, width=70):
+        """Writes matrix into a fasta file: (self, filename=None, width=70)."""       
+        if not filename:
+            if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
+                filename='.'.join(self.filename.split('.')[:-1])+'.fas'
+            else:
+                filename=self.filename+'.fas'
+        fh=open(filename,'w')
+        for taxon in self.taxlabels:
+            fh.write('>'+safename(taxon)+'\n')
+            for i in range(0, len(self.matrix[taxon].tostring()), width):
+                fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')    
+        fh.close()
+
+    def constant(self,matrix=None,delete=[],exclude=[]):
+        """Return a list with all constant characters."""
+        if not matrix:
+            matrix=self.matrix
+        undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
+        if not undelete:
+            return None
+        elif len(undelete)==1:
+            return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
+        # get the first sequence and expand all ambiguous values
+        constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 
+                x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
+        for taxon in undelete[1:]:
+            newconstant=[]
+            for site in constant:
+                #print '%d (paup=%d)' % (site[0],site[0]+1),
+                seqsite=matrix[taxon][site[0]].upper()
+                #print seqsite,'checked against',site[1],'\t',
+                if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 
+                    # missing or same as before  -> ok
+                    newconstant.append(site)
+                elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
+                    # subset of an ambig or only missing in previous -> take subset
+                    newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
+                elif seqsite in self.ambiguous_values:  # is it an ambig: check the intersection with prev. values
+                    intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1]))
+                    if intersect:
+                        newconstant.append((site[0],''.join(intersect)))
+                    #    print 'ok'
+                    #else:
+                    #    print 'failed'
+                #else:
+                #    print 'failed'
+            constant=newconstant
+        cpos=[s[0] for s in constant]
+        return constant
+        # return [x[0] for x in constant]
+
+    def cstatus(self,site,delete=[],narrow=True):
+        """Summarize character.
+        narrow=True:  paup-mode (a c ? --> ac; ? ? ? --> ?)
+        narrow=false:           (a c ? --> a c g t -; ? ? ? --> a c g t -)
+        """
+        undelete=[t for t in self.taxlabels if t not in delete]
+        if not undelete:
+            return None
+        cstatus=[]
+        for t in undelete:
+            c=self.matrix[t][site].upper()
+            if self.options.get('gapmode')=='missing' and c==self.gap:
+                c=self.missing
+            if narrow and c==self.missing:
+                if c not in cstatus:
+                    cstatus.append(c)
+            else:
+                cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
+        if self.missing in cstatus and narrow and len(cstatus)>1:
+            cstatus=[c for c in cstatus if c!=self.missing]
+        cstatus.sort()
+        return cstatus
+
+    def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
+        """Calculates a stepmatrix for weighted parsimony.
+        See Wheeler (1990), Cladistics 6:269-275 and
+        Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
+        """    
+        m=StepMatrix(self.unambiguous_letters,self.gap)
+        for site in [s for s in range(self.nchar) if s not in exclude]:
+            cstatus=self.cstatus(site,delete)
+            for i,b1 in enumerate(cstatus[:-1]):
+                for b2 in cstatus[i+1:]:
+                    m.add(b1.upper(),b2.upper(),1)
+        return m.transformation().weighting().smprint(name=name)
+
+    def crop_matrix(self,matrix=None, delete=[], exclude=[]):
+        """Return a matrix without deleted taxa and excluded characters."""
+        if not matrix:
+            matrix=self.matrix
+        if [t for t in delete if not self._check_taxlabels(t)]:
+            raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels))
+        if exclude!=[]:
+            undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
+            if not undelete:
+                return {}
+            m=[matrix[k].tostring() for k in undelete]
+            zipped_m=zip(*m)
+            sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
+            if sitesm==[]:
+                return dict([(t,Seq('',self.alphabet)) for t in undelete])
+            else:
+                zipped_sitesm=zip(*sitesm)
+                m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
+                return dict(zip(undelete,m))
+        else:
+            return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
+           
+    def bootstrap(self,matrix=None,delete=[],exclude=[]):
+        """Return a bootstrapped matrix."""
+        if not matrix:
+            matrix=self.matrix
+        seqobjects=isinstance(matrix[matrix.keys()[0]],Seq)         # remember if Seq objects
+        cm=self.crop_matrix(delete=delete,exclude=exclude)          # crop data out
+        if not cm:                                                  # everything deleted?
+            return {}
+        elif len(cm[cm.keys()[0]])==0:                              # everything excluded?
+            return cm
+        undelete=[t for t in self.taxlabels if t in cm]  
+        if seqobjects:
+            sitesm=zip(*[cm[t].tostring() for t in undelete])
+            alphabet=matrix[matrix.keys()[0]].alphabet
+        else:
+            sitesm=zip(*[cm[t] for t in undelete])
+        bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
+        bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
+        if seqobjects:
+            bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
+        return dict(zip(undelete,bootstrapseqs)) 
+
+    def add_sequence(self,name,sequence):
+        """Adds a sequence to the matrix."""
+        if not name:
+            raise NexusError, 'New sequence must have a name'
+        diff=self.nchar-len(sequence)
+        if diff<0:
+            self.insert_gap(self.nchar,-diff)
+        elif diff>0:
+            sequence+=self.missing*diff
+        
+        self.matrix[name]=Seq(sequence,self.alphabet)
+        self.ntax+=1
+        self.taxlabels.append(name)
+        #taxlabels?
+
+    def insert_gap(self,pos,n=1,leftgreedy=False):
+        """Add a gap into the matrix and adjust charsets and partitions.
+        
+        pos=0: first position
+        pos=nchar: last position
+        """
+
+        def _adjust(set,x,d,leftgreedy=False):
+            """Adjusts chartacter sets if gaps are inserted, taking care of
+            new gaps within a coherent character set.""" 
+            # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3  8 9 10 11 13 14 15
+            # then the adjusted set will be 1 2 3  8 9 10 11 12 13 14 15 16 17 18 
+            # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18
+            set.sort()
+            addpos=0
+            for i,c in enumerate(set):
+                if c>=x:
+                    set[i]=c+d
+                # if we add gaps within a group of characters, we want the gap position included in this group
+                if c==x:
+                    if leftgreedy or (i>0 and set[i-1]==c-1):  
+                        addpos=i
+            if addpos>0:
+                set[addpos:addpos]=range(x,x+d)
+            return set
+
+        if pos<0 or pos>self.nchar:
+            raise NexusError('Illegal gap position: %d' % pos)
+        if n==0:
+            return
+        sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
+        sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
+        # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\
+        #        i,taxon in enumerate(self.taxlabels)])
+        zipped=zip(*sitesm)
+        mapped=map(''.join,zipped)
+        listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
+        self.matrix=dict(listed) 
+        self.nchar+=n
+        # now adjust character sets
+        for i,s in self.charsets.items():
+            self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
+        for p in self.charpartitions:
+            for sp,s in self.charpartitions[p].items():
+                self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
+        # now adjust character state labels
+        self.charlabels=self._adjust_charlabels(insert=[pos]*n)
+        return self.charlabels
+      
+    def _adjust_charlabels(self,exclude=None,insert=None):
+        """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
+        if exclude and insert:
+            raise NexusError, 'Can\'t exclude and insert at the same time'
+        if not self.charlabels:
+            return None
+        labels=self.charlabels.keys()
+        labels.sort()
+        newcharlabels={}
+        if exclude:
+            exclude.sort()
+            exclude.append(sys.maxint)
+            excount=0
+            for c in labels:
+                if not c in exclude:
+                    while c>exclude[excount]:
+                        excount+=1
+                    newcharlabels[c-excount]=self.charlabels[c]
+        elif insert:
+            insert.sort()
+            insert.append(sys.maxint)
+            icount=0
+            for c in labels:
+                while c>=insert[icount]:
+                    icount+=1
+                newcharlabels[c+icount]=self.charlabels[c]
+        else:
+            return self.charlabels
+        return newcharlabels
+
+    def invert(self,charlist):
+        """Returns all character indices that are not in charlist."""
+        return [c for c in range(self.nchar) if c not in charlist]
+
+    def gaponly(self,include_missing=False):
+        """Return gap-only sites."""
+        gap=sets.Set(self.gap)
+        if include_missing:
+            gap.add(self.missing)
+        sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
+        gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)]
+        return gaponly 
+        
+    def terminal_gap_to_missing(self,missing=None,skip_n=True):
+        """Replaces all terminal gaps with missing character.
+        
+        Mixtures like ???------??------- are properly resolved."""
+        
+        if not missing:
+            missing=self.missing
+        replace=[self.missing,self.gap]
+        if not skip_n:
+            replace.extend(['n','N'])
+        for taxon in self.taxlabels:
+            sequence=self.matrix[taxon].tostring()
+            length=len(sequence)
+            start,end=get_start_end(sequence,skiplist=replace)
+            sequence=sequence[:end+1]+missing*(length-end-1)
+            sequence=start*missing+sequence[start:]
+            assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon
+            self.matrix[taxon]=Seq(sequence,self.alphabet)
+
author	davidmurphy
date	Thu, 27 Oct 2011 12:09:09 -0400
parents
children