Mercurial > repos > davidmurphy > codonlogo
diff corebio/seq_io/_nexus/__init__.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/corebio/seq_io/_nexus/__init__.py Thu Oct 27 12:09:09 2011 -0400 @@ -0,0 +1,1612 @@ +# Nexus.py - a NEXUS parser +# +# Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +# +# Bug reports welcome: fkauff@duke.edu +# + +"""Parse the contents of a nexus file. +Based upon 'NEXUS: An extensible file format for systematic information' +Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 + +Authors: Frank Kauff and Cymon J. Cox +""" + +import os,sys, math, random, copy +import sets + +# --- Changes from Bio.Nexus --- +# Renamed Nexus.py to __init__.py. Helps with api documentation. +# One further change in file, tagged with 'GEC' +#from Bio.Alphabet import IUPAC +#from Bio.Data import IUPACData +#from Bio.Seq import Seq + + +from corebio.seq import Seq, Alphabet, protein_alphabet +import corebio.data as data +from corebio.utils import Struct + +IUPACData = Struct( + ambiguous_dna_letters = data.dna_extended_letters, + ambiguous_rna_letters = data.rna_extended_letters, + ambiguous_dna_values = data.dna_ambiguity, + ambiguous_rna_values = data.rna_ambiguity, + protein_letters = data.amino_acid_letters, + unambiguous_dna_letters = data.dna_letters, + unambiguous_rna_letters = data.rna_letters, + ) + +IUPAC = Struct( + ambiguous_dna = Alphabet(IUPACData.ambiguous_dna_letters+'-?'), + ambiguous_rna = Alphabet(IUPACData.ambiguous_rna_letters+'-?'), + protein = protein_alphabet #Alphabet(IUPACData.protein_letters+'-?') + ) + + + +# End Changes + + +from Trees import Tree,NodeData + +C = False + +#try: +# import cnexus +#except ImportError: +# C=False +#else: +# C=True + +INTERLEAVE=70 +SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\ + 'matrix','tree', 'utree','translate'] +KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets'] +PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' +MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' +WHITESPACE=' \t\n' +#SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments +SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored +CHARSET='chars' +TAXSET='taxa' + +class NexusError(Exception): pass + +class CharBuffer: + """Helps reading NEXUS-words and characters from a buffer.""" + def __init__(self,string): + if string: + self.buffer=list(string) + else: + self.buffer=[] + + def peek(self): + if self.buffer: + return self.buffer[0] + else: + return None + + def peek_nonwhitespace(self): + b=''.join(self.buffer).strip() + if b: + return b[0] + else: + return None + + def next(self): + if self.buffer: + return self.buffer.pop(0) + else: + return None + + def next_nonwhitespace(self): + while True: + p=self.next() + if p is None: + break + if p not in WHITESPACE: + return p + return None + + def skip_whitespace(self): + while self.buffer[0] in WHITESPACE: + self.buffer=self.buffer[1:] + + def next_until(self,target): + for t in target: + try: + pos=self.buffer.index(t) + except ValueError: + pass + else: + found=''.join(self.buffer[:pos]) + self.buffer=self.buffer[pos:] + return found + else: + return None + + def peek_word(self,word): + return ''.join(self.buffer[:len(word)])==word + + def next_word(self): + """Return the next NEXUS word from a string, dealing with single and double quotes, + whitespace and punctuation. + """ + + word=[] + quoted=False + first=self.next_nonwhitespace() # get first character + if not first: # return empty if only whitespace left + return None + word.append(first) + if first=="'": # word starts with a quote + quoted=True + elif first in PUNCTUATION: # if it's punctuation, return immediately + return first + while True: + c=self.peek() + if c=="'": # a quote? + word.append(self.next()) # store quote + if self.peek()=="'": # double quote + skip=self.next() # skip second quote + elif quoted: # second single quote ends word + break + elif quoted: + word.append(self.next()) # if quoted, then add anything + elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop + break + else: + word.append(self.next()) # standard character + return ''.join(word) + + def rest(self): + """Return the rest of the string without parsing.""" + return ''.join(self.buffer) + +class StepMatrix: + """Calculate a stepmatrix for weighted parsimony. + See Wheeler (1990), Cladistics 6:269-275. + """ + + def __init__(self,symbols,gap): + self.data={} + self.symbols=[s for s in symbols] + self.symbols.sort() + if gap: + self.symbols.append(gap) + for x in self.symbols: + for y in [s for s in self.symbols if s!=x]: + self.set(x,y,0) + + def set(self,x,y,value): + if x>y: + x,y=y,x + self.data[x+y]=value + + def add(self,x,y,value): + if x>y: + x,y=y,x + self.data[x+y]+=value + + def sum(self): + return reduce(lambda x,y:x+y,self.data.values()) + + def transformation(self): + total=self.sum() + if total!=0: + for k in self.data: + self.data[k]=self.data[k]/float(total) + return self + + def weighting(self): + for k in self.data: + if self.data[k]!=0: + self.data[k]=-math.log(self.data[k]) + return self + + def smprint(self,name='your_name_here'): + matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) + matrix+=' %s\n' % ' '.join(self.symbols) + for x in self.symbols: + matrix+='[%s]'.ljust(8) % x + for y in self.symbols: + if x==y: + matrix+=' . ' + else: + if x>y: + x1,y1=y,x + else: + x1,y1=x,y + if self.data[x1+y1]==0: + matrix+='inf. ' + else: + matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) + matrix+='\n' + matrix+=';\n' + return matrix + +def safename(name,mrbayes=False): + """Return a taxon identifier according to NEXUS standard. + Wrap quotes around names with punctuation or whitespace, and double single quotes. + mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes. + """ + if mrbayes: + safe=name.replace(' ','_') + safe=''.join([c for c in safe if c in MRBAYESSAFE]) + else: + safe=name.replace("'","''") + if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)): + safe="'"+safe+"'" + return safe + +def quotestrip(word): + """Remove quotes and/or double quotes around identifiers.""" + if not word: + return None + while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): + word=word[1:-1] + return word + +def get_start_end(sequence, skiplist=['-','?']): + """Return position of first and last character which is not in skiplist (defaults to ['-','?']).""" + + length=len(sequence) + if length==0: + return None,None + end=length-1 + while end>=0 and (sequence[end] in skiplist): + end-=1 + start=0 + while start<length and (sequence[start] in skiplist): + start+=1 + return start,end + +def _sort_keys_by_values(p): + """Returns a sorted list of keys of p sorted by values of p.""" + startpos=[(p[pn],pn) for pn in p if p[pn]] + startpos.sort() + return zip(*startpos)[1] + +def _make_unique(l): + """Check that all values in list are unique and return a pruned and sorted list.""" + l=list(sets.Set(l)) + l.sort() + return l + +def _seqmatrix2strmatrix(matrix): + """Converts a Seq-object matrix to a plain sequence-string matrix.""" + return dict([(t,matrix[t].tostring()) for t in matrix]) + +def _compact4nexus(orig_list): + """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) + into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). + """ + + if not orig_list: + return '' + orig_list=list(sets.Set(orig_list)) + orig_list.sort() + shortlist=[] + clist=orig_list[:] + clist.append(clist[-1]+.5) # dummy value makes it easier + while len(clist)>1: + step=1 + for i,x in enumerate(clist): + if x==clist[0]+i*step: # are we still in the right step? + continue + elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: + # second element, and possibly at least 3 elements to link, + # and the next one is in the right step + step=x-clist[0] + else: # pattern broke, add all values before current position to new list + sub=clist[:i] + if len(sub)==1: + shortlist.append(str(sub[0]+1)) + else: + if step==1: + shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) + else: + shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) + clist=clist[i:] + break + return ' '.join(shortlist) + +def combine(matrices): + """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. + + combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] + Character sets, character partitions and taxon sets are prefixed, readjusted and present in + the combined matrix. + """ + + if not matrices: + return None + name=matrices[0][0] + combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix + mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1) + if mixed_datatypes: + combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! + # raise NexusError, 'Matrices must be of same datatype' + combined.charlabels=None + combined.statelabels=None + combined.interleave=False + combined.translate=None + + # rename taxon sets and character sets and name them with prefix + for cn,cs in combined.charsets.items(): + combined.charsets['%s.%s' % (name,cn)]=cs + del combined.charsets[cn] + for tn,ts in combined.taxsets.items(): + combined.taxsets['%s.%s' % (name,tn)]=ts + del combined.taxsets[tn] + # previous partitions usually don't make much sense in combined matrix + # just initiate one new partition parted by single matrices + combined.charpartitions={'combined':{name:range(combined.nchar)}} + for n,m in matrices[1:]: # add all other matrices + both=[t for t in combined.taxlabels if t in m.taxlabels] + combined_only=[t for t in combined.taxlabels if t not in both] + m_only=[t for t in m.taxlabels if t not in both] + for t in both: + # concatenate sequences and unify gap and missing character symbols + combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) + # replace date of missing taxa with symbol for missing data + for t in combined_only: + combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) + for t in m_only: + combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ + Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) + combined.taxlabels.extend(m_only) # new taxon list + for cn,cs in m.charsets.items(): # adjust character sets for new matrix + combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] + if m.taxsets: + if not combined.taxsets: + combined.taxsets={} + combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets + combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition + # update charlabels + if m.charlabels: + if not combined.charlabels: + combined.charlabels={} + combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()])) + combined.nchar+=m.nchar # update nchar and ntax + combined.ntax+=len(m_only) + + return combined + +def _kill_comments_and_break_lines(text): + """Delete []-delimited comments out of a file and break into lines separated by ';'. + + stripped_text=_kill_comments_and_break_lines(text): + Nested and multiline comments are allowed. [ and ] symbols within single + or double quotes are ignored, newline ends a quote, all symbols with quotes are + treated the same (thus not quoting inside comments like [this character ']' ends a comment]) + Special [&...] and [\...] comments remain untouched, if not inside standard comment. + Quotes inside special [& and [\ are treated as normal characters, + but no nesting inside these special comments allowed (like [& [\ ]]). + ';' ist deleted from end of line. + + NOTE: this function is very slow for large files, and obsolete when using C extension cnexus + """ + contents=CharBuffer(text) + newtext=[] + newline=[] + quotelevel='' + speciallevel=False + commlevel=0 + while True: + #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character + #if not plain: + # newline.append(contents.rest) # not found, just add the rest + # break + #newline.append(plain) # add intermediate text + t=contents.next() # and get special character + if t is None: + break + if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation + quotelevel='' + elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation + quotelevel=t + elif not quotelevel and t=='[': # opening bracket outside a quote + if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel: + speciallevel=True + else: + commlevel+=1 + elif not quotelevel and t==']': # closing bracket ioutside a quote + if speciallevel: + speciallevel=False + else: + commlevel-=1 + if commlevel<0: + raise NexusError, 'Nexus formatting error: unmatched ]' + continue + if commlevel==0: # copy if we're not in comment + if t==';' and not quotelevel: + newtext.append(''.join(newline)) + newline=[] + else: + newline.append(t) + #level of comments should be 0 at the end of the file + if newline: + newtext.append('\n'.join(newline)) + if commlevel>0: + raise NexusError, 'Nexus formatting error: unmatched [' + return newtext + + +def _adjust_lines(lines): + """Adjust linebreaks to match ';', strip leading/trailing whitespace + + list_of_commandlines=_adjust_lines(input_text) + Lines are adjusted so that no linebreaks occur within a commandline + (except matrix command line) + """ + formatted_lines=[] + for l in lines: + #Convert line endings + l=l.replace('\r\n','\n').replace('\r','\n').strip() + if l.lower().startswith('matrix'): + formatted_lines.append(l) + else: + l=l.replace('\n',' ') + if l: + formatted_lines.append(l) + return formatted_lines + +def _replace_parenthesized_ambigs(seq,rev_ambig_values): + """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" + + opening=seq.find('(') + while opening>-1: + closing=seq.find(')') + if closing<0: + raise NexusError, 'Missing closing parenthesis in: '+seq + elif closing<opening: + raise NexusError, 'Missing opening parenthesis in: '+seq + ambig=[x for x in seq[opening+1:closing]] + ambig.sort() + ambig=''.join(ambig) + ambig_code=rev_ambig_values[ambig.upper()] + if ambig!=ambig.upper(): + ambig_code=ambig_code.lower() + seq=seq[:opening]+ambig_code+seq[closing+1:] + opening=seq.find('(') + return seq + + +class Commandline: + """Represent a commandline as command and options.""" + + def __init__(self, line, title): + self.options={} + options=[] + self.command=None + try: + #Assume matrix (all other command lines have been stripped of \n) + self.command, options = line.strip().split('\n', 1) + except ValueError: #Not matrix + #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) + self.command=line.split()[0] + options=' '.join(line.split()[1:]) + self.command = self.command.strip().lower() + if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved + self.options=options.strip() + else: + if len(options) > 0: + try: + options = options.replace('=', ' = ').split() + valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] + indices = [] + for sl in valued_indices: + indices.extend(sl) + token_indices = [n for n in range(len(options)) if n not in indices] + for opt in valued_indices: + #self.options[options[opt[0]].lower()] = options[opt[2]].lower() + self.options[options[opt[0]].lower()] = options[opt[2]] + for token in token_indices: + self.options[options[token].lower()] = None + except ValueError: + raise NexusError, 'Incorrect formatting in line: %s' % line + +class Block: + """Represent a NEXUS block with block name and list of commandlines .""" + def __init__(self,title=None): + self.title=title + self.commandlines=[] + +class Nexus(object): + + __slots__=['original_taxon_order','__dict__'] + + def __init__(self, input=None): + self.ntax=0 # number of taxa + self.nchar=0 # number of characters + self.taxlabels=[] # labels for taxa, ordered by their id + self.charlabels=None # ... and for characters + self.statelabels=None # ... and for states + self.datatype='dna' # (standard), dna, rna, nucleotide, protein + self.respectcase=False # case sensitivity + self.missing='?' # symbol for missing characters + self.gap='-' # symbol for gap + self.symbols=None # set of symbols + self.equate=None # set of symbol synonyms + self.matchchar=None # matching char for matrix representation + self.labels=None # left, right, no + self.transpose=False # whether matrix is transposed + self.interleave=False # whether matrix is interleaved + self.tokens=False # unsupported + self.eliminate=None # unsupported + self.matrix=None # ... + self.unknown_blocks=[] # blocks we don't care about + self.taxsets={} + self.charsets={} + self.charpartitions={} + self.taxpartitions={} + self.trees=[] # list of Trees (instances of tree class) + self.translate=None # Dict to translate taxon <-> taxon numbers + self.structured=[] # structured input representation + self.set={} # dict of the set command to set various options + self.options={} # dict of the options command in the data block + + # some defaults + self.options['gapmode']='missing' + + if input: + self.read(input) + + def get_original_taxon_order(self): + """Included for backwards compatibility.""" + return self.taxlabels + def set_original_taxon_order(self,value): + """Included for backwards compatibility.""" + self.taxlabels=value + original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) + + def read(self,input): + """Read and parse NEXUS imput (filename, file-handle, string.""" + + # 1. Assume we have the name of a file in the execution dir + # Note we need to add parsing of the path to dir/filename + try: + file_contents = open(os.path.expanduser(input),'rU').read() + self.filename=input + except (TypeError,IOError,AttributeError): + #2 Assume we have a string from a fh.read() + #if isinstance(input, str): + # file_contents = input + # self.filename='input_string' + #3 Assume we have a file object + if hasattr(input,'read'): # file objects or StringIO objects + file_contents=input.read() + # GEC : Change next line so that StringIO objects work + #if input.name: + if hasattr(input, 'name'): + self.filename=input.name + else: + self.filename='Unknown_nexus_file' + else: + print input.strip()[:6] + raise NexusError, 'Unrecognized input: %s ...' % input[:100] + if C: + decommented=cnexus.scanfile(file_contents) + #check for unmatched parentheses + if decommented=='[' or decommented==']': + raise NexusError, 'Unmatched %s' % decommented + # cnexus can't return lists, so in analogy we separate commandlines with chr(7) + # (a character that shoudn't be part of a nexus file under normal circumstances) + commandlines=_adjust_lines(decommented.split(chr(7))) + else: + commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents)) + # get rid of stupid 'NEXUS token' + try: + if commandlines[0][:6].upper()=='#NEXUS': + commandlines[0]=commandlines[0][6:].strip() + except: + pass + # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands + nexus_block_gen = self._get_nexus_block(commandlines) + while 1: + try: + title, contents = nexus_block_gen.next() + except StopIteration: + break + if title in KNOWN_NEXUS_BLOCKS: + self._parse_nexus_block(title, contents) + else: + self._unknown_nexus_block(title, contents) + + def _get_nexus_block(self,file_contents): + """Generator for looping through Nexus blocks.""" + inblock=False + blocklines=[] + while file_contents: + cl=file_contents.pop(0) + if cl.lower().startswith('begin'): + if not inblock: + inblock=True + title=cl.split()[1].lower() + else: + raise NexusError('Illegal block nesting in block %s' % title) + elif cl.lower().startswith('end'): + if inblock: + inblock=False + yield title,blocklines + blocklines=[] + else: + raise NexusError('Unmatched \'end\'.') + elif inblock: + blocklines.append(cl) + + def _unknown_nexus_block(self,title, contents): + block = Block() + block.commandlines.append(contents) + block.title = title + self.unknown_blocks.append(block) + + def _parse_nexus_block(self,title, contents): + """Parse a known Nexus Block """ + # attached the structered block representation + self._apply_block_structure(title, contents) + #now check for taxa,characters,data blocks. If this stuff is defined more than once + #the later occurences will override the previous ones. + block=self.structured[-1] + for line in block.commandlines: + try: + getattr(self,'_'+line.command)(line.options) + except AttributeError: + raise + raise NexusError, 'Unknown command: %s ' % line.command + + def _dimensions(self,options): + if options.has_key('ntax'): + self.ntax=eval(options['ntax']) + if options.has_key('nchar'): + self.nchar=eval(options['nchar']) + + def _format(self,options): + # print options + # we first need to test respectcase, then symbols (which depends on respectcase) + # then datatype (which, if standard, depends on symbols and respectcase in order to generate + # dicts for ambiguous values and alphabet + if options.has_key('respectcase'): + self.respectcase=True + # adjust symbols to for respectcase + if options.has_key('symbols'): + self.symbols=options['symbols'] + if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ + (self.symbold.startswith("'") and self.symbols.endswith("'")): + self.symbols=self.symbols[1:-1].replace(' ','') + if not self.respectcase: + self.symbols=self.symbols.lower()+self.symbols.upper() + self.symbols=list(sets.Set(self.symbols)) + if options.has_key('datatype'): + self.datatype=options['datatype'].lower() + if self.datatype=='dna' or self.datatype=='nucleotide': + self.alphabet=IUPAC.ambiguous_dna + self.ambiguous_values=IUPACData.ambiguous_dna_values + self.unambiguous_letters=IUPACData.unambiguous_dna_letters + elif self.datatype=='rna': + self.alphabet=IUPAC.ambiguous_rna + self.ambiguous_values=IUPACData.ambiguous_rna_values + self.unambiguous_letters=IUPACData.unambiguous_rna_letters + elif self.datatype=='protein': + self.alphabet=IUPAC.protein + self.ambiguous_values={'B':'DN','Z':'EQ','X':IUPACData.protein_letters} # that's how PAUP handles it + self.unambiguous_letters=IUPACData.protein_letters+'*' # stop-codon + elif self.datatype=='standard': + raise NexusError('Datatype standard is not yet supported.') + #self.alphabet=None + #self.ambiguous_values={} + #if not self.symbols: + # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states + #self.unambiguous_letters=self.symbols + else: + raise NexusError, 'Unsupported datatype: '+self.datatype + self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters + if not self.respectcase: + self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() + #we have to sort the reverse ambig coding dict key characters: + #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' + rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X']) + self.rev_ambiguous_values={} + for (k,v) in rev.items(): + key=[c for c in k] + key.sort() + self.rev_ambiguous_values[''.join(key)]=v + #overwrite symbols for datype rna,dna,nucleotide + if self.datatype in ['dna','rna','nucleotide']: + self.symbols=self.alphabet.letters + if self.missing not in self.ambiguous_values: + self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap + self.ambiguous_values[self.gap]=self.gap + elif self.datatype=='standard': + if not self.symbols: + self.symbols=['1','0'] + if options.has_key('missing'): + self.missing=options['missing'][0] + if options.has_key('gap'): + self.gap=options['gap'][0] + if options.has_key('equate'): + self.equate=options['equate'] + if options.has_key('matchchar'): + self.matchchar=options['matchchar'][0] + if options.has_key('labels'): + self.labels=options['labels'] + if options.has_key('transpose'): + raise NexusError, 'TRANSPOSE is not supported!' + self.transpose=True + if options.has_key('interleave'): + if options['interleave']==None or options['interleave'].lower()=='yes': + self.interleave=True + if options.has_key('tokens'): + self.tokens=True + if options.has_key('notokens'): + self.tokens=False + + + def _set(self,options): + self.set=options; + + def _options(self,options): + self.options=options; + + def _eliminate(self,options): + self.eliminate=options + + def _taxlabels(self,options): + """Get taxon labels.""" + self.taxlabels=[] + opts=CharBuffer(options) + while True: + taxon=quotestrip(opts.next_word()) + if not taxon: + break + self.taxlabels.append(taxon) + + def _check_taxlabels(self,taxon): + """Check for presence of taxon in self.taxlabels.""" + # According to NEXUS standard, underscores shall be treated as spaces..., + # so checking for identity is more difficult + nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) + nexid=taxon.replace(' ','_') + return nextaxa.get(nexid) + + def _charlabels(self,options): + self.charlabels={} + opts=CharBuffer(options) + while True: + try: + # get id and state + w=opts.next_word() + if w is None: # McClade saves and reads charlabel-lists with terminal comma?! + break + identifier=self._resolve(w,set_type=CHARSET) + state=quotestrip(opts.next_word()) + self.charlabels[identifier]=state + # check for comma or end of command + c=opts.next_nonwhitespace() + if c is None: + break + elif c!=',': + raise NexusError,'Missing \',\' in line %s.' % options + except NexusError: + raise + except: + raise NexusError,'Format error in line %s.' % options + + def _charstatelabels(self,options): + # warning: charstatelabels supports only charlabels-syntax! + self._charlabels(options) + + def _statelabels(self,options): + #self.charlabels=options + #print 'Command statelabels is not supported and will be ignored.' + pass + + def _matrix(self,options): + if not self.ntax or not self.nchar: + raise NexusError,'Dimensions must be specified before matrix!' + taxlabels_present=(self.taxlabels!=[]) + self.matrix={} + taxcount=0 + block_interleave=0 + #eliminate empty lines and leading/trailing whitespace + lines=[l.strip() for l in options.split('\n') if l.strip()<>''] + lineiter=iter(lines) + while 1: + try: + l=lineiter.next() + except StopIteration: + if taxcount<self.ntax: + raise NexusError, 'Not enough taxa in matrix.' + elif taxcount>self.ntax: + raise NexusError, 'Too many taxa in matrix.' + else: + break + # count the taxa and check for interleaved matrix + taxcount+=1 + ##print taxcount + if taxcount>self.ntax: + if not self.interleave: + raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?' + else: + taxcount=1 + block_interleave=1 + #get taxon name and sequence + linechars=CharBuffer(l) + id=quotestrip(linechars.next_word()) + l=linechars.rest().strip() + if taxlabels_present and not self._check_taxlabels(id): + raise NexusError,'Taxon '+id+' not found in taxlabels.' + chars='' + if self.interleave: + #interleaved matrix + #print 'In interleave' + if l: + chars=''.join(l.split()) + else: + chars=''.join(lineiter.next().split()) + else: + #non-interleaved matrix + chars=''.join(l.split()) + while len(chars)<self.nchar: + l=lineiter.next() + chars+=''.join(l.split()) + iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) + #first taxon has the reference sequence if matchhar is used + if taxcount==1: + refseq=iupac_seq + else: + if self.matchchar: + while 1: + p=iupac_seq.tostring().find(self.matchchar) + if p==-1: + break + iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet) + #check for invalid characters + for i,c in enumerate(iupac_seq.tostring()): + if c not in self.valid_characters and c!=self.gap and c!=self.missing: + raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\ + % (id,c,l[i-10:i+10]) + #add sequence to matrix + if block_interleave==0: + while self.matrix.has_key(id): + if id.split('.')[-1].startswith('copy'): + id='.'.join(id.split('.')[:-1])+'.copy'+str(eval('0'+id.split('.')[-1][4:])+1) + else: + id+='.copy' + #raise NexusError, id+' already in matrix!\nError in: '+l + self.matrix[id]=iupac_seq + # add taxon name only if taxlabels is not alredy present + if not taxlabels_present: + self.taxlabels.append(id) + else: + taxon_present=self._check_taxlabels(id) + if taxon_present: + self.matrix[taxon_present]+=iupac_seq + else: + raise NexusError, 'Taxon %s not in first block of interleaved matrix.' % id + #check all sequences for length according to nchar + for taxon in self.matrix: + if len(self.matrix[taxon])!=self.nchar: + raise NexusError,'Nchar ('+str(self.nchar)+') does not match data for taxon '+taxon + + def _translate(self,options): + self.translate={} + opts=CharBuffer(options) + while True: + try: + # get id and state + identifier=int(opts.next_word()) + label=quotestrip(opts.next_word()) + self.translate[identifier]=label + # check for comma or end of command + c=opts.next_nonwhitespace() + if c is None: + break + elif c!=',': + raise NexusError,'Missing \',\' in line %s.' % options + except NexusError: + raise + except: + raise NexusError,'Format error in line %s.' % options + + def _utree(self,options): + """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" + self._tree(options) + + def _tree(self,options): + opts=CharBuffer(options) + name=opts.next_word() + if opts.next_nonwhitespace()!='=': + raise NexusError,'Syntax error in tree description: %s' % options[:50] + rooted=False + weight=1.0 + while opts.peek_nonwhitespace()=='[': + open=opts.next_nonwhitespace() + symbol=opts.next() + if symbol!='&': + raise NexusError,'Illegal special comment [%s...] in tree description: %s' % (symbol, options[:50]) + special=opts.next() + value=opts.next_until(']') + closing=opts.next() + if special=='R': + rooted=True + elif special=='U': + rooted=False + elif special=='W': + weight=float(value) + tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) + # if there's an active translation table, translate + if self.translate: + for n in tree.get_terminals(): + try: + tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) + except (ValueError,KeyError): + raise NexusError,'Unable to substitue %s using \'translate\' data.' % tree.node(n).data.taxon + self.trees.append(tree) + + def _apply_block_structure(self,title,lines): + block=Block('') + block.title = title + for line in lines: + block.commandlines.append(Commandline(line, title)) + self.structured.append(block) + + def _taxset(self, options): + name,taxa=self._get_indices(options,set_type=TAXSET) + self.taxsets[name]=_make_unique(taxa) + + def _charset(self, options): + name,sites=self._get_indices(options,set_type=CHARSET) + self.charsets[name]=_make_unique(sites) + + def _taxpartition(self, options): + taxpartition={} + quotelevel=False + opts=CharBuffer(options) + name=self._name_n_vector(opts) + if not name: + raise NexusError, 'Formatting error in taxpartition: %s ' % options + # now collect thesubbpartitions and parse them + # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... + # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words + sub='' + while True: + w=opts.next() + if w is None or (w==',' and not quotelevel): + subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') + taxpartition[subname]=_make_unique(subindices) + sub='' + if w is None: + break + else: + if w=="'": + quotelevel=not quotelevel + sub+=w + self.taxpartitions[name]=taxpartition + + def _charpartition(self, options): + charpartition={} + quotelevel=False + opts=CharBuffer(options) + name=self._name_n_vector(opts) + if not name: + raise NexusError, 'Formatting error in charpartition: %s ' % options + # now collect thesubbpartitions and parse them + # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... + sub='' + while True: + w=opts.next() + if w is None or (w==',' and not quotelevel): + subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') + charpartition[subname]=_make_unique(subindices) + sub='' + if w is None: + break + else: + if w=="'": + quotelevel=not quotelevel + sub+=w + self.charpartitions[name]=charpartition + + def _get_indices(self,options,set_type=CHARSET,separator='='): + """Parse the taxset/charset specification + '1 2 3 - 5 dog cat 10- 20 \\ 3' --> [0,1,2,3,4,'dog','cat',10,13,16,19] + """ + opts=CharBuffer(options) + name=self._name_n_vector(opts,separator=separator) + indices=self._parse_list(opts,set_type=set_type) + if indices is None: + raise NexusError, 'Formatting error in line: %s ' % options + return name,indices + + def _name_n_vector(self,opts,separator='='): + """Extract name and check that it's not in vector format.""" + rest=opts.rest() + name=opts.next_word() + if not name: + raise NexusError, 'Formatting error in line: %s ' % rest + name=quotestrip(name) + if opts.peek_nonwhitespace=='(': + open=opts.next_nonwhitespace() + qualifier=open.next_word() + close=opts.next_nonwhitespace() + if qualifier.lower()=='vector': + raise NexusError, 'Unsupported VECTOR format in line %s' % (options) + elif qualifier.lower()!='standard': + raise NexusError, 'Unknown qualifier %s in line %s' % (qualifier,options) + if opts.next_nonwhitespace()!=separator: + raise NexusError, 'Formatting error in line: %s ' % rest + return name + + def _parse_list(self,options_buffer,set_type): + """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17-21], + (assuming dog is taxon no. 17 and cat is taxon no. 21). + """ + plain_list=[] + if options_buffer.peek_nonwhitespace(): + try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError + while True: + identifier=options_buffer.next_word() # next list element + if not identifier: # end of list? + break + start=self._resolve(identifier,set_type=set_type) + if options_buffer.peek_nonwhitespace()=='-': # followd by - + end=start + step=1 + # get hyphen and end of range + hyphen=options_buffer.next_nonwhitespace() + end=self._resolve(options_buffer.next_word(),set_type=set_type) + if set_type==CHARSET: + if options_buffer.peek_nonwhitespace()=='\\': # followd by \ + backslash=options_buffer.next_nonwhitespace() + step=int(options_buffer.next_word()) # get backslash and step + plain_list.extend(range(start,end+1,step)) + else: + if type(start)==list or type(end)==list: + raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier + start=self.taxlabels.index(start) + end=self.taxlabels.index(end) + taxrange=self.taxlabels[start:end+1] + plain_list.extend(taxrange) + else: + if type(start)==list: # start was the name of charset or taxset + plain_list.extend(start) + else: # start was an ordinary identifier + plain_list.append(start) + except NexusError: + raise + except: + return None + return plain_list + + def _resolve(self,identifier,set_type=None): + """Translate identifier in list into character/taxon index. + Characters (which are referred to by their index in Nexus.py): + Plain numbers are returned minus 1 (Nexus indices to python indices) + Text identifiers are translaterd into their indices (if plain character indentifiers), + the first hit in charlabels is returned (charlabels don't need to be unique) + or the range of indices is returned (if names of character sets). + Taxa (which are referred to by their unique name in Nexus.py): + Plain numbers are translated in their taxon name, underscores and spaces are considered equal. + Names are returned unchanged (if plain taxon identifiers), or the names in + the corresponding taxon set is returned + """ + identifier=quotestrip(identifier) + if not set_type: + raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') + if set_type==CHARSET: + try: + n=int(identifier) + except ValueError: + if self.charlabels and identifier in self.charlabels.values(): + for k in self.charlabels: + if self.charlabels[k]==identifier: + return k + elif self.charsets and identifier in self.charsets: + return self.charsets[identifier] + else: + raise NexusError, 'Unknown character identifier: %s' % identifier + else: + if n<=self.nchar: + return n-1 + else: + raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar) + elif set_type==TAXSET: + try: + n=int(identifier) + except ValueError: + taxlabels_id=self._check_taxlabels(identifier) + if taxlabels_id: + return taxlabels_id + elif self.taxsets and identifier in self.taxsets: + return self.taxsets[identifier] + else: + raise NexusError, 'Unknown taxon identifier: %s' % identifier + else: + if n>0 and n<=self.ntax: + return self.taxlabels[n-1] + else: + raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax) + else: + raise NexusError('Unknown set specification: %s.'% set_type) + + def _stateset(self, options): + #Not implemented + pass + + def _changeset(self, options): + #Not implemented + pass + + def _treeset(self, options): + #Not implemented + pass + + def _treepartition(self, options): + #Not implemented + pass + + def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, + exclude=[], delete=[], charpartition=None, comment='',mrbayes=False): + """Writes a nexus file for each partition in charpartition. + Only non-excluded characters and non-deleted taxa are included, just the data block is written. + """ + + if not matrix: + matrix=self.matrix + if not matrix: + return + if not filename: + filename=self.filename + if charpartition: + pfilenames={} + for p in charpartition: + total_exclude=[]+exclude + total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) + total_exclude=_make_unique(total_exclude) + pcomment=comment+'\nPartition: '+p+'\n' + dot=filename.rfind('.') + if dot>0: + pfilename=filename[:dot]+'_'+p+'.data' + else: + pfilename=filename+'_'+p + pfilenames[p]=pfilename + self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, + interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, + mrbayes=mrbayes) + return pfilenames + else: + fn=self.filename+'.data' + self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, + exclude=exclude,delete=delete,comment=comment,append_sets=False, + mrbayes=mrbayes) + return fn + + def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\ + blocksize=None, interleave=False, interleave_by_partition=False,\ + comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False): + """ Writes a nexus file with data and sets block. Character sets and partitions + are appended by default, and are adjusted according + to excluded characters (i.e. character sets still point to the same sites (not necessarily same positions), + without including the deleted characters. + """ + if not matrix: + matrix=self.matrix + if not matrix: + return + if not filename: + filename=self.filename + if [t for t in delete if not self._check_taxlabels(t)]: + raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels))) + if interleave_by_partition: + if not interleave_by_partition in self.charpartitions: + raise NexusError, 'Unknown partition: '+interleave_by_partition + else: + partition=self.charpartitions[interleave_by_partition] + # we need to sort the partition names by starting position before we exclude characters + names=_sort_keys_by_values(partition) + newpartition={} + for p in partition: + newpartition[p]=[c for c in partition[p] if c not in exclude] + # how many taxa and how many characters are left? + undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] + cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) + ntax_adjusted=len(undelete) + nchar_adjusted=len(cropped_matrix[undelete[0]]) + if not undelete or (undelete and undelete[0]==''): + return + if isinstance(filename,str): + try: + fh=open(filename,'w') + except IOError: + raise NexusError, 'Could not open %s for writing.' % filename + elif isinstance(filename,file): + fh=filename + if not omit_NEXUS: + fh.write('#NEXUS\n') + if comment: + fh.write('['+comment+']\n') + fh.write('begin data;\n') + fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) + fh.write('\tformat datatype='+self.datatype) + if self.respectcase: + fh.write(' respectcase') + if self.missing: + fh.write(' missing='+self.missing) + if self.gap: + fh.write(' gap='+self.gap) + if self.matchchar: + fh.write(' matchchar='+self.matchchar) + if self.labels: + fh.write(' labels='+self.labels) + if self.equate: + fh.write(' equate='+self.equate) + if interleave or interleave_by_partition: + fh.write(' interleave') + fh.write(';\n') + #if self.taxlabels: + # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') + if self.charlabels: + newcharlabels=self._adjust_charlabels(exclude=exclude) + clkeys=newcharlabels.keys() + clkeys.sort() + fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') + fh.write('matrix\n') + if not blocksize: + if interleave: + blocksize=70 + else: + blocksize=self.nchar + # delete deleted taxa and ecxclude excluded characters... + namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) + if interleave_by_partition: + # interleave by partitions, but adjust partitions with regard to excluded characters + seek=0 + for p in names: + fh.write('[%s: %s]\n' % (interleave_by_partition,p)) + if len(newpartition[p])>0: + for taxon in undelete: + fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) + fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') + fh.write('\n') + else: + fh.write('[empty]\n\n') + seek+=len(newpartition[p]) + elif interleave: + for seek in range(0,nchar_adjusted,blocksize): + for taxon in undelete: + fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) + fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') + fh.write('\n') + else: + for taxon in undelete: + if blocksize<nchar_adjusted: + fh.write(safename(taxon,mrbayes=mrbayes)+'\n') + else: + fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) + for seek in range(0,nchar_adjusted,blocksize): + fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') + fh.write(';\nend;\n') + if append_sets: + fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) + fh.close() + return filename + + def append_sets(self,exclude=[],delete=[],mrbayes=False): + """Appends a sets block to <filename>.""" + if not self.charsets and not self.taxsets and not self.charpartitions: + return '' + sets=['\nbegin sets'] + # - now if characters have been excluded, the character sets need to be adjusted, + # so that they still point to the right character positions + # calculate a list of offsets: for each deleted character, the following character position + # in the new file will have an additional offset of -1 + offset=0 + offlist=[] + for c in range(self.nchar): + if c in exclude: + offset+=1 + offlist.append(-1) # dummy value as these character positions are excluded + else: + offlist.append(c-offset) + # now adjust each of the character sets + for n,ns in self.charsets.items(): + cset=[offlist[c] for c in ns if c not in exclude] + if cset: + sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) + for n,s in self.taxsets.items(): + tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] + if tset: + sets.append('taxset %s = %s' % (safename(n),' '.join(tset))) + for n,p in self.charpartitions.items(): + # as characters have been excluded, the partitions must be adjusted + # if a partition is empty, it will be omitted from the charpartition command + # (although paup allows charpartition part=t1:,t2:,t3:1-100) + names=_sort_keys_by_values(p) + newpartition={} + for sn in names: + nsp=[offlist[c] for c in p[sn] if c not in exclude] + if nsp: + newpartition[sn]=nsp + if newpartition: + sets.append('charpartition %s = %s' % (safename(n),\ + ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) + # now write charpartititions, much easier than charpartitions + for n,p in self.taxpartitions.items(): + names=_sort_keys_by_values(p) + newpartition={} + for sn in names: + nsp=[t for t in p[sn] if t not in delete] + if nsp: + newpartition[sn]=nsp + if newpartition: + sets.append('taxpartition %s = %s' % (safename(n),\ + ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) + # add 'end' and return everything + sets.append('end;\n') + return ';\n'.join(sets) + f.close() + + def export_fasta(self, filename=None, width=70): + """Writes matrix into a fasta file: (self, filename=None, width=70).""" + if not filename: + if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: + filename='.'.join(self.filename.split('.')[:-1])+'.fas' + else: + filename=self.filename+'.fas' + fh=open(filename,'w') + for taxon in self.taxlabels: + fh.write('>'+safename(taxon)+'\n') + for i in range(0, len(self.matrix[taxon].tostring()), width): + fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n') + fh.close() + + def constant(self,matrix=None,delete=[],exclude=[]): + """Return a list with all constant characters.""" + if not matrix: + matrix=self.matrix + undelete=[t for t in self.taxlabels if t in matrix and t not in delete] + if not undelete: + return None + elif len(undelete)==1: + return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] + # get the first sequence and expand all ambiguous values + constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for + x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude] + for taxon in undelete[1:]: + newconstant=[] + for site in constant: + #print '%d (paup=%d)' % (site[0],site[0]+1), + seqsite=matrix[taxon][site[0]].upper() + #print seqsite,'checked against',site[1],'\t', + if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: + # missing or same as before -> ok + newconstant.append(site) + elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): + # subset of an ambig or only missing in previous -> take subset + newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) + elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values + intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1])) + if intersect: + newconstant.append((site[0],''.join(intersect))) + # print 'ok' + #else: + # print 'failed' + #else: + # print 'failed' + constant=newconstant + cpos=[s[0] for s in constant] + return constant + # return [x[0] for x in constant] + + def cstatus(self,site,delete=[],narrow=True): + """Summarize character. + narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) + narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) + """ + undelete=[t for t in self.taxlabels if t not in delete] + if not undelete: + return None + cstatus=[] + for t in undelete: + c=self.matrix[t][site].upper() + if self.options.get('gapmode')=='missing' and c==self.gap: + c=self.missing + if narrow and c==self.missing: + if c not in cstatus: + cstatus.append(c) + else: + cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) + if self.missing in cstatus and narrow and len(cstatus)>1: + cstatus=[c for c in cstatus if c!=self.missing] + cstatus.sort() + return cstatus + + def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]): + """Calculates a stepmatrix for weighted parsimony. + See Wheeler (1990), Cladistics 6:269-275 and + Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 + """ + m=StepMatrix(self.unambiguous_letters,self.gap) + for site in [s for s in range(self.nchar) if s not in exclude]: + cstatus=self.cstatus(site,delete) + for i,b1 in enumerate(cstatus[:-1]): + for b2 in cstatus[i+1:]: + m.add(b1.upper(),b2.upper(),1) + return m.transformation().weighting().smprint(name=name) + + def crop_matrix(self,matrix=None, delete=[], exclude=[]): + """Return a matrix without deleted taxa and excluded characters.""" + if not matrix: + matrix=self.matrix + if [t for t in delete if not self._check_taxlabels(t)]: + raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels)) + if exclude!=[]: + undelete=[t for t in self.taxlabels if t in matrix and t not in delete] + if not undelete: + return {} + m=[matrix[k].tostring() for k in undelete] + zipped_m=zip(*m) + sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude] + if sitesm==[]: + return dict([(t,Seq('',self.alphabet)) for t in undelete]) + else: + zipped_sitesm=zip(*sitesm) + m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)] + return dict(zip(undelete,m)) + else: + return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete]) + + def bootstrap(self,matrix=None,delete=[],exclude=[]): + """Return a bootstrapped matrix.""" + if not matrix: + matrix=self.matrix + seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects + cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out + if not cm: # everything deleted? + return {} + elif len(cm[cm.keys()[0]])==0: # everything excluded? + return cm + undelete=[t for t in self.taxlabels if t in cm] + if seqobjects: + sitesm=zip(*[cm[t].tostring() for t in undelete]) + alphabet=matrix[matrix.keys()[0]].alphabet + else: + sitesm=zip(*[cm[t] for t in undelete]) + bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))] + bootstrapseqs=map(''.join,zip(*bootstrapsitesm)) + if seqobjects: + bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs] + return dict(zip(undelete,bootstrapseqs)) + + def add_sequence(self,name,sequence): + """Adds a sequence to the matrix.""" + if not name: + raise NexusError, 'New sequence must have a name' + diff=self.nchar-len(sequence) + if diff<0: + self.insert_gap(self.nchar,-diff) + elif diff>0: + sequence+=self.missing*diff + + self.matrix[name]=Seq(sequence,self.alphabet) + self.ntax+=1 + self.taxlabels.append(name) + #taxlabels? + + def insert_gap(self,pos,n=1,leftgreedy=False): + """Add a gap into the matrix and adjust charsets and partitions. + + pos=0: first position + pos=nchar: last position + """ + + def _adjust(set,x,d,leftgreedy=False): + """Adjusts chartacter sets if gaps are inserted, taking care of + new gaps within a coherent character set.""" + # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 + # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 + # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 + set.sort() + addpos=0 + for i,c in enumerate(set): + if c>=x: + set[i]=c+d + # if we add gaps within a group of characters, we want the gap position included in this group + if c==x: + if leftgreedy or (i>0 and set[i-1]==c-1): + addpos=i + if addpos>0: + set[addpos:addpos]=range(x,x+d) + return set + + if pos<0 or pos>self.nchar: + raise NexusError('Illegal gap position: %d' % pos) + if n==0: + return + sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) + sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n + # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\ + # i,taxon in enumerate(self.taxlabels)]) + zipped=zip(*sitesm) + mapped=map(''.join,zipped) + listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)] + self.matrix=dict(listed) + self.nchar+=n + # now adjust character sets + for i,s in self.charsets.items(): + self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy) + for p in self.charpartitions: + for sp,s in self.charpartitions[p].items(): + self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy) + # now adjust character state labels + self.charlabels=self._adjust_charlabels(insert=[pos]*n) + return self.charlabels + + def _adjust_charlabels(self,exclude=None,insert=None): + """Return adjusted indices of self.charlabels if characters are excluded or inserted.""" + if exclude and insert: + raise NexusError, 'Can\'t exclude and insert at the same time' + if not self.charlabels: + return None + labels=self.charlabels.keys() + labels.sort() + newcharlabels={} + if exclude: + exclude.sort() + exclude.append(sys.maxint) + excount=0 + for c in labels: + if not c in exclude: + while c>exclude[excount]: + excount+=1 + newcharlabels[c-excount]=self.charlabels[c] + elif insert: + insert.sort() + insert.append(sys.maxint) + icount=0 + for c in labels: + while c>=insert[icount]: + icount+=1 + newcharlabels[c+icount]=self.charlabels[c] + else: + return self.charlabels + return newcharlabels + + def invert(self,charlist): + """Returns all character indices that are not in charlist.""" + return [c for c in range(self.nchar) if c not in charlist] + + def gaponly(self,include_missing=False): + """Return gap-only sites.""" + gap=sets.Set(self.gap) + if include_missing: + gap.add(self.missing) + sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) + gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)] + return gaponly + + def terminal_gap_to_missing(self,missing=None,skip_n=True): + """Replaces all terminal gaps with missing character. + + Mixtures like ???------??------- are properly resolved.""" + + if not missing: + missing=self.missing + replace=[self.missing,self.gap] + if not skip_n: + replace.extend(['n','N']) + for taxon in self.taxlabels: + sequence=self.matrix[taxon].tostring() + length=len(sequence) + start,end=get_start_end(sequence,skiplist=replace) + sequence=sequence[:end+1]+missing*(length-end-1) + sequence=start*missing+sequence[start:] + assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon + self.matrix[taxon]=Seq(sequence,self.alphabet) +