view mytools/sequence.py @ 5:84e2c77a6abd

Uploaded
author xuebing
date Fri, 16 Mar 2012 14:01:16 -0400
parents 39217fa39ff2
children
line wrap: on
line source

#!@WHICHPYTHON@

import copy, string, sys

#------------------ Alphabet -------------------

class Alphabet(object):
    """Biological alphabet class.
    This defines the set of symbols from which various objects can be built, e.g. sequences and motifs.
    The symbol set is immutable and accessed as a tuple.
    symstr: symbols in alphabet as either a tuple or string
    complement: dictionary defining letters and their complement
    """
    def __init__(self, symstr, complement = None):
        """Construct an alphabet from a string or tuple of characters.
        Lower case characters will be converted to upper case; repeated
        symbols are dropped, keeping the order of first appearance.
        An optional mapping for complements may be provided. Each pair only
        has to be listed in one direction; the reverse direction is implied.
        Raises RuntimeError if the mapping assigns two different complements
        to the same symbol (in either direction).
        Example:
        >>> alpha = sequence.Alphabet('ACGTttga', {'A':'T', 'G':'C'})
        >>> alpha.getSymbols()
        will construct the DNA alphabet and output:
        ('A', 'C', 'G', 'T')
        """
        symlst = []
        for sym in symstr:
            s = str(sym).upper()[0]
            if s not in symlst:
                symlst.append(s)
        self.symbols = tuple(symlst)
        if complement is None:
            self.complements = None
            return
        # Expand the mapping so it holds both directions of every pair and
        # check for contradictions.  Both the symbol and its complement are
        # checked; otherwise a second pair could silently overwrite the
        # reverse entry of an earlier one.
        cmap = {}
        for s in self.symbols:
            c = complement.get(s, None)
            if c is None:
                continue
            for key, partner in ((s, c), (c, s)):
                if key in cmap and cmap[key] != partner:
                    raise RuntimeError("Alphabet complement map "
                            "contains contradictory mapping")
            cmap[s] = c
            cmap[c] = s
        # Replace the symbol mapping with indices into self.symbols.  A symbol
        # without a complement (or whose complement is not in the alphabet)
        # gets None.
        index_of = dict((s, idx) for idx, s in enumerate(self.symbols))
        self.complements = tuple(
                [index_of.get(cmap.get(s, None), None) for s in self.symbols])

    def getSymbols(self):
        """Retrieve a tuple with all symbols, immutable membership and order"""
        return self.symbols

    def getComplements(self):
        """Retrieve a tuple with all complement indicies, immutable.
        Entries are None for symbols without a complement; the whole value is
        None if the alphabet was built without a complement mapping."""
        return self.complements

    def isValidSymbol(self, sym):
        """Check if the symbol is a member of alphabet"""
        return sym in self.symbols

    def getIndex(self, sym):
        """Retrieve the index of the symbol (immutable).
        Raises RuntimeError if the symbol is not part of the alphabet."""
        try:
            return self.symbols.index(sym)
        except ValueError:
            # str() so that a non-string symbol still yields the intended error
            raise RuntimeError("Symbol " + str(sym) + " does not exist in alphabet")

    def isComplementable(self):
        """Check if the alphabet was constructed with a complement mapping"""
        return self.complements is not None

    def getComplement(self, sym):
        """Retrieve the complement of the symbol (immutable).
        Raises RuntimeError if the symbol is unknown, if the alphabet has no
        complement mapping, or if this symbol has no complement."""
        idx = self.getIndex(sym)
        if self.complements is None:
            raise RuntimeError("Alphabet does not define complements")
        cidx = self.complements[idx]
        if cidx is None:
            raise RuntimeError("Symbol " + str(sym) + " has no complement in alphabet")
        return self.symbols[cidx]

    def isValidString(self, symstr):
        """Check if the string contains only symbols that belong to the alphabet"""
        for sym in symstr:
            if not self.isValidSymbol(sym):
                return False
        return True

    def getLen(self):
        """Retrieve the number of symbols in (the length of) the alphabet"""
        return len(self.symbols)

# pre-defined alphabets that can be specified by their name
# Each entry is a (name, Alphabet) pair.  The order of the list matters:
# Sequence.__init__ walks it from the top when no alphabet is given and uses
# the first alphabet that can spell the sequence data.
# Only "DNA" is constructed with a complement mapping.
predefAlphabets = [
    ("DNA"                , Alphabet('ACGT', {'A':'T', 'G':'C'})),
    ("RNA"                , Alphabet('ACGU')),
    ("Extended DNA"       , Alphabet('ACGTYRN')),
    ("Protein"            , Alphabet('ACDEFGHIKLMNPQRSTVWY')),
    ("Extended Protein"   , Alphabet('ACDEFGHIKLMNPQRSTVWYX')),
    ("TM Labels"          , Alphabet('MIO'))
]

def getAlphabet(name):
    """Look up one of the pre-defined alphabets by its name.
    The names currently registered are "Protein", "DNA", "RNA", "Extended DNA",
    "Extended Protein" and "TM Labels". None is returned for any other name.
    Example:
    >>> alpha = sequence.getAlphabet('Protein')
    >>> alpha.getSymbols()
    will retrieve the 20 amino acid alphabet and output the tuple:
    ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
    """
    # First entry whose name matches wins; fall back to None when none does
    candidates = (entry[1] for entry in predefAlphabets if entry[0] == name)
    return next(candidates, None)

#------------------ Sequence -------------------

class Sequence(object):
    """Biological sequence class. Sequence data is immutable.

    data: the sequence data as a tuple or string
    alpha: the alphabet from which symbols are taken
    name: the sequence name, if any
    info: can contain additional sequence information apart from the name
    """
    def __init__(self, sequence, alpha = None, name = "", seqinfo = ""):
        """Create a sequence with sequence data.
        sequence: a string, a tuple or a list of symbols. Strings and lists
                  are converted to upper case; a tuple is stored as given.
        alpha:    an Alphabet, the name of a pre-defined alphabet, or None to
                  pick the first pre-defined alphabet that fits the data.
        Specifying the alphabet is optional, so is the name and info.
        Raises RuntimeError if the data has an unsupported type, if no
        alphabet can be identified, or if the alphabet does not cover the data.
        Example:
        >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF')
        will create a sequence with name "", and assign one of the predefined alphabets on basis of what symbols were used.
        >>> myseq.getAlphabet().getSymbols()
        will most likely output the standard protein alphabet:
        ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
        """
        if isinstance(sequence, str):
            self.data = tuple(sequence.upper())
        elif isinstance(sequence, tuple):
            self.data = sequence
        elif isinstance(sequence, list):
            self.data = tuple([s.upper() for s in sequence])
        else:
            raise RuntimeError("Sequence data is not specified correctly: must be string or tuple")
        self.name = name
        self.info = seqinfo
        self.alpha = self._resolveAlphabet(alpha)

    def _resolveAlphabet(self, alpha):
        """Return the Alphabet to use for self.data, or raise RuntimeError."""
        if alpha is None:
            # Alphabet is not set: take the first predefined one (in list
            # order) that can spell the data
            for (xname, xalpha) in predefAlphabets:
                if xalpha.isValidString(self.data):
                    return xalpha
        elif isinstance(alpha, str):
            # A string is taken to be the name of a predefined alphabet
            for (xname, xalpha) in predefAlphabets:
                if xname == alpha:
                    alpha = xalpha
                    break
        if not isinstance(alpha, Alphabet):
            # auto-detection failed, the name was unknown, or the type is wrong
            raise RuntimeError("Could not identify alphabet from sequence")
        if not alpha.isValidString(self.data):
            raise RuntimeError("Invalid alphabet specified: "+"".join(alpha.getSymbols())+" is not compatible with sequence '"+"".join(self.data)+"'")
        return alpha

    #basic getters and setters for the class
    def getName(self):
        """Get the name of the sequence"""
        return self.name
    def getInfo(self):
        """Get additional info of the sequence (e.g. from the defline in a FASTA file)"""
        return self.info
    def getAlphabet(self):
        """Retrieve the alphabet that is assigned to this sequence"""
        return self.alpha
    def setName(self, name):
        """Change the name of the sequence"""
        self.name = name
    def setAlphabet(self, alpha):
        """Set the alphabet, throws an exception if it is not compatible with the sequence data.
        Arguments that are not an Alphabet are ignored."""
        if isinstance(alpha, Alphabet):
            if not alpha.isValidString(self.data):
                raise RuntimeError( "Invalid alphabet specified" )
            self.alpha = alpha
    #sequence functions
    def getSequence(self):
        """Retrieve the sequence data (a tuple of symbols)"""
        return self.data
    def getString(self):
        """Retrieve the sequence data as a string (copy of actual data)"""
        return "".join(self.data)
    def getLen(self):
        """Get the length of the sequence (number of symbols)"""
        return len(self.data)
    def getSite(self, position, length = 1):
        """Retrieve a site in the sequence of desired length.
        Note that positions go from 0 to length-1, and that if the requested site
        extends beyond those the method throws an exception.
        A site of length 1 is returned as the single symbol, any other length
        as a tuple of symbols.
        """
        if position < 0 or position > self.getLen() - length:
            raise RuntimeError( "Attempt to access invalid position in sequence "+self.getName() )
        if length == 1:
            return self.data[position]
        return self.data[position:position+length]

    def nice(self):
        """ A short description of the sequence """
        # written through sys.stdout so the line reads the same on Python 2 and 3
        sys.stdout.write("%s : %s\n" % (self.getName(), self.getLen()))

def readStrings(filename):
    """ Read one or more lines of text from a file--for example an alignment.
    Every whitespace-separated token becomes one entry of the result, so a
    line holding several tokens contributes several strings and blank lines
    contribute none.
    Return as a list of strings.
    filename: name of file
    """
    txtlist = []
    # the with-block closes the file even if reading fails
    with open(filename) as f:
        for line in f:
            txtlist.extend(line.split())
    return txtlist

def readFASTA(filename, alpha = None):
    """ Read one or more sequences from a file in FASTA format.
    filename: name of file to load sequences from
    alpha: alphabet that is used (if left unspecified, an attempt is made to identify the alphabet for each individual sequence)
    Returns a list of Sequence. A record that cannot be turned into a
    Sequence is reported on stderr and left out of the list.
    Raises RuntimeError if the file contains no defline ('>') at all.
    """
    seqlist = []

    def flush(name, info, data):
        # Convert one buffered record; an invalid one is reported and skipped
        try:
            seqlist.append(Sequence(data, alpha, name, info))
        except RuntimeError as e:
            sys.stderr.write("%s %s\n" % ("Warning: "+name+" is invalid (ignored): ", e))

    seqname = None      # None means that no defline has been seen yet
    seqinfo = None
    seqdata = []
    # the with-block closes the file on every exit path, including errors
    with open(filename) as fh:
        for thisline in fh:
            if thisline[0] == '>': # new sequence
                if seqname is not None:
                    # take care of the data that is already in the buffer
                    flush(seqname, seqinfo, seqdata)
                # everything on the defline is "info"; only the line ending is
                # removed, so an unterminated last line keeps its final character
                seqinfo = thisline[1:].rstrip("\r\n")
                fields = seqinfo.split()
                # name is the text up to the first space ("" for a bare '>')
                seqname = fields[0] if fields else ""
                seqdata = []
            else:  # pull out the sequence data
                for chunk in thisline.split():
                    # sometimes a line ends with an asterisk in FASTA files
                    seqdata.extend(chunk.strip('*'))

    if seqname is None:
        raise RuntimeError("No sequences on FASTA format found in this file")
    flush(seqname, seqinfo, seqdata)
    return seqlist

def _writeOneFASTA(sequence, filehandle):
    """Write one sequence in FASTA format to an already open file.
    sequence: object providing getName(), getSequence() and getLen()
    filehandle: open file (or file-like object) to write text to
    The defline holds only the sequence name. The data follows in lines of
    at most 60 symbols; an empty sequence produces the defline only.
    """
    line_width = 60
    filehandle.write(">" + sequence.getName()+"\n")
    data = sequence.getSequence()
    # Stepping by the line width avoids computing a line count with '/',
    # which yields a float (and breaks range) under true division.
    for start in range(0, sequence.getLen(), line_width):
        # slicing past the end is fine: the last line is simply shorter
        filehandle.write("".join(data[start : start + line_width]) + "\n")

def writeFASTA(sequence, filename):
    """Write a list (or a single) of sequences to a file in the FASTA format.
    sequence: a Sequence, or an iterable of Sequence
    filename: name of the file to create (an existing file is overwritten)
    Entries of the iterable that are not a Sequence are reported on stderr
    and skipped.
    """
    if isinstance(sequence, Sequence):
        sequence = [sequence]
    # the with-block flushes and closes the file even if writing fails
    with open(filename, "w") as fh:
        for seq in sequence:
            if isinstance(seq, Sequence):
                _writeOneFASTA(seq, fh)
            else:
                # The entry is not a Sequence, so it may well lack getName();
                # fall back to its repr so the warning itself cannot fail.
                if hasattr(seq, "getName"):
                    label = seq.getName()
                else:
                    label = repr(seq)
                sys.stderr.write("Warning: could not write " + label + " (ignored).\n")

#------------------ Distrib -------------------

class Distrib(object):
    """Class for storing a multinomial probability distribution over the symbols in an alphabet"""
    def __init__(self, alpha, pseudo_count = 0.0):
        """Create a distribution over the symbols of alpha.
        alpha: the alphabet (anything offering the Alphabet interface)
        pseudo_count: initial count given to every symbol
        """
        self.alpha = alpha
        # Counts are kept as floats: with an integer pseudo-count the
        # divisions in getFreq would otherwise be integer divisions on
        # Python 2 and round every frequency down.
        pseudo_count = float(pseudo_count)
        self.tot = pseudo_count * self.alpha.getLen()
        self.cnt = [pseudo_count for _ in range( self.alpha.getLen() )]

    def __deepcopy__(self, memo):
        """Copy the counts; the alphabet object itself is shared, not copied."""
        dup = Distrib(self.alpha)
        dup.tot = copy.deepcopy(self.tot, memo)
        dup.cnt = copy.deepcopy(self.cnt, memo)
        return dup

    def count(self, syms = None ):
        """Count one observation of every symbol in syms.
        syms is iterated, so it may be a single-character string, a longer
        string or any other iterable of symbols. If it is left out, every
        symbol of the alphabet is counted once.
        """
        if syms is None:
            syms = self.alpha.getSymbols()
        for sym in syms:
            idx = self.alpha.getIndex( sym )
            self.cnt[idx] += 1.0
            self.tot += 1.0

    def complement(self):
        """Complement the counts, throw an error if this is impossible.
        Symbols without a complement keep their own count. Returns self."""
        if not self.alpha.isComplementable():
            raise RuntimeError("Attempt to complement a Distrib "
                    "based on a non-complementable alphabet.")
        coms = self.alpha.getComplements()
        new_count = []
        for idx, cidx in enumerate(coms):
            if cidx is None:
                cidx = idx
            new_count.append(self.cnt[cidx])
        self.cnt = new_count
        return self

    def reset(self):
        """Reset the distribution, that is, restart counting."""
        self.tot = 0.0
        self.cnt = [0.0 for _ in range( self.alpha.getLen() )]

    def getFreq(self, sym = None):
        """Determine the probability distribution from the current counts.
        The order in which probabilities are given follow the order of the symbols in the alphabet.
        With sym given, only the probability of that symbol is returned.
        Returns None while the total count is not positive."""
        if self.tot > 0:
            if sym is None:
                return tuple([ y / self.tot for y in self.cnt ])
            idx = self.alpha.getIndex( sym )
            return self.cnt[idx] / self.tot
        return None

    def pretty(self):
        """Retrieve the probabilites for all symbols and return as a pretty table (a list of text strings).
        Needs a positive total count, because getFreq returns None otherwise."""
        table = ["".join(["%4s " % s for s in self.alpha.getSymbols()])]
        table.append("".join(["%3.2f " % y for y in Distrib.getFreq(self)]))
        return table

    def getSymbols(self):
        """Get the symbols in the alphabet in the same order as probabilities are given."""
        return self.alpha.getSymbols()

    def getAlphabet(self):
        """Get the alphabet over which the distribution is defined."""
        return self.alpha

#------------------ Motif (and subclasses) -------------------

class Motif(object):
    """ Sequence motif class--defining a pattern that can be searched in sequences.
    Base class only: it stores the alphabet and a length and is meant to be
    extended by concrete motif types rather than used directly.
    """
    def __init__(self, alpha):
        # a fresh motif has no positions yet
        self.alpha = alpha
        self.len = 0

    def getLen(self):
        """Return the number of positions in the motif"""
        return self.len

    def getAlphabet(self):
        """Return the alphabet the motif is defined over"""
        return self.alpha

    def isAlphabet(self, seqstr):
        """Tell whether every symbol of seqstr belongs to the motif's alphabet.
        seqstr may be a Sequence (its string form is checked) or any iterable of symbols."""
        if type(seqstr) is Sequence:
            seqstr = seqstr.getString()
        alphabet = self.getAlphabet()
        return alphabet.isValidString(seqstr)

import re

class RegExp(Motif):
    """Motif whose pattern is given as a regular expression"""
    def __init__(self, alpha, re_string):
        Motif.__init__(self, alpha)
        # compiled once here, reused by every call to match()
        self.pattern = re.compile(re_string)

    def match(self, seq):
        """Generate the matches of the regular expression in a sequence.
        seq is a Sequence, or raw sequence data from which a Sequence over
        the motif's alphabet is built.
        Each hit is yielded as a tuple (position, match-sequence, score).
        A regular expression either matches or it does not, so the score of
        every reported hit is 1.0.
        Raises RuntimeError if the sequence uses symbols outside the alphabet.
        """
        if type(seq) is Sequence:
            myseq = seq
        else:
            myseq = Sequence(seq, self.alpha)
        mystr = myseq.getString()
        if not Motif.isAlphabet(self, mystr):
            raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName())
        for hit in self.pattern.finditer(mystr):
            yield (hit.start(), hit.group(), 1.0)

import math, time

# Variables used by the PWM for creating an EPS file
# _colour_def is one string of "/name [r g b] def" lines, one per named colour
# (each base colour plus a "light" variant).  It is the value stored under the
# 'COLORDEF' key of _eps_defaults.
# NOTE(review): the syntax looks like PostScript definitions -- confirm
# against the EPS template.
_colour_def = (
    "/black [0 0 0] def\n"
    "/red [0.8 0 0] def\n"
    "/green [0 0.5 0] def\n"
    "/blue [0 0 0.8] def\n"
    "/yellow [1 1 0] def\n"
    "/purple [0.8 0 0.8] def\n"
    "/magenta [1.0 0 1.0] def\n"
    "/cyan [0 1.0 1.0] def\n"
    "/pink [1.0 0.8 0.8] def\n"
    "/turquoise [0.2 0.9 0.8] def\n"
    "/orange [1 0.7 0] def\n"
    "/lightred [0.8 0.56 0.56] def\n"
    "/lightgreen [0.35 0.5 0.35] def\n"
    "/lightblue [0.56 0.56 0.8] def\n"
    "/lightyellow [1 1 0.71] def\n"
    "/lightpurple [0.8 0.56 0.8] def\n"
    "/lightmagenta [1.0 0.7 1.0] def\n"
    "/lightcyan [0.7 1.0 1.0] def\n"
    "/lightpink [1.0 0.9 0.9] def\n"
    "/lightturquoise [0.81 0.9 0.89] def\n"
    "/lightorange [1 0.91 0.7] def\n")
# _colour_dict is one string defining two symbol-to-colour tables for the
# symbols G, T, C, A and U: "fullColourDict" uses the base colours,
# "mutedColourDict" the light variants.  "colorDict" is bound to the full
# table.  It is the value stored under the 'COLORDICT' key of _eps_defaults.
_colour_dict = (
    "/fullColourDict <<\n"
    " (G)  orange\n"
    " (T)  green\n"
    " (C)  blue\n"
    " (A)  red\n"
    " (U)  green\n"
    ">> def\n"
    "/mutedColourDict <<\n"
    " (G)  lightorange\n"
    " (T)  lightgreen\n"
    " (C)  lightblue\n"
    " (A)  lightred\n"
    " (U)  lightgreen\n"
    ">> def\n"
    "/colorDict fullColourDict def\n")

# Default values for the {$NAME} placeholders of the EPS template.
# PWM.writeEPS works on a copy of this dict, adds the per-logo entries
# (creator, dates, sizes, ...) and writes the value found for each placeholder
# straight to the output, which is why every value is a string.
_eps_defaults = {
    'LOGOTYPE': 'NA',
    'FONTSIZE': '12',
    'TITLEFONTSIZE': '12',
    'SMALLFONTSIZE': '6',
    'TOPMARGIN': '0.9',
    'BOTTOMMARGIN': '0.9',
    'YAXIS': 'true',
    'YAXISLABEL': 'bits',
    'XAXISLABEL': '',
    'TITLE': '',
    'ERRORBARFRACTION': '1.0',
    'SHOWINGBOX': 'false',
    'BARBITS': '2.0',
    'TICBITS': '1',
    'COLORDEF': _colour_def,
    'COLORDICT': _colour_dict,
    'SHOWENDS': 'false',
    'NUMBERING': 'true',
    'OUTLINE': 'false',
}
class PWM(Motif):
    """This motif subclass defines a pattern in terms of a position weight matrix.
    An alphabet must be provided. A pseudo-count to be added to each count is
    optional (see setFromAlignment).  A uniform background distribution is used by default.
    """
    def __init__(self, alpha):
        Motif.__init__(self, alpha)                     # set alphabet of this multinomial distribution
        self.background = Distrib(alpha)                # the default background ...
        self.background.count(alpha.getSymbols())       # ... is uniform
        self.nsites = 0
        # Start out as an empty matrix so the accessors below work (and
        # report "no columns") before setFromAlignment has been called.
        self.cols = 0
        self.counts = []

    def setFromAlignment(self, aligned, pseudo_count = 0.0):
        """Set the probabilities in the PWM from an alignment.
        The alignment is a list of equal-length strings (see readStrings), OR
        a list of Sequence.
        pseudo_count is the initial count of every symbol in every column.
        Raises RuntimeError if a sequence does not fit the motif's alphabet
        or if the sequences differ in length. The PWM is only modified once
        the whole alignment has been checked.
        """
        alphabet = Motif.getAlphabet(self)
        seqs = []
        cols = -1       # -1 until the first sequence fixes the column count
        # Below we create a list of Sequence from the alignment,
        # while doing some error checking, and figure out the number of columns
        for s in aligned:
            if not isinstance(s, Sequence):
                # probably a text string, so we make a nameless sequence from it
                s = Sequence(s, alphabet)
            elif not Motif.isAlphabet(self, s):
                # it was a sequence, but the alphabet in this motif is not
                # able to process it
                raise RuntimeError("Motif alphabet is not valid for sequence " + s.getName())
            if cols == -1:
                cols = s.getLen()
            elif cols != s.getLen():
                raise RuntimeError("Sequences in alignment are not of equal length")
            seqs.append(s)
        if cols == -1:
            cols = 0    # empty alignment: an empty matrix, not a length of -1
        # One Distrib for each column of the alignment
        counts = [Distrib(alphabet, pseudo_count) for _ in range(cols)]
        # Next, we do the counting, column by column
        for c, column in enumerate(counts):
            for s in seqs:
                # count the symbol found at this position (row s, column c)
                column.count(s.getSite(c))
        self.nsites = len(seqs)
        self.cols = cols
        self.counts = counts
        # Update the length
        self.len = cols

    def reverseComplement(self):
        """Reverse complement the PWM in place and return it.
        The column order is reversed and every column is complemented once."""
        self.counts.reverse()
        for column in self.counts:
            column.complement()
        return self

    def getNSites(self):
        """Get the number of sites that made the PWM"""
        return self.nsites

    def setBackground(self, distrib):
        """Set the background distribution.
        Raises RuntimeError if distrib is defined over a different alphabet."""
        if not distrib.getAlphabet() == Motif.getAlphabet(self):
            raise RuntimeError("Incompatible alphabets")
        self.background = distrib

    def getFreq(self, col = None, sym = None):
        """Get the probabilities of the PWM.
        Without arguments: a list with one probability tuple per column.
        With col: the probabilities of that column, or with sym as well the
        probability of that single symbol in the column."""
        if col is None:
            return [y.getFreq() for y in self.counts]
        return self.counts[col].getFreq(sym)

    def pretty(self):
        """Retrieve the probabilites for all positions in the PWM as a pretty table (a list of text strings)"""
        table = []
        for row in PWM.getFreq(self):
            table.append("".join(["%8.6f " % y for y in row]))
        return table

    def logoddsPretty(self, bkg):
        """Retrieve the (base-2) log-odds for all positions in the PWM as a pretty table (a list of text strings).
        bkg holds the background probabilities, indexed in alphabet order.
        No smoothing is applied, so a probability of zero in the PWM makes
        math.log raise ValueError."""
        log2 = math.log(2)
        table = []
        for row in PWM.getFreq(self):
            table.append("".join(["%8.6f " % (math.log(row[i] / bkg[i]) / log2) for i in range(len(row))]))
        return table


    def consensus_sequence(self):
        """
        Get the consensus sequence corresponding to a PWM.
        Consensus sequence is the letter in each column
        with the highest probability. On a tie the letter that comes
        first in the alphabet is used.
        """
        alphabet = Motif.getAlphabet(self).getSymbols()
        letters = []
        for pos in range(self.cols):
            best_letter = alphabet[0]
            best_p = self.counts[pos].getFreq(best_letter)
            for letter in alphabet[1:]:
                p = self.counts[pos].getFreq(letter)
                if p > best_p:
                    best_p = p
                    best_letter = letter
            letters.append(best_letter)
        return "".join(letters)


    def consensus(self):
        """
        Get the consensus corresponding to a PWM.
        Consensus at each column of motif is a list of
        characters with non-zero probabilities.
        """
        symbols = Motif.getAlphabet(self).getSymbols()
        consensus = []
        for pos in range(self.cols):
            column = self.counts[pos]
            consensus.append([letter for letter in symbols if column.getFreq(letter) > 0])
        return consensus


    def getScore(self, seq, start):
        """Score this particular list of symbols using the PFM (background needs to be set separately).
        seq: a Sequence; start: index of the first symbol of the window.
        Returns the sum over the window of the natural-log odds of motif
        versus background probability. A window running past the end of the
        sequence is scored over the symbols that are available."""
        total = 0.0
        seqdata = seq.getSequence()[start : start+self.cols]
        for pos, sym in enumerate(seqdata):
            q = self.counts[pos].getFreq(sym)
            if q == 0:
                q = 0.0001 # to avoid log(0) == -Infinity
            total += math.log(q / self.background.getFreq(sym))
        return total

    def match(self, seq, _LOG0 = -10):
        """Find matches to the motif in a specified sequence.
        The method is a generator, hence subsequent hits can be retrieved using next().
        The returned result is a tuple (position, match-sequence, score).
        The optional parameter _LOG0 specifies a lower bound on reported logodds scores.
        Raises RuntimeError if the sequence does not fit the motif's alphabet.
        """
        myseq = seq
        if not isinstance(seq, Sequence):
            myseq = Sequence(seq, self.alpha)
        if not Motif.isAlphabet(self, myseq):
            raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName())
        # The last full window starts at getLen() - cols, hence the "+ 1":
        # without it the final position is never scored and a sequence exactly
        # as long as the motif can never match.
        for pos in range(myseq.getLen() - self.cols + 1):
            score = PWM.getScore(self, myseq, pos)
            if score > _LOG0:
                yield (pos, "".join(myseq.getSite(pos, self.cols)), score)

    def writeEPS(self, program, template_file, eps_fh,
            timestamp = None):
        """Write out a DNA motif to EPS format.
        program: name of the creating program, written into the output
        template_file: name of the EPS template; {$NAME} placeholders in it
                       are replaced and {$DATA} is replaced by the logo data
        eps_fh: open file (or file-like object) the result is written to
        timestamp: time tuple for the creation date; the current local time
                   is used when it is left out
        Raises RuntimeError if the template holds an unknown placeholder.
        """
        if timestamp is None:
            # Resolved per call: a default of time.localtime() in the
            # signature would be evaluated once, when the module is loaded.
            timestamp = time.localtime()
        small_dfmt = "%d.%m.%Y %H:%M"
        full_dfmt = "%d.%m.%Y %H:%M:%S %Z"
        small_date = time.strftime(small_dfmt, timestamp)
        full_date = time.strftime(full_dfmt, timestamp)
        points_per_cm = 72.0 / 2.54
        height = 4.5
        width = self.getLen() * 0.8 + 2
        width = min(30, width)
        points_height = int(height * points_per_cm)
        points_width = int(width * points_per_cm)
        defaults = _eps_defaults.copy()
        defaults['CREATOR'] = program
        defaults['CREATIONDATE'] = full_date
        defaults['LOGOHEIGHT'] = str(height)
        defaults['LOGOWIDTH'] = str(width)
        defaults['FINEPRINT'] = program + ' ' + small_date
        defaults['CHARSPERLINE'] = str(self.getLen())
        defaults['BOUNDINGHEIGHT'] = str(points_height)
        defaults['BOUNDINGWIDTH'] = str(points_width)
        defaults['LOGOLINEHEIGHT'] = str(height)
        m_var = re.compile(r"\{\$([A-Z]+)\}")
        with open(template_file, 'r') as template_fh:
            for line in template_fh:
                last = 0    # end of the text already copied from this line
                for match in m_var.finditer(line):
                    if last < match.start():
                        # literal text in front of the placeholder
                        eps_fh.write(line[last:match.start()])
                    key = match.group(1)
                    if key == "DATA":
                        self._writeEPSData(eps_fh)
                    elif key in defaults:
                        eps_fh.write(defaults[key])
                    else:
                        raise RuntimeError('Unknown variable "' + key +
                                '" in EPS template')
                    last = match.end()
                if last < len(line):
                    # literal text after the last placeholder
                    eps_fh.write(line[last:])

    def _writeEPSData(self, eps_fh):
        """Write the logo data that replaces {$DATA}: one symbol stack per column."""
        # Information content of a column without any preference.
        # NOTE(review): 2 bits corresponds to a 4-letter alphabet; confirm
        # before using this with other alphabets.
        alpha_ic = 2
        symbols = self.getAlphabet().getSymbols()
        eps_fh.write("\nStartLine\n")
        for pos in range(self.getLen()):
            eps_fh.write("({0:d}) startstack\n".format(pos+1))
            # symbols with a frequency of zero take no part in the stack
            present = []
            for sym in symbols:
                freq = self.getFreq(pos, sym)
                if freq == 0:
                    continue
                present.append((freq, sym))
            # calculate the stack information content
            h = 0
            for freq, sym in present:
                h -= (freq * math.log(freq, 2))
            stack_ic = alpha_ic - h
            # calculate the heights of each symbol, smallest first
            stack = sorted([(freq * stack_ic, sym) for freq, sym in present])
            # output the symbols
            for symh, sym in stack:
                eps_fh.write(" {0:f} ({1:s}) numchar\n".format(
                        symh, sym))
            eps_fh.write("endstack\n\n")
        eps_fh.write("EndLine\n")


#------------------ Main method -------------------
# Executed if you run this file from the operating system prompt, e.g.
# > python sequence.py

if __name__=='__main__':
    # Small demo: build a PWM from the alignment stored in the file 'tmp0'
    # and print its probability table.  The search loop below only does
    # something once 'seqs' is filled, e.g. by enabling the readFASTA line.
    alpha = getAlphabet('Extended DNA')
    #seqs = readFASTA('pos.fasta')
    seqs = []
    aln = readStrings('tmp0')
    # defined unconditionally: the loop below uses it as soon as seqs is filled
    regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].')
    pwm = PWM(alpha)
    pwm.setFromAlignment(aln)
    # every print gets a single pre-formatted string, so the output is the
    # same whether print is a statement or a function
    for row in pwm.pretty():
        print(row)
    for s in seqs:
        print("%s %s %s" % (s.getName(), s.getLen(), s.getAlphabet().getSymbols()))
        for m in regexp.match( s ):
            print("pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]))
        for m in pwm.match( s ):
            print("pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]))