view commons/tools/OrientSequences.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

"""
Interface to orient sequences before making a multiple alignment.
Use hashing or suffix tree to get an idea of the appropriate strand.
Use 'orienter' by default, otherwise use 'mummer'.
"""

import sys
import os
import glob
import getopt

from commons.core.seq.BioseqDB import BioseqDB
import pyRepet.seq.fastaDB
from commons.core.checker.CheckerUtils import CheckerUtils

class OrientSequences( object ):
    """
    Interface to orient sequences before making a multiple alignment.
    Use hashing or suffix tree to get an idea of the appropriate strand.
    Use 'orienter' by default, otherwise use 'mummer'.
    """
    
    def __init__(self, inFileName="", minMatchLength=10, prgToOrient = "orienter", outFileName="", clean=False, verbosity=1):
        """
        Constructor.
        """
        self._inFileName = inFileName
        self._minMatchLength = minMatchLength
        self._prgToOrient = prgToOrient
        self._outFileName = outFileName
        self._clean = clean
        self._verbose = verbosity
        
    def help( self ):
        """
        Display the help on stdout.
        """
        print
        print "usage:",sys.argv[0].split("/")[-1],"[options]"
        print "options:"
        print "     -h: this help"
        print "     -i: name of the input file (format='fasta')"
        print "     -m: minimum match length (default=10)"
        print "     -p: program to use first (default=orienter/mummer)"
        print "     -o: name of the output file (default=inFileName+'.oriented')"
        print "     -c: clean"
        print "     -v: verbosity level (0/default=1/2)"
        print
        
    def setAttributesFromCmdLine( self ):
        """
        Set the attributes from the command-line.
        """
        try:
            opts, args = getopt.getopt(sys.argv[1:],"hi:m:p:o:cv:")
        except getopt.GetoptError, err:
            print str(err); self.help(); sys.exit(1)
        for o,a in opts:
            if o == "-h":
                self.help(); sys.exit(0)
            elif o == "-i":
                self.setInputFileName( a )
            elif o == "-m":
                self.setMinMatchLength( a )
            elif o == "-p":
                self.setPrgToOrient( a )
            elif o == "-o":
                self.setOutputFileName( a )
            elif o == "-c":
                self.setClean()
            elif o == "-v":
                self.setVerbosityLevel( a )
                
    def setInputFileName( self, inFileName ):
        self._inFileName = inFileName
        
    def setMinMatchLength( self, minMatchLength ):
        self._minMatchLength = int(minMatchLength)
        
    def setPrgToOrient( self, prgToOrient ):
        self._prgToOrient = prgToOrient
        
    def setOutputFileName( self, outFileName ):
        self._outFileName = outFileName
        
    def setClean( self ):
        self._clean = True
        
    def setVerbosityLevel( self, verbose ):
        self._verbose = int(verbose)
        
    def checkAttributes( self ):
        """
        Check the attributes are valid before running the algorithm.
        """
        if self._inFileName == "":
            print "ERROR: missing input file name"
            self.help(); sys.exit(1)
        if not os.path.exists( self._inFileName ):
            print "ERROR: input file '%s' doesn't exist" % ( self._inFileName )
            self.help(); sys.exit(1)
        if self._prgToOrient not in [ "orienter", "mummer" ]:
            print "ERROR: unknown program '%s'" % ( self._prgToOrient )
            self.help(); sys.exit(1)
        if self._outFileName == "":
            self._outFileName = "%s.oriented" % ( self._inFileName )
            
    def useOrienter( self ):
        """
        Use 'orienter'.
        @return: exit value of 'orienter'
        """
        prg = "orienter"
        cmd = prg
        cmd += " -k"
#        cmd += " -l %i" % ( self._minMatchLength )
        cmd += " -v %i" % ( self._verbose )
        cmd += " %s" % ( self._inFileName )
        log = os.system( cmd )
        if log == 0 and self._outFileName.split(".")[-1] != "oriented":
            os.rename( self._inFileName + ".oriented", self._outFileName )
        return log
    
    def compareInputSequencesWithMummer( self, nbInSeq ):
        """
        Launch MUmmer on two single-sequence fasta files to find all maximal matches regardless of their uniqueness and record stdout.
        Only N(N-1)/2 comparisons are made.
        @param nbInSeq: number of input sequences
        @type nbInSeq: integer
        """
        if self._verbose > 0:
            print "aligning input sequences..."
            sys.stdout.flush()
        if not CheckerUtils.isExecutableInUserPath( "mummer" ):
            msg = "ERROR: 'mummer' is not in your PATH"
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit(1)
        lInFiles = glob.glob( "batch_*.fa" )
        for i in range( 1, nbInSeq+1 ):
            for j in range( i+1, nbInSeq+1 ):
                if self._verbose > 1:
                    print "launch MUmmer on %i versus %i" % ( i, j )
                    sys.stdout.flush()
                prg = "mummer"
                cmd = prg
                cmd += " -maxmatch"
                cmd += " -l %i" % ( self._minMatchLength )
                cmd += " -b"
                cmd += " -F"
                cmd += " batch_%s.fa" % ( str(j).zfill( len( str( len(lInFiles) ) ) ) )
                cmd += " batch_%s.fa" % ( str(i).zfill( len( str( len(lInFiles) ) ) ) )
                cmd += " > mummer_%i_vs_%i.txt" % ( i, j )
                if self._verbose < 3:
                    cmd += " 2> /dev/null"
                returnStatus = os.system( cmd )
                if returnStatus != 0:
                    msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
                    sys.stderr.write( "%s\n" % ( msg ) )
                    sys.exit(1)
                    
    def parseMummerOutput( self, mummerFileName, nameSeq1, nameSeq2 ):
        """
        Parse the output from MUmmer.
        @param mummerFileName: file with the output from MUmmer
        @type mummerFileName: string
        @param nameSeq1: name of the first sequence in the pairwise comparison
        @type nameSeq1: string
        @param nameSeq2: name of the first sequence in the pairwise comparison
        @type nameSeq2: string
        @return: dictionary whose keys are strands and values the number of maximal matches
        """
        if self._verbose > 1:
            print "parse '%s'" % ( mummerFileName )
            sys.stdout.flush()
        dStrand2NbMatches = {}
        mummerF = open( mummerFileName, "r" )
        strand = "direct"
        nbMatches = 0
        line = mummerF.readline()
        while True:
            if line == "":
                break
            if line[0] == ">":
                if nameSeq1 not in line:
                    print "ERROR"
                    print nameSeq1
                    print nameSeq2
                    sys.exit(1)
                if "Reverse" in line:
                    dStrand2NbMatches[ strand ] = nbMatches
                    strand = "reverse"
                    nbMatches = 0
            else:
                nbMatches += 1
            line = mummerF.readline()
        dStrand2NbMatches[ strand ] = nbMatches
        mummerF.close()
        if self._verbose > 1:
            print " direct: %i maximal matches" % ( dStrand2NbMatches["direct"] )
            print " reverse: %i maximal matches" % ( dStrand2NbMatches["reverse"] )
            sys.stdout.flush()
        return dStrand2NbMatches
    
    def getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( self, lInHeaders, nbInSeq ):
        """
        For each pairwise comparison, retrieve the number of matches on both strand.
        @param lInHeaders: list of the sequence headers
        @type lInHeaders: list of strings
        @param nbInSeq: number of input sequences
        @type nbInSeq: integer
        @return: dictionary whose keys are pairwise comparisons and values are number of matches on both strands
        """
        dMatrix = {}
        for i in range( 1, nbInSeq+1 ):
            for j in range( i+1, nbInSeq+1 ):
                pairComp = "%i_vs_%i" % ( i, j )
                dStrand2NbMatches = self.parseMummerOutput( "mummer_%s.txt" % ( pairComp ), lInHeaders[i-1], lInHeaders[j-1] )
                dMatrix[ pairComp ] = dStrand2NbMatches
        return dMatrix
    
    def showResultsAsOrienter( self, nbInSeq, dMatrix ):
        """
        Not used for the moment but can be useful...
        """
        for i in range( 1, nbInSeq+1 ):
            for j in range( i+1, nbInSeq+1 ):
                pairComp = "%i_vs_%i" % ( i, j )
                string = "aligning %i with %i" % ( i, j )
                string += " %i %i" % ( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] )
                string += " max=%i" % ( max( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] ) )
                if dMatrix[pairComp]["reverse"] > dMatrix[pairComp]["direct"]:
                    string += " strand=-"
                    string += " nb=%i" % ( dMatrix[pairComp]["reverse"] )
                else:
                    string += " strand=+"
                    string += " nb=%i" % ( dMatrix[pairComp]["direct"] )
                print string; sys.stdout.flush()
                
    def getSequencesToReverseFromMatrix( self, dMatrix, lNewHeaders ):
        """
        Retrieve the sequences than need to be re-oriented.
        @param dMatrix: 
        @type dMatrix: 
        @param lInHeaders: list of the new sequence headers
        @type lInHeaders: list of strings
        @return: list of headers corresponding to sequences than need to be re-oriented
        """
        lSeqToOrient = []
        
        for i in range( 1, len(lNewHeaders)+1 ):
            for j in range( i+1, len(lNewHeaders)+1 ):
                comp = "%i_vs_%i" % ( i, j )
                nbDirectMatches = dMatrix[ comp ][ "direct" ]
                nbReverseMatches = dMatrix[ comp ][ "reverse" ]
                if self._verbose > 1:
                    print "%s: direct=%i reverse=%i" % ( comp, nbDirectMatches, nbReverseMatches )
                if nbReverseMatches > nbDirectMatches and lNewHeaders[i-1] not in lSeqToOrient:
                    if lNewHeaders[ j-1 ] not in lSeqToOrient:
                        if self._verbose > 2:
                            "need to reverse '%s'" % ( lNewHeaders[j-1] )
                        lSeqToOrient.append( lNewHeaders[ j-1 ] )
        return lSeqToOrient
    
    
    def getSequencesToReverse( self ):
        """
        Return a list with the headers of the sequences to reverse. 
        """
        lSequenceHeadersToReverse = []
        
        if self._prgToOrient == "orienter":
            exitStatus = self.useOrienter()
            if exitStatus == 0:
                self.end()
                sys.exit(0)
            if exitStatus != 0:
                print "\nWARNING: 'orienter' had a problem, switching to 'mummer'"
                sys.stdout.flush()
                
        lInHeaders = pyRepet.seq.fastaDB.dbHeaders( self._inFileName )
        nbInSeq = len( lInHeaders )
        if self._verbose > 0:
            print "nb of input sequences: %i" % ( nbInSeq )
            sys.stdout.flush()
            
        pyRepet.seq.fastaDB.shortenSeqHeaders( self._inFileName, 1 )
        tmpFileName = "%s.shortH" % ( self._inFileName )
        lNewHeaders = pyRepet.seq.fastaDB.dbHeaders( tmpFileName )
        dNew2Init = pyRepet.seq.fastaDB.retrieveLinksNewInitialHeaders( "%slink" % ( tmpFileName ) )
        
        pyRepet.seq.fastaDB.dbSplit( tmpFileName, nbSeqPerBatch=1, newDir=True )
        os.chdir( "batches" )
        self.compareInputSequencesWithMummer( nbInSeq )
        dMatrix = self.getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( lNewHeaders, nbInSeq )
        os.chdir( ".." )
        
        lNewHeadersToReverse = self.getSequencesToReverseFromMatrix( dMatrix, lNewHeaders )
        for newH in lNewHeadersToReverse:
            lSequenceHeadersToReverse.append( dNew2Init[ newH ] )
        if self._verbose > 0:
            print "nb of sequences to reverse: %i" % ( len(lNewHeadersToReverse) )
            for initH in lSequenceHeadersToReverse: print " %s" % ( initH )
            sys.stdout.flush()
            
        if self._clean:
            os.remove( tmpFileName )
            os.remove( "%slink" % ( tmpFileName ) )
            
        return lSequenceHeadersToReverse
    
    def orientInputSequences( self, lSequenceHeadersToReverse, tmpFileName="" ):
        """
        Save input sequences while re-orienting those needing it.
        @param lSequenceHeadersToReverse: list of headers corresponding to sequences than need to be re-oriented
        @type lSequenceHeadersToReverse: list of strings
        @param tmpFileName: name of a fasta file (inFileName by default)
        @type tmpFileName: string
        """
        if self._verbose > 0:
            print "saving oriented sequences..."
            sys.stdout.flush()
        if tmpFileName == "":
            tmpFileName = self._inFileName
        inDB = BioseqDB( tmpFileName )
        outDB = BioseqDB()
        for bs in inDB.db:
            if bs.header in lSequenceHeadersToReverse:
                bs.reverseComplement()
                bs.header += " re-oriented"
            outDB.add( bs )
        outDB.save( self._outFileName )
        
    def clean( self ):
        if os.path.exists( "batches" ):
            os.system( "rm -rf batches" )
        if os.path.exists( "orienter_error.log" ):
            os.remove( "orienter_error.log" )
        for f in glob.glob( "core.*" ):
            os.remove( f )
            
    def start( self ):
        """
        Useful commands before running the program.
        """
        self.checkAttributes()
        if self._verbose > 0:
            print "START %s" % ( type(self).__name__ )
            print "input file: %s" % ( self._inFileName )
            sys.stdout.flush()
            
    def end( self ):
        """
        Useful commands before ending the program.
        """
        if self._clean:
            self.clean()
        if self._verbose > 0:
            print "END %s" % ( type(self).__name__ )
            sys.stdout.flush()
            
    def run( self ):
        """
        Run the program.
        """
        self.start()
        lSequenceHeadersToReverse = self.getSequencesToReverse()
        self.orientInputSequences( lSequenceHeadersToReverse )
        self.end()
        
if __name__ == "__main__":
    i = OrientSequences()
    i.setAttributesFromCmdLine()
    i.run()