view commons/core/coord/MatchUtils.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents 769e306b7933
children

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import math
import os
import sys
from commons.core.coord.Match import Match
from commons.core.checker.RepetException import RepetException

## Static methods for the manipulation of Match instances
#
class MatchUtils ( object ):
    
    ## Return a list with Match instances from the given file
    #
    # @param inFile name of a file in the Match format
    # @return a list of Match instances
    #
    def getMatchListFromFile(inFile ):
        lMatchInstances = []
        inFileHandler = open( inFile, "r" )
        while True:
            line = inFileHandler.readline()
            if line == "":
                break
            if line[0:10] == "query.name":
                continue  # skip header line(s)
            m = Match()
            m.setFromString( line )
            lMatchInstances.append( m )
        inFileHandler.close()
        return lMatchInstances
    
    getMatchListFromFile = staticmethod( getMatchListFromFile )
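
    # Illustrative usage (not part of the original module), assuming 'matches.tab' is a
    # tab-separated file in the Match format produced by Matcher:
    #   lMatches = MatchUtils.getMatchListFromFile( "matches.tab" )
    #   print "%i matches loaded" % len(lMatches)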
    
    ##  Split a Match list into several Match lists according to the subject
    #
    #  @param lMatches a list of Match instances
    #  @return a dictionary whose keys are subject names and whose values are lists of Match instances
    #
    def getDictOfListsWithSubjectAsKey( lMatches ):
        dSubject2MatchList = {}
        for iMatch in lMatches:
            if not dSubject2MatchList.has_key( iMatch.range_subject.seqname ):
                dSubject2MatchList[ iMatch.range_subject.seqname ] = []
            dSubject2MatchList[ iMatch.range_subject.seqname ].append( iMatch )
        return dSubject2MatchList
    
    getDictOfListsWithSubjectAsKey = staticmethod( getDictOfListsWithSubjectAsKey )
    
    ##  Split a Match list into several Match lists according to the query
    #
    #  @param lMatches a list of Match instances
    #  @return a dictionary whose keys are query names and whose values are lists of Match instances
    #
    def getDictOfListsWithQueryAsKey ( lMatches ):
        dQuery2MatchList = {}
        for iMatch in lMatches:
            if not dQuery2MatchList.has_key( iMatch.range_query.seqname ):
                dQuery2MatchList[ iMatch.range_query.seqname ] = []
            dQuery2MatchList[ iMatch.range_query.seqname ].append( iMatch )
        return dQuery2MatchList
    
    getDictOfListsWithQueryAsKey = staticmethod( getDictOfListsWithQueryAsKey )   
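
    # Illustrative usage (not part of the original module), assuming 'lMatches' was loaded
    # with getMatchListFromFile():
    #   dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey( lMatches )
    #   dSbj2Matches = MatchUtils.getDictOfListsWithSubjectAsKey( lMatches )
    #   for qryName in dQry2Matches.keys():
    #       print qryName, len( dQry2Matches[ qryName ] )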
         
    ## Write the Match instances contained in the given list
    #
    # @param lMatches a list of Match instances
    # @param fileName name of the file to write the Match instances
    # @param mode the open mode of the file, "w" or "a"
    # @param header an optional header string written before the matches
    #
    def writeListInFile( lMatches, fileName, mode="w", header=None ):
        fileHandler = open( fileName, mode )
        if header:
            fileHandler.write( header )
        for iMatch in lMatches:
            iMatch.write( fileHandler )
        fileHandler.close()
        
    writeListInFile = staticmethod( writeListInFile )
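
    # Illustrative usage (not part of the original module): appending matches to an existing
    # file (the file name is an assumed example):
    #   MatchUtils.writeListInFile( lMatches, "all_matches.tab", mode="a" )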

    ## Return the list of path ids from a list of Match instances
    #
    # @param lMatch list of Match instances
    #
    # @return lId list of integer path ids
    #
    def getIdListFromMatchList(lMatch):
        lId = []
        for iMatch in lMatch:
            lId.append(iMatch.id)
        return lId
    
    getIdListFromMatchList = staticmethod(getIdListFromMatchList)
    
    ## Remove duplicated matches in a match list
    ## Replaces the old PyRepet.MatchDB.rmvDoublons()
    #
    # @param lMatch list of Match instances
    #
    # @return lMatchesUniq list of unique Match instances
    #
    def rmvDuplicateMatches(lMatch):
        lMatchesUniq = []
        for match in lMatch:
            if len(lMatchesUniq) == 0:
                lMatchesUniq.append( match )
            else:
                nbDoublons = 0
                for m in lMatchesUniq:
                    if match.isDoublonWith( m ):
                        nbDoublons += 1
                if nbDoublons == 0:
                    lMatchesUniq.append( match )
                    
        for match1 in lMatchesUniq:
            for match2 in lMatchesUniq:
                if match1.id != match2.id:
                    if match1.isDoublonWith( match2 ):
                        raise RepetException ( "*** Error: doublon not removed" )
        return lMatchesUniq
    
    rmvDuplicateMatches = staticmethod(rmvDuplicateMatches)
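
    # Illustrative usage (not part of the original module), assuming 'lMatches' comes from
    # getMatchListFromFile(); a RepetException is raised if a duplicate remains:
    #   lMatchesUniq = MatchUtils.rmvDuplicateMatches( lMatches )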

    ## Return the list of query names to keep, i.e. queries without any match 'included' in a subject
    ## above the identity and length thresholds, when two different databanks are used.
    ## Replaces the old pyRepet.MatchDB.filterDiffQrySbj()
    #
    # @param iBioseqDB BioseqDB databank of the queries
    #
    # @param matchFile name of a file in the Match format
    #
    # @param thresIdentity float identity threshold
    #
    # @param thresLength float query length threshold
    #
    # @param verbose int verbosity level
    #
    # @return lQryToKeep list of query names kept according to the length and identity thresholds
    # TODO: do not take into account a match of a sequence against itself?
    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity=0.95, thresLength=0.98, verbose=0 ):
        if verbose > 0:
            print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % ( thresIdentity, thresLength ); sys.stdout.flush()

        thresIdentityPerc = math.floor( thresIdentity*100 )
        lQryToKeep = []
        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))

        for seqH in iBioseqDB.idx.keys():
            # keep it if it has no match
            if not dQry2Matches.has_key( seqH ):
                if seqH not in lQryToKeep:
                    lQryToKeep.append( seqH )
            else:
                isConditionsMet = False
                for match in dQry2Matches[ seqH ]:
                    # check if they are above the thresholds
                    if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
                        isConditionsMet = True
                        break
                if not isConditionsMet and  seqH not in lQryToKeep:
                    lQryToKeep.append( seqH )
        return lQryToKeep
    
    filterDiffQrySbj = staticmethod(filterDiffQrySbj)
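
    # Illustrative usage (not part of the original module): 'iBioseqDB' is assumed to be a
    # BioseqDB instance indexing the query sequences, and 'matches.tab' an assumed Match
    # file name:
    #   lQryToKeep = MatchUtils.filterDiffQrySbj( iBioseqDB, "matches.tab",
    #                                             thresIdentity=0.95, thresLength=0.98 )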
    
    ## Count the number of distinct sequences (queries or subjects) involved in at least one match above the thresholds.
    ## Replaces the old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and its query counterpart
    #
    # @param lMatches a list of Match instances
    #
    # @param thresIdentity float identity threshold
    #
    # @param thresLength float length threshold
    #
    # @param whatToCount "query" or "subject", the type of sequence to count
    #
    # @return the number of distinct sequences with at least one match above both thresholds
    #
    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity=0.95, thresLength=0.98, whatToCount="query" ):
        thresIdentityPerc = math.floor( thresIdentity*100 )
        countSbj = 0
        if whatToCount.lower() == "query":
            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
        else:
            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
            
        for qry in dMatches.keys():
            countMatch = 0
            for match in dMatches[ qry ]:
                
                if match.identity >= thresIdentityPerc and getattr(match,whatToCount.lower() +"_length_perc") >= thresLength:
                    countMatch += 1
            if countMatch > 0:
                countSbj += 1
        return countSbj
    
    getNbDistinctSequencesInsideMatchesWithThresh = staticmethod(getNbDistinctSequencesInsideMatchesWithThresh)
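
    # Illustrative usage (not part of the original module): counting the distinct subjects
    # involved in at least one match above both thresholds:
    #   nbSbj = MatchUtils.getNbDistinctSequencesInsideMatchesWithThresh( lMatches,
    #       thresIdentity=0.95, thresLength=0.98, whatToCount="subject" )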
    
    ## Convert a 'match' file (output from Matcher) into an 'align' file
    ## Replaces the old parser.tab2align
    #
    # @param inFileName a string input file name
    #
    def convertMatchFileToAlignFile(inFileName):
        basename = os.path.splitext(inFileName)[0]
        outFileName = "%s.align" % basename
        outFile = open(outFileName, "w")
        
        lMatches = MatchUtils.getMatchListFromFile(inFileName) 
        
        for match in lMatches:
            string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity() )
            outFile.write( string )
            
        outFile.close()
        
    convertMatchFileToAlignFile = staticmethod(convertMatchFileToAlignFile)
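
    # Illustrative usage (not part of the original module): converting an assumed file
    # 'matches.tab' writes 'matches.align' beside it:
    #   MatchUtils.convertMatchFileToAlignFile( "matches.tab" )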
    
    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
    # The coverage on the query is used as the arc weight
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverage float query coverage filter threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
        with open(matchFileName) as inF:
            with open(outFileName, "w") as outF:
                inF.readline()  # skip the header line
                inLine = inF.readline()
                while inLine:
                    splittedLine = inLine.split("\t")
                    # column 0 is 'query.name', column 4 'query.length.%', column 6 'subject.name'
                    if float(splittedLine[4]) >= coverage:
                        outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
                        outLine += "\n"
                        outF.write(outLine)
                    inLine = inF.readline()
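
    # Illustrative usage (not part of the original module): keeping only matches covering at
    # least 80% of their query (the file names are assumed examples) in the three-column
    # 'abc' format read by MCL:
    #   MatchUtils.convertMatchFileIntoABCFileOnQueryCoverage( "matches.tab", "matches.abc", coverage=0.8 )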

    ## Renumber the path ids when the input file is the concatenation of several 'Match' files, and remove the extra header lines.
    ## Replaces the old parser.tabnum2id
    #
    # @param fileName a string input file name
    # @param outputFileName a string output file name (optional; the input file is overwritten if not given)
    #
    def generateMatchFileWithNewPathId(fileName, outputFileName=None):
        # Read the matches before opening the output file: when no output file name
        # is given, the input file itself is rewritten, so opening it in "w" mode
        # first would truncate it before it could be read.
        lMatches = MatchUtils.getMatchListFromFile(fileName)
        if outputFileName is None:
            outFile = open(fileName, "w")
        else:
            outFile = open(outputFileName, "w")
        outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")

        count = 1
        dMatchKeyIdcount = {}

        for match in lMatches:
            key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
            if key_id not in dMatchKeyIdcount:
                newPath = count
                count += 1
                dMatchKeyIdcount[ key_id ] = newPath
            else:
                newPath = dMatchKeyIdcount[ key_id ]

            match.id = newPath
            outFile.write( match.toString()+"\n" )
        outFile.close()
        
    generateMatchFileWithNewPathId = staticmethod(generateMatchFileWithNewPathId)
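
    # Illustrative usage (not part of the original module): renumbering the path ids of a
    # concatenated Match file into a new file (the file names are assumed examples):
    #   MatchUtils.generateMatchFileWithNewPathId( "concatenated_matches.tab", "renumbered_matches.tab" )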