diff TEisotools-1.1.a/commons/core/coord/MatchUtils.py @ 16:836ce3d9d47a draft default tip

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:42:47 -0400
parents 255c852351c5
children
line wrap: on
line diff
--- a/TEisotools-1.1.a/commons/core/coord/MatchUtils.py	Thu Jul 21 07:36:44 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,316 +0,0 @@
-# Copyright INRA (Institut National de la Recherche Agronomique)
-# http://www.inra.fr
-# http://urgi.versailles.inra.fr
-#
-# This software is governed by the CeCILL license under French law and
-# abiding by the rules of distribution of free software.  You can  use,
-# modify and/ or redistribute the software under the terms of the CeCILL
-# license as circulated by CEA, CNRS and INRIA at the following URL
-# "http://www.cecill.info".
-#
-# As a counterpart to the access to the source code and  rights to copy,
-# modify and redistribute granted by the license, users are provided only
-# with a limited warranty  and the software's author,  the holder of the
-# economic rights,  and the successive licensors  have only  limited
-# liability.
-#
-# In this respect, the user's attention is drawn to the risks associated
-# with loading,  using,  modifying and/or developing or reproducing the
-# software by the user in light of its specific status of free software,
-# that may mean  that it is complicated to manipulate,  and  that  also
-# therefore means  that it is reserved for developers  and  experienced
-# professionals having in-depth computer knowledge. Users are therefore
-# encouraged to load and test the software's suitability as regards their
-# requirements in conditions enabling the security of their systems and/or
-# data to be ensured and,  more generally, to use and operate it in the
-# same conditions as regards security.
-#
-# The fact that you are presently reading this means that you have had
-# knowledge of the CeCILL license and that you accept its terms.
-
-import os
-import sys
-import math
-from commons.core.coord.Match import Match
-from commons.core.utils.FileUtils import FileUtils
-from commons.core.checker.RepetException import RepetException
-
-# # Static methods for the manipulation of Match instances
-#
-class MatchUtils (object):
-
-    # # Return a list with Match instances from the given file
-    #
-    # @param inFile name of a file in the Match format
-    # @return a list of Match instances
-    #
-    @staticmethod
-    def getMatchListFromFile(inFile):
-        lMatchInstances = []
-        inFileHandler = open(inFile, "r")
-        while True:
-            line = inFileHandler.readline()
-            if line == "":
-                break
-            if line[0:10] == "query.name":
-                continue
-            m = Match()
-            m.setFromString(line)
-            lMatchInstances.append(m)
-        inFileHandler.close()
-        return lMatchInstances
-
-    # #  Split a Match list in several Match lists according to the subject
-    #
-    #  @param lMatches a list of Match instances
-    #  @return a dictionary which keys are subject names and values Match lists
-    #
-    @staticmethod
-    def getDictOfListsWithSubjectAsKey(lMatches):
-        dSubject2MatchList = {}
-        for iMatch in lMatches:
-            if not dSubject2MatchList.has_key(iMatch.range_subject.seqname):
-                dSubject2MatchList[ iMatch.range_subject.seqname ] = []
-            dSubject2MatchList[ iMatch.range_subject.seqname ].append(iMatch)
-        return dSubject2MatchList
-
-    # #  Split a Match list in several Match lists according to the query
-    #
-    #  @param lMatches a list of Match instances
-    #  @return a dictionary which keys are query names and values Match lists
-    #
-    @staticmethod
-    def getDictOfListsWithQueryAsKey (lMatches):
-        dQuery2MatchList = {}
-        for iMatch in lMatches:
-            if not dQuery2MatchList.has_key(iMatch.range_query.seqname):
-                dQuery2MatchList[ iMatch.range_query.seqname ] = []
-            dQuery2MatchList[ iMatch.range_query.seqname ].append(iMatch)
-        return dQuery2MatchList
-
-    # # Write Match instances contained in the given list
-    #
-    # @param lMatches a list of Match instances
-    # @param fileName name of the file to write the Match instances
-    # @param mode the open mode of the file ""w"" or ""a""
-    #
-    @staticmethod
-    def writeListInFile(lMatches, fileName, mode = "w", header = None):
-        fileHandler = open(fileName, mode)
-        if header:
-            fileHandler.write(header)
-        for iMatch in lMatches:
-            iMatch.write(fileHandler)
-        fileHandler.close()
-
-    # # Give path id list from a list of Match instances
-    #
-    # @param lMatch list of Match instances
-    # @return lId integer list
-    #
-    @staticmethod
-    def getIdListFromMatchList(lMatch):
-        lId = []
-        for iMatch in lMatch:
-            lId.append(iMatch.id)
-        return lId
-
-    # # Remove duplicated matches in a match list
-    # ## replace old PyRepet.MatchDB.rmvDoublons()
-    #
-    # @param lMatch list of Match instances
-    # @return lMatchesUniq match unique list
-    #
-    @staticmethod
-    def rmvDuplicateMatches(lMatch):
-        lMatchesUniq = []
-        for match in lMatch:
-            if len(lMatchesUniq) == 0:
-                lMatchesUniq.append(match)
-            else:
-                nbDoublons = 0
-                for m in lMatchesUniq:
-                    if match.isDoublonWith(m):
-                        nbDoublons += 1
-                if nbDoublons == 0:
-                    lMatchesUniq.append(match)
-
-        for match1 in lMatchesUniq:
-            for match2 in lMatchesUniq:
-                if match1.id != match2.id:
-                    if match1.isDoublonWith(match2):
-                        raise RepetException ("*** Error: doublon not removed")
-        return lMatchesUniq
-
-    # # Return the list of queries 'included' in subjects when two different databanks are used.
-    # #replace old pyRepet.MatchDB.filterDiffQrySbj()
-    #
-    # @param iBioseqDB bioseqDB databank of queries
-    # @param thresIdentity float identity threshold
-    # @param thresLength float length threshold
-    # @param verbose int verbosity
-    #
-    # @return lMatches match list to keep according to length and identity thresholds
-    # TODO: don't take into account match for sequence against itself. To do ?
-    @staticmethod
-    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0):
-        if verbose > 0:
-            print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength); sys.stdout.flush()
-
-        thresIdentityPerc = math.floor(thresIdentity * 100)
-        lQryToKeep = []
-        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))
-
-        for seqH in iBioseqDB.idx.keys():
-            # keep it if it has no match
-            if not dQry2Matches.has_key(seqH):
-                if seqH not in lQryToKeep:
-                    lQryToKeep.append(seqH)
-            else:
-                isConditionsMet = False
-                for match in dQry2Matches[ seqH ]:
-                    # check if they are above the thresholds
-                    if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
-                        isConditionsMet = True
-                        break
-                if not isConditionsMet and  seqH not in lQryToKeep:
-                    lQryToKeep.append(seqH)
-        return lQryToKeep
-
-    # # Count the number of distinct matches involved in at least one match above the thresholds.
-    # #replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres()
-    #
-    # @param thresIdentity float identity threshold
-    # @param thresLength float length threshold
-    #
-    @staticmethod
-    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"):
-        thresIdentityPerc = math.floor(thresIdentity * 100)
-        countSbj = 0
-        if whatToCount.lower() == "query":
-            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
-        else:
-            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
-
-        for qry in dMatches.keys():
-            countMatch = 0
-            for match in dMatches[ qry ]:
-
-                if match.identity >= thresIdentityPerc and getattr(match, whatToCount.lower() + "_length_perc") >= thresLength:
-                    countMatch += 1
-            if countMatch > 0:
-                countSbj += 1
-        return countSbj
-
-    # # Convert a 'match' file (output from Matcher) into an 'align' file
-    # # replace old parser.tab2align
-    #
-    # @param inFileName  a string input file name
-    #
-    @staticmethod
-    def convertMatchFileToAlignFile(inFileName):
-        basename = os.path.splitext(inFileName)[0]
-        outFileName = "%s.align" % basename
-        outFile = open(outFileName, "w")
-
-        lMatches = MatchUtils.getMatchListFromFile(inFileName)
-
-        for match in lMatches:
-            string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity())
-            outFile.write(string)
-
-        outFile.close()
-
-    # # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
-    # for each match, compute coverage on the smallest seq using this expression
-    #           Min (|QMatchEnd - QMatchStart + 1|,|SMatchEnd - SMatchStart + 1|)
-    #coverage = -----------------------------------------------------------------
-    #                             Min (QLength,SLength)
-    #
-    # Use this coverage for arc value. Matches of a sequence on itself are filtered.
-    #
-    # @param matchFileName string input match file name
-    # @param outFileName string output abc file name
-    # @param coverage float coverage filter threshold
-    #
-    @staticmethod
-    def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0):
-        sSeqNames = set()
-        sSeqNamesUsed = set()
-        with open(outFileName, "w") as outF:
-            with open(matchFileName) as inF:
-                inF.readline()
-                for inLine in inF:
-                    splittedLine = inLine.split("\t")
-                    qName = splittedLine[0]
-                    sName = splittedLine[6]
-                    sSeqNames.add(qName)
-                    sSeqNames.add(sName)
-                    if qName != sName:
-                        matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1))
-                        seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10]))
-                        coverage = round(matchMin / seqMin, 2)
-                        if coverage >= coverageThreshold:
-                            outF.write("%s\n" % "\t".join([qName, sName, str(coverage)]))
-                            sSeqNamesUsed.add(qName)
-                            sSeqNamesUsed.add(sName)
-
-        with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF:
-            for seqName in sorted(sSeqNames - sSeqNamesUsed):
-                outSeqNameOfUnusedMatchesF.write("%s\n" % seqName)
-
-        if FileUtils.isEmpty(outFileName):
-            print "WARNING: '%s' is empty." % outFileName
-
-    # # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
-    # Use coverage on query for arc value
-    #
-    # @param matchFileName string input match file name
-    # @param outFileName string output abc file name
-    # @param coverage float query coverage filter threshold
-    #
-    @staticmethod
-    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
-        with open(matchFileName) as inF:
-                with open(outFileName, "w") as outF:
-                    inF.readline()
-                    for inLine in inF:
-                        splittedLine = inLine.split("\t")
-                        if float(splittedLine[4]) >= coverage:
-                            outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
-                            outLine += "\n"
-                            outF.write(outLine)
-
-        if FileUtils.isEmpty(outFileName):
-            print "WARNING: '%s' is empty." % outFileName
-
-    # # Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines.
-    # # replace old parser.tabnum2id
-    #
-    # @param fileName  a string input file name
-    # @param  outputFileName  a string output file name (optional)
-    #
-    @staticmethod
-    def generateMatchFileWithNewPathId(fileName, outputFileName = None):
-        if outputFileName is None:
-            outFile = open(fileName, "w")
-        else:
-            outFile = open(outputFileName, "w")
-        outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")
-
-        lMatches = MatchUtils.getMatchListFromFile(fileName)
-        count = 1
-        dMatchKeyIdcount = {}
-
-        for match in lMatches:
-            key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
-            if not key_id in dMatchKeyIdcount.keys():
-                newPath = count
-                count += 1
-                dMatchKeyIdcount[ key_id ] = newPath
-            else:
-                newPath = dMatchKeyIdcount[ key_id ]
-
-            match.id = newPath
-            outFile.write(match.toString() + "\n")
-        outFile.close()