Mercurial > repos > urgi-team > teiso
diff TEisotools-1.1.a/commons/core/coord/MatchUtils.py @ 16:836ce3d9d47a draft default tip
Uploaded
author | urgi-team |
---|---|
date | Thu, 21 Jul 2016 07:42:47 -0400 |
parents | 255c852351c5 |
children |
line wrap: on
line diff
--- a/TEisotools-1.1.a/commons/core/coord/MatchUtils.py Thu Jul 21 07:36:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,316 +0,0 @@ -# Copyright INRA (Institut National de la Recherche Agronomique) -# http://www.inra.fr -# http://urgi.versailles.inra.fr -# -# This software is governed by the CeCILL license under French law and -# abiding by the rules of distribution of free software. You can use, -# modify and/ or redistribute the software under the terms of the CeCILL -# license as circulated by CEA, CNRS and INRIA at the following URL -# "http://www.cecill.info". -# -# As a counterpart to the access to the source code and rights to copy, -# modify and redistribute granted by the license, users are provided only -# with a limited warranty and the software's author, the holder of the -# economic rights, and the successive licensors have only limited -# liability. -# -# In this respect, the user's attention is drawn to the risks associated -# with loading, using, modifying and/or developing or reproducing the -# software by the user in light of its specific status of free software, -# that may mean that it is complicated to manipulate, and that also -# therefore means that it is reserved for developers and experienced -# professionals having in-depth computer knowledge. Users are therefore -# encouraged to load and test the software's suitability as regards their -# requirements in conditions enabling the security of their systems and/or -# data to be ensured and, more generally, to use and operate it in the -# same conditions as regards security. -# -# The fact that you are presently reading this means that you have had -# knowledge of the CeCILL license and that you accept its terms. - -import os -import sys -import math -from commons.core.coord.Match import Match -from commons.core.utils.FileUtils import FileUtils -from commons.core.checker.RepetException import RepetException - -# # Static methods for the manipulation of Match instances -# -class MatchUtils (object): - - # # Return a list with Match instances from the given file - # - # @param inFile name of a file in the Match format - # @return a list of Match instances - # - @staticmethod - def getMatchListFromFile(inFile): - lMatchInstances = [] - inFileHandler = open(inFile, "r") - while True: - line = inFileHandler.readline() - if line == "": - break - if line[0:10] == "query.name": - continue - m = Match() - m.setFromString(line) - lMatchInstances.append(m) - inFileHandler.close() - return lMatchInstances - - # # Split a Match list in several Match lists according to the subject - # - # @param lMatches a list of Match instances - # @return a dictionary which keys are subject names and values Match lists - # - @staticmethod - def getDictOfListsWithSubjectAsKey(lMatches): - dSubject2MatchList = {} - for iMatch in lMatches: - if not dSubject2MatchList.has_key(iMatch.range_subject.seqname): - dSubject2MatchList[ iMatch.range_subject.seqname ] = [] - dSubject2MatchList[ iMatch.range_subject.seqname ].append(iMatch) - return dSubject2MatchList - - # # Split a Match list in several Match lists according to the query - # - # @param lMatches a list of Match instances - # @return a dictionary which keys are query names and values Match lists - # - @staticmethod - def getDictOfListsWithQueryAsKey (lMatches): - dQuery2MatchList = {} - for iMatch in lMatches: - if not dQuery2MatchList.has_key(iMatch.range_query.seqname): - dQuery2MatchList[ iMatch.range_query.seqname ] = [] - dQuery2MatchList[ iMatch.range_query.seqname ].append(iMatch) - return dQuery2MatchList - - # # Write Match instances contained in the given list - # - # @param lMatches a list of Match instances - # @param fileName name of the file to write the Match instances - # @param mode the open mode of the file ""w"" or ""a"" - # - @staticmethod - def writeListInFile(lMatches, fileName, mode = "w", header = None): - fileHandler = open(fileName, mode) - if header: - fileHandler.write(header) - for iMatch in lMatches: - iMatch.write(fileHandler) - fileHandler.close() - - # # Give path id list from a list of Match instances - # - # @param lMatch list of Match instances - # @return lId integer list - # - @staticmethod - def getIdListFromMatchList(lMatch): - lId = [] - for iMatch in lMatch: - lId.append(iMatch.id) - return lId - - # # Remove duplicated matches in a match list - # ## replace old PyRepet.MatchDB.rmvDoublons() - # - # @param lMatch list of Match instances - # @return lMatchesUniq match unique list - # - @staticmethod - def rmvDuplicateMatches(lMatch): - lMatchesUniq = [] - for match in lMatch: - if len(lMatchesUniq) == 0: - lMatchesUniq.append(match) - else: - nbDoublons = 0 - for m in lMatchesUniq: - if match.isDoublonWith(m): - nbDoublons += 1 - if nbDoublons == 0: - lMatchesUniq.append(match) - - for match1 in lMatchesUniq: - for match2 in lMatchesUniq: - if match1.id != match2.id: - if match1.isDoublonWith(match2): - raise RepetException ("*** Error: doublon not removed") - return lMatchesUniq - - # # Return the list of queries 'included' in subjects when two different databanks are used. - # #replace old pyRepet.MatchDB.filterDiffQrySbj() - # - # @param iBioseqDB bioseqDB databank of queries - # @param thresIdentity float identity threshold - # @param thresLength float length threshold - # @param verbose int verbosity - # - # @return lMatches match list to keep according to length and identity thresholds - # TODO: don't take into account match for sequence against itself. To do ? - @staticmethod - def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0): - if verbose > 0: - print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength); sys.stdout.flush() - - thresIdentityPerc = math.floor(thresIdentity * 100) - lQryToKeep = [] - dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile)) - - for seqH in iBioseqDB.idx.keys(): - # keep it if it has no match - if not dQry2Matches.has_key(seqH): - if seqH not in lQryToKeep: - lQryToKeep.append(seqH) - else: - isConditionsMet = False - for match in dQry2Matches[ seqH ]: - # check if they are above the thresholds - if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength: - isConditionsMet = True - break - if not isConditionsMet and seqH not in lQryToKeep: - lQryToKeep.append(seqH) - return lQryToKeep - - # # Count the number of distinct matches involved in at least one match above the thresholds. - # #replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() - # - # @param thresIdentity float identity threshold - # @param thresLength float length threshold - # - @staticmethod - def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"): - thresIdentityPerc = math.floor(thresIdentity * 100) - countSbj = 0 - if whatToCount.lower() == "query": - dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches) - else: - dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches) - - for qry in dMatches.keys(): - countMatch = 0 - for match in dMatches[ qry ]: - - if match.identity >= thresIdentityPerc and getattr(match, whatToCount.lower() + "_length_perc") >= thresLength: - countMatch += 1 - if countMatch > 0: - countSbj += 1 - return countSbj - - # # Convert a 'match' file (output from Matcher) into an 'align' file - # # replace old parser.tab2align - # - # @param inFileName a string input file name - # - @staticmethod - def convertMatchFileToAlignFile(inFileName): - basename = os.path.splitext(inFileName)[0] - outFileName = "%s.align" % basename - outFile = open(outFileName, "w") - - lMatches = MatchUtils.getMatchListFromFile(inFileName) - - for match in lMatches: - string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity()) - outFile.write(string) - - outFile.close() - - # # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) - # for each match, compute coverage on the smallest seq using this expression - # Min (|QMatchEnd - QMatchStart + 1|,|SMatchEnd - SMatchStart + 1|) - #coverage = ----------------------------------------------------------------- - # Min (QLength,SLength) - # - # Use this coverage for arc value. Matches of a sequence on itself are filtered. - # - # @param matchFileName string input match file name - # @param outFileName string output abc file name - # @param coverage float coverage filter threshold - # - @staticmethod - def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0): - sSeqNames = set() - sSeqNamesUsed = set() - with open(outFileName, "w") as outF: - with open(matchFileName) as inF: - inF.readline() - for inLine in inF: - splittedLine = inLine.split("\t") - qName = splittedLine[0] - sName = splittedLine[6] - sSeqNames.add(qName) - sSeqNames.add(sName) - if qName != sName: - matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1)) - seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10])) - coverage = round(matchMin / seqMin, 2) - if coverage >= coverageThreshold: - outF.write("%s\n" % "\t".join([qName, sName, str(coverage)])) - sSeqNamesUsed.add(qName) - sSeqNamesUsed.add(sName) - - with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF: - for seqName in sorted(sSeqNames - sSeqNamesUsed): - outSeqNameOfUnusedMatchesF.write("%s\n" % seqName) - - if FileUtils.isEmpty(outFileName): - print "WARNING: '%s' is empty." % outFileName - - # # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) - # Use coverage on query for arc value - # - # @param matchFileName string input match file name - # @param outFileName string output abc file name - # @param coverage float query coverage filter threshold - # - @staticmethod - def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0): - with open(matchFileName) as inF: - with open(outFileName, "w") as outF: - inF.readline() - for inLine in inF: - splittedLine = inLine.split("\t") - if float(splittedLine[4]) >= coverage: - outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]]) - outLine += "\n" - outF.write(outLine) - - if FileUtils.isEmpty(outFileName): - print "WARNING: '%s' is empty." % outFileName - - # # Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines. - # # replace old parser.tabnum2id - # - # @param fileName a string input file name - # @param outputFileName a string output file name (optional) - # - @staticmethod - def generateMatchFileWithNewPathId(fileName, outputFileName = None): - if outputFileName is None: - outFile = open(fileName, "w") - else: - outFile = open(outputFileName, "w") - outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n") - - lMatches = MatchUtils.getMatchListFromFile(fileName) - count = 1 - dMatchKeyIdcount = {} - - for match in lMatches: - key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName() - if not key_id in dMatchKeyIdcount.keys(): - newPath = count - count += 1 - dMatchKeyIdcount[ key_id ] = newPath - else: - newPath = dMatchKeyIdcount[ key_id ] - - match.id = newPath - outFile.write(match.toString() + "\n") - outFile.close()