diff TEisotools-1.0/commons/core/coord/MatchUtils.py @ 6:20ec0d14798e draft
Uploaded
author: urgi-team
date:   Wed, 20 Jul 2016 05:00:24 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TEisotools-1.0/commons/core/coord/MatchUtils.py	Wed Jul 20 05:00:24 2016 -0400
@@ -0,0 +1,316 @@
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+import os
+import sys
+import math
+from commons.core.coord.Match import Match
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.checker.RepetException import RepetException
+
+## Static methods for the manipulation of Match instances
+#
+class MatchUtils (object):
+
+    ## Return a list with Match instances from the given file
+    #
+    # @param inFile name of a file in the Match format
+    # @return a list of Match instances
+    #
+    @staticmethod
+    def getMatchListFromFile(inFile):
+        lMatchInstances = []
+        inFileHandler = open(inFile, "r")
+        while True:
+            line = inFileHandler.readline()
+            if line == "":
+                break
+            if line[0:10] == "query.name":
+                continue
+            m = Match()
+            m.setFromString(line)
+            lMatchInstances.append(m)
+        inFileHandler.close()
+        return lMatchInstances
+
+    ## Split a Match list in several Match lists according to the subject
+    #
+    # @param lMatches a list of Match instances
+    # @return a dictionary which keys are subject names and values Match lists
+    #
+    @staticmethod
+    def getDictOfListsWithSubjectAsKey(lMatches):
+        dSubject2MatchList = {}
+        for iMatch in lMatches:
+            if not dSubject2MatchList.has_key(iMatch.range_subject.seqname):
+                dSubject2MatchList[ iMatch.range_subject.seqname ] = []
+            dSubject2MatchList[ iMatch.range_subject.seqname ].append(iMatch)
+        return dSubject2MatchList
+
+    ## Split a Match list in several Match lists according to the query
+    #
+    # @param lMatches a list of Match instances
+    # @return a dictionary which keys are query names and values Match lists
+    #
+    @staticmethod
+    def getDictOfListsWithQueryAsKey(lMatches):
+        dQuery2MatchList = {}
+        for iMatch in lMatches:
+            if not dQuery2MatchList.has_key(iMatch.range_query.seqname):
+                dQuery2MatchList[ iMatch.range_query.seqname ] = []
+            dQuery2MatchList[ iMatch.range_query.seqname ].append(iMatch)
+        return dQuery2MatchList
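
A minimal usage sketch for the three loaders above (not part of the changeset; the file name "matches.tab" is an assumption, and the commons package is assumed to be importable as in the rest of TEisotools):

    from commons.core.coord.MatchUtils import MatchUtils

    # load every match; the "query.name..." header line is skipped automatically
    lMatches = MatchUtils.getMatchListFromFile("matches.tab")
    # group the matches by query sequence name
    dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
    for qryName, lQryMatches in dQry2Matches.items():
        print "%s: %i match(es)" % (qryName, len(lQryMatches))
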
+    ## Write Match instances contained in the given list
+    #
+    # @param lMatches a list of Match instances
+    # @param fileName name of the file to write the Match instances
+    # @param mode the open mode of the file ""w"" or ""a""
+    #
+    @staticmethod
+    def writeListInFile(lMatches, fileName, mode = "w", header = None):
+        fileHandler = open(fileName, mode)
+        if header:
+            fileHandler.write(header)
+        for iMatch in lMatches:
+            iMatch.write(fileHandler)
+        fileHandler.close()
+
+    ## Give path id list from a list of Match instances
+    #
+    # @param lMatch list of Match instances
+    # @return lId integer list
+    #
+    @staticmethod
+    def getIdListFromMatchList(lMatch):
+        lId = []
+        for iMatch in lMatch:
+            lId.append(iMatch.id)
+        return lId
+
+    ## Remove duplicated matches in a match list
+    ## replace old PyRepet.MatchDB.rmvDoublons()
+    #
+    # @param lMatch list of Match instances
+    # @return lMatchesUniq match unique list
+    #
+    @staticmethod
+    def rmvDuplicateMatches(lMatch):
+        lMatchesUniq = []
+        for match in lMatch:
+            if len(lMatchesUniq) == 0:
+                lMatchesUniq.append(match)
+            else:
+                nbDoublons = 0
+                for m in lMatchesUniq:
+                    if match.isDoublonWith(m):
+                        nbDoublons += 1
+                if nbDoublons == 0:
+                    lMatchesUniq.append(match)
+
+        for match1 in lMatchesUniq:
+            for match2 in lMatchesUniq:
+                if match1.id != match2.id:
+                    if match1.isDoublonWith(match2):
+                        raise RepetException ("*** Error: doublon not removed")
+        return lMatchesUniq
+
+    ## Return the list of queries 'included' in subjects when two different databanks are used.
+    ## replace old pyRepet.MatchDB.filterDiffQrySbj()
+    #
+    # @param iBioseqDB bioseqDB databank of queries
+    # @param thresIdentity float identity threshold
+    # @param thresLength float length threshold
+    # @param verbose int verbosity
+    #
+    # @return lMatches match list to keep according to length and identity thresholds
+    # TODO: don't take into account match for sequence against itself. To do ?
+    @staticmethod
+    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0):
+        if verbose > 0:
+            print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength); sys.stdout.flush()
+
+        thresIdentityPerc = math.floor(thresIdentity * 100)
+        lQryToKeep = []
+        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))
+
+        for seqH in iBioseqDB.idx.keys():
+            # keep it if it has no match
+            if not dQry2Matches.has_key(seqH):
+                if seqH not in lQryToKeep:
+                    lQryToKeep.append(seqH)
+            else:
+                isConditionsMet = False
+                for match in dQry2Matches[ seqH ]:
+                    # check if they are above the thresholds
+                    if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
+                        isConditionsMet = True
+                        break
+                if not isConditionsMet and seqH not in lQryToKeep:
+                    lQryToKeep.append(seqH)
+        return lQryToKeep
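
A hedged sketch of a filterDiffQrySbj call (file name and the iQueryDB object are assumptions, not from the changeset). A query is dropped as soon as one of its matches reaches both thresholds; everything else is kept:

    from commons.core.coord.MatchUtils import MatchUtils

    # iQueryDB stands for any object exposing an 'idx' dict keyed by query
    # sequence name (e.g. a loaded commons.core.seq.BioseqDB); assumed here
    lQryToKeep = MatchUtils.filterDiffQrySbj(iQueryDB, "queries_vs_bank.tab",
                                             thresIdentity = 0.95,
                                             thresLength = 0.98,
                                             verbose = 1)
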
+    ## Count the number of distinct sequences involved in at least one match above the thresholds.
+    ## replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctQryWithThres()
+    #
+    # @param thresIdentity float identity threshold
+    # @param thresLength float length threshold
+    #
+    @staticmethod
+    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"):
+        thresIdentityPerc = math.floor(thresIdentity * 100)
+        countSbj = 0
+        if whatToCount.lower() == "query":
+            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
+        else:
+            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
+
+        for qry in dMatches.keys():
+            countMatch = 0
+            for match in dMatches[ qry ]:
+                if match.identity >= thresIdentityPerc and getattr(match, whatToCount.lower() + "_length_perc") >= thresLength:
+                    countMatch += 1
+            if countMatch > 0:
+                countSbj += 1
+        return countSbj
+
+    ## Convert a 'match' file (output from Matcher) into an 'align' file
+    ## replace old parser.tab2align
+    #
+    # @param inFileName a string input file name
+    #
+    @staticmethod
+    def convertMatchFileToAlignFile(inFileName):
+        basename = os.path.splitext(inFileName)[0]
+        outFileName = "%s.align" % basename
+        outFile = open(outFileName, "w")
+
+        lMatches = MatchUtils.getMatchListFromFile(inFileName)
+
+        for match in lMatches:
+            string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity())
+            outFile.write(string)
+
+        outFile.close()
+
+    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
+    # for each match, compute coverage on the smallest seq using this expression:
+    #
+    #               Min(|QMatchEnd - QMatchStart + 1|, |SMatchEnd - SMatchStart + 1|)
+    #    coverage = -----------------------------------------------------------------
+    #                                  Min(QLength, SLength)
+    #
+    # Use this coverage for arc value. Matches of a sequence on itself are filtered.
+    #
+    # @param matchFileName string input match file name
+    # @param outFileName string output abc file name
+    # @param coverage float coverage filter threshold
+    #
+    @staticmethod
+    def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0):
+        sSeqNames = set()
+        sSeqNamesUsed = set()
+        with open(outFileName, "w") as outF:
+            with open(matchFileName) as inF:
+                inF.readline()
+                for inLine in inF:
+                    splittedLine = inLine.split("\t")
+                    qName = splittedLine[0]
+                    sName = splittedLine[6]
+                    sSeqNames.add(qName)
+                    sSeqNames.add(sName)
+                    if qName != sName:
+                        matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1))
+                        seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10]))
+                        coverage = round(matchMin / seqMin, 2)
+                        if coverage >= coverageThreshold:
+                            outF.write("%s\n" % "\t".join([qName, sName, str(coverage)]))
+                            sSeqNamesUsed.add(qName)
+                            sSeqNamesUsed.add(sName)
+
+        with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF:
+            for seqName in sorted(sSeqNames - sSeqNamesUsed):
+                outSeqNameOfUnusedMatchesF.write("%s\n" % seqName)
+
+        if FileUtils.isEmpty(outFileName):
+            print "WARNING: '%s' is empty." % outFileName
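
A worked instance of the coverage expression above, with illustrative numbers (not taken from the changeset): for a match covering positions 1..300 of a query whose total length is 400 (query.length = 300, query.length.% = 0.75) and positions 101..400 of a subject whose total length is 600 (subject.length = 300, subject.length.% = 0.50), the code computes matchMin = min(300, 300) = 300 and seqMin = min(300/0.75, 300/0.50) = min(400, 600) = 400, so the arc value written to the abc file is round(300/400, 2) = 0.75. Note that the full sequence lengths are recovered by dividing the 'length' columns by the 'length.%' columns rather than being read directly.
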
+
+    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
+    # Use coverage on query for arc value
+    #
+    # @param matchFileName string input match file name
+    # @param outFileName string output abc file name
+    # @param coverage float query coverage filter threshold
+    #
+    @staticmethod
+    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
+        with open(matchFileName) as inF:
+            with open(outFileName, "w") as outF:
+                inF.readline()
+                for inLine in inF:
+                    splittedLine = inLine.split("\t")
+                    if float(splittedLine[4]) >= coverage:
+                        outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
+                        outLine += "\n"
+                        outF.write(outLine)
+
+        if FileUtils.isEmpty(outFileName):
+            print "WARNING: '%s' is empty." % outFileName
+
+    ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines.
+    ## replace old parser.tabnum2id
+    #
+    # @param fileName a string input file name
+    # @param outputFileName a string output file name (optional)
+    #
+    @staticmethod
+    def generateMatchFileWithNewPathId(fileName, outputFileName = None):
+        if outputFileName is None:
+            outFile = open(fileName, "w")
+        else:
+            outFile = open(outputFileName, "w")
+        outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")
+
+        lMatches = MatchUtils.getMatchListFromFile(fileName)
+        count = 1
+        dMatchKeyIdcount = {}
+
+        for match in lMatches:
+            key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
+            if not key_id in dMatchKeyIdcount.keys():
+                newPath = count
+                count += 1
+                dMatchKeyIdcount[ key_id ] = newPath
+            else:
+                newPath = dMatchKeyIdcount[ key_id ]
+
+            match.id = newPath
+            outFile.write(match.toString() + "\n")
+        outFile.close()
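
An end-to-end sketch of the MCL preparation path using the two methods above (file names and the coverage threshold are assumptions, not from the changeset):

    from commons.core.coord.MatchUtils import MatchUtils

    # renumber path IDs after concatenating several Match files,
    # then keep only matches covering >= 90% of their query as MCL arcs
    MatchUtils.generateMatchFileWithNewPathId("concatenated.tab", "concatenated_newId.tab")
    MatchUtils.convertMatchFileIntoABCFileOnQueryCoverage("concatenated_newId.tab", "matches.abc", coverage = 0.9)

One caveat worth noting: when generateMatchFileWithNewPathId is called without outputFileName, it reopens fileName in "w" mode (truncating it) before getMatchListFromFile reads it back, so the in-place mode empties its own input; passing an explicit outputFileName, as in the sketch, avoids this.
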