TEisotools-1.0/commons/core/coord/MatchUtils.py
changeset 6:20ec0d14798e (draft), "Uploaded", urgi-team, Wed, 20 Jul 2016 05:00:24 -0400
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import os
import sys
import math

from commons.core.coord.Match import Match
from commons.core.utils.FileUtils import FileUtils
from commons.core.checker.RepetException import RepetException


#
# Static methods for the manipulation of Match instances
#
class MatchUtils(object):

    #
    # Return a list with Match instances from the given file
    #
    # @param inFile name of a file in the Match format
    # @return a list of Match instances
    #
    @staticmethod
    def getMatchListFromFile(inFile):
        lMatchInstances = []
        inFileHandler = open(inFile, "r")
        while True:
            line = inFileHandler.readline()
            if line == "":
                break
            if line[0:10] == "query.name":
                continue
            m = Match()
            m.setFromString(line)
            lMatchInstances.append(m)
        inFileHandler.close()
        return lMatchInstances

    #
    # Split a Match list in several Match lists according to the subject
    #
    # @param lMatches a list of Match instances
    # @return a dictionary whose keys are subject names and values are Match lists
    #
    @staticmethod
    def getDictOfListsWithSubjectAsKey(lMatches):
        dSubject2MatchList = {}
        for iMatch in lMatches:
            if not dSubject2MatchList.has_key(iMatch.range_subject.seqname):
                dSubject2MatchList[iMatch.range_subject.seqname] = []
            dSubject2MatchList[iMatch.range_subject.seqname].append(iMatch)
        return dSubject2MatchList

    #
    # Split a Match list in several Match lists according to the query
    #
    # @param lMatches a list of Match instances
    # @return a dictionary whose keys are query names and values are Match lists
    #
    @staticmethod
    def getDictOfListsWithQueryAsKey(lMatches):
        dQuery2MatchList = {}
        for iMatch in lMatches:
            if not dQuery2MatchList.has_key(iMatch.range_query.seqname):
                dQuery2MatchList[iMatch.range_query.seqname] = []
            dQuery2MatchList[iMatch.range_query.seqname].append(iMatch)
        return dQuery2MatchList
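    # Usage sketch for the helpers above ("crossmatch.tab" is a hypothetical file name):
    #   lMatches = MatchUtils.getMatchListFromFile("crossmatch.tab")
    #   dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
    # dQry2Matches then maps each query name to the list of its Match instances.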
    #
    # Write Match instances contained in the given list
    #
    # @param lMatches a list of Match instances
    # @param fileName name of the file to write the Match instances
    # @param mode the open mode of the file, "w" or "a"
    # @param header optional header line written before the matches
    #
    @staticmethod
    def writeListInFile(lMatches, fileName, mode = "w", header = None):
        fileHandler = open(fileName, mode)
        if header:
            fileHandler.write(header)
        for iMatch in lMatches:
            iMatch.write(fileHandler)
        fileHandler.close()

    #
    # Give the path id list from a list of Match instances
    #
    # @param lMatch list of Match instances
    # @return lId integer list
    #
    @staticmethod
    def getIdListFromMatchList(lMatch):
        lId = []
        for iMatch in lMatch:
            lId.append(iMatch.id)
        return lId

    #
    # Remove duplicated matches in a match list
    #
    # replace old PyRepet.MatchDB.rmvDoublons()
    #
    # @param lMatch list of Match instances
    # @return lMatchesUniq list of unique matches
    #
    @staticmethod
    def rmvDuplicateMatches(lMatch):
        lMatchesUniq = []
        for match in lMatch:
            if len(lMatchesUniq) == 0:
                lMatchesUniq.append(match)
            else:
                nbDoublons = 0
                for m in lMatchesUniq:
                    if match.isDoublonWith(m):
                        nbDoublons += 1
                if nbDoublons == 0:
                    lMatchesUniq.append(match)
        for match1 in lMatchesUniq:
            for match2 in lMatchesUniq:
                if match1.id != match2.id:
                    if match1.isDoublonWith(match2):
                        raise RepetException("*** Error: doublon not removed")
        return lMatchesUniq

    #
    # Filter out the queries 'included' in subjects when two different databanks are used,
    # and return the names of the remaining queries (those without any match above the thresholds).
    #
    # replace old pyRepet.MatchDB.filterDiffQrySbj()
    #
    # @param iBioseqDB bioseqDB databank of queries
    # @param matchFile name of a file in the Match format
    # @param thresIdentity float identity threshold
    # @param thresLength float length threshold
    # @param verbose int verbosity
    #
    # @return lQryToKeep list of query names to keep according to the length and identity thresholds
    #
    # TODO: don't take into account matches of a sequence against itself?
    #
    @staticmethod
    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0):
        if verbose > 0:
            print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength)
            sys.stdout.flush()
        thresIdentityPerc = math.floor(thresIdentity * 100)
        lQryToKeep = []
        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))
        for seqH in iBioseqDB.idx.keys():
            # keep it if it has no match
            if not dQry2Matches.has_key(seqH):
                if seqH not in lQryToKeep:
                    lQryToKeep.append(seqH)
            else:
                isConditionsMet = False
                for match in dQry2Matches[seqH]:
                    # check if they are above the thresholds
                    if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
                        isConditionsMet = True
                        break
                if not isConditionsMet and seqH not in lQryToKeep:
                    lQryToKeep.append(seqH)
        return lQryToKeep
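    # Note on the two thresholds used above: thresIdentity is rescaled to a
    # 0-100 value (thresIdentityPerc) before being compared to match.identity,
    # whereas thresLength is compared as-is to match.query_length_perc. With
    # the default values, a query is therefore filtered out as soon as one of
    # its matches has identity >= 95 and query.length.% >= 0.98.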
    #
    # Count the number of distinct sequences (queries or subjects) involved in at least one match above the thresholds.
    #
    # replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres()
    #
    # @param thresIdentity float identity threshold
    # @param thresLength float length threshold
    #
    @staticmethod
    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"):
        thresIdentityPerc = math.floor(thresIdentity * 100)
        countSbj = 0
        if whatToCount.lower() == "query":
            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
        else:
            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
        for qry in dMatches.keys():
            countMatch = 0
            for match in dMatches[qry]:
                if match.identity >= thresIdentityPerc and getattr(match, whatToCount.lower() + "_length_perc") >= thresLength:
                    countMatch += 1
            if countMatch > 0:
                countSbj += 1
        return countSbj

    #
    # Convert a 'match' file (output from Matcher) into an 'align' file
    #
    # replace old parser.tab2align
    #
    # @param inFileName a string input file name
    #
    @staticmethod
    def convertMatchFileToAlignFile(inFileName):
        basename = os.path.splitext(inFileName)[0]
        outFileName = "%s.align" % basename
        outFile = open(outFileName, "w")
        lMatches = MatchUtils.getMatchListFromFile(inFileName)
        for match in lMatches:
            string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity())
            outFile.write(string)
        outFile.close()

    #
    # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file).
    # For each match, compute the coverage on the smallest sequence using this expression
    # (a worked example follows this method):
    #
    #               Min(|QMatchEnd - QMatchStart + 1|, |SMatchEnd - SMatchStart + 1|)
    #    coverage = -----------------------------------------------------------------
    #                                  Min(QLength, SLength)
    #
    # Use this coverage for the arc value. Matches of a sequence on itself are filtered.
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverageThreshold float coverage filter threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0):
        sSeqNames = set()
        sSeqNamesUsed = set()
        with open(outFileName, "w") as outF:
            with open(matchFileName) as inF:
                inF.readline()
                for inLine in inF:
                    splittedLine = inLine.split("\t")
                    qName = splittedLine[0]
                    sName = splittedLine[6]
                    sSeqNames.add(qName)
                    sSeqNames.add(sName)
                    if qName != sName:
                        matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1))
                        seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10]))
                        coverage = round(matchMin / seqMin, 2)
                        if coverage >= coverageThreshold:
                            outF.write("%s\n" % "\t".join([qName, sName, str(coverage)]))
                            sSeqNamesUsed.add(qName)
                            sSeqNamesUsed.add(sName)
        with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF:
            for seqName in sorted(sSeqNames - sSeqNamesUsed):
                outSeqNameOfUnusedMatchesF.write("%s\n" % seqName)
        if FileUtils.isEmpty(outFileName):
            print "WARNING: '%s' is empty." % outFileName
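    # Worked example of the coverage expression above, with illustrative values:
    # a match spanning 1..300 on the query (query.length = 300, query.length.% = 0.3,
    # hence full query length 300 / 0.3 = 1000) and 101..500 on the subject
    # (subject.length = 400, subject.length.% = 0.8, hence full subject length
    # 400 / 0.8 = 500) gives:
    #   matchMin = min(300, 400) = 300
    #   seqMin   = min(1000, 500) = 500
    #   coverage = round(300 / 500.0, 2) = 0.6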
    #
    # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file).
    # Use the coverage on the query for the arc value.
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverage float query coverage filter threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
        with open(matchFileName) as inF:
            with open(outFileName, "w") as outF:
                inF.readline()
                for inLine in inF:
                    splittedLine = inLine.split("\t")
                    if float(splittedLine[4]) >= coverage:
                        outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
                        outLine += "\n"
                        outF.write(outLine)
        if FileUtils.isEmpty(outFileName):
            print "WARNING: '%s' is empty." % outFileName

    #
    # Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines.
    #
    # replace old parser.tabnum2id
    #
    # @param fileName a string input file name
    # @param outputFileName a string output file name (optional)
    #
    @staticmethod
    def generateMatchFileWithNewPathId(fileName, outputFileName = None):
        # read the matches before opening the output file, which may be the input file itself
        lMatches = MatchUtils.getMatchListFromFile(fileName)
        if outputFileName is None:
            outFile = open(fileName, "w")
        else:
            outFile = open(outputFileName, "w")
        outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")
        count = 1
        dMatchKeyIdcount = {}
        for match in lMatches:
            key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
            if not key_id in dMatchKeyIdcount.keys():
                newPath = count
                count += 1
                dMatchKeyIdcount[key_id] = newPath
            else:
                newPath = dMatchKeyIdcount[key_id]
            match.id = newPath
            outFile.write(match.toString() + "\n")
        outFile.close()
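

# Minimal usage sketch, assuming an existing Matcher output file; the file names
# below ("matches.tab", "matches.abc") are hypothetical and only illustrate how
# the static methods are meant to be combined.
if __name__ == "__main__":
    lMatches = MatchUtils.getMatchListFromFile("matches.tab")
    print "%i matches loaded" % len(lMatches)
    dSbj2Matches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
    print "%i distinct subject sequences" % len(dSbj2Matches)
    # write an MCL input file, keeping only arcs with query coverage >= 0.5
    MatchUtils.convertMatchFileIntoABCFileOnQueryCoverage("matches.tab", "matches.abc", 0.5)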