Mercurial > repos > yufei-luo > s_mart
diff commons/core/coord/MatchUtils.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/MatchUtils.py Fri Jan 18 04:54:14 2013 -0500 @@ -0,0 +1,288 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import math +import os +import sys +from commons.core.coord.Match import Match +from commons.core.checker.RepetException import RepetException + +## Static methods for the manipulation of Match instances +# +class MatchUtils ( object ): + + ## Return a list with Match instances from the given file + # + # @param inFile name of a file in the Match format + # @return a list of Match instances + # + def getMatchListFromFile(inFile ): + lMatchInstances = [] + inFileHandler = open( inFile, "r" ) + while True: + line = inFileHandler.readline() + if line == "": + break + if line[0:10] == "query.name": + continue + m = Match() + m.setFromString( line ) + lMatchInstances.append( m ) + inFileHandler.close() + return lMatchInstances + + getMatchListFromFile = staticmethod( getMatchListFromFile ) + + ## Split a Match list in several Match lists according to the subject + # + # @param lMatches a list of Match instances + # @return a dictionary which keys are subject names and values Match lists + # + def getDictOfListsWithSubjectAsKey( lMatches ): + dSubject2MatchList = {} + for iMatch in lMatches: + if not dSubject2MatchList.has_key( iMatch.range_subject.seqname ): + dSubject2MatchList[ iMatch.range_subject.seqname ] = [] + dSubject2MatchList[ iMatch.range_subject.seqname ].append( iMatch ) + return dSubject2MatchList + + getDictOfListsWithSubjectAsKey = staticmethod( getDictOfListsWithSubjectAsKey ) + + ## Split a Match list in several Match lists according to the query + # + # @param lMatches a list of Match instances + # @return a dictionary which keys are query names and values Match lists + # + def getDictOfListsWithQueryAsKey ( lMatches ): + dQuery2MatchList = {} + for iMatch in lMatches: + if not dQuery2MatchList.has_key( iMatch.range_query.seqname ): + dQuery2MatchList[ iMatch.range_query.seqname ] = [] + dQuery2MatchList[ iMatch.range_query.seqname ].append( iMatch ) + return dQuery2MatchList + + getDictOfListsWithQueryAsKey = staticmethod( getDictOfListsWithQueryAsKey ) + + ## Write Match instances contained in the given list + # + # @param lMatches a list of Match instances + # @param fileName name of the file to write the Match instances + # @param mode the open mode of the file ""w"" or ""a"" + # + def writeListInFile( lMatches, fileName, mode="w", header=None ): + fileHandler = open( fileName, mode ) + if header: + fileHandler.write( header ) + for iMatch in lMatches: + iMatch.write( fileHandler ) + fileHandler.close() + + writeListInFile = staticmethod( writeListInFile ) + + ## Give path id list from a list of Match instances + # + # @param lMatch list of Match instances + # + # @return lId integer list + # + def getIdListFromMatchList(lMatch): + lId = [] + for iMatch in lMatch: + lId.append(iMatch.id) + return lId + + getIdListFromMatchList = staticmethod(getIdListFromMatchList) + + ## Remove duplicated matches in a match list + # ## replace old PyRepet.MatchDB.rmvDoublons() + # @param lMatch list of Match instances + # + # @return lMatchesUniq match unique list + # + def rmvDuplicateMatches(lMatch): + lMatchesUniq = [] + for match in lMatch: + if len(lMatchesUniq) == 0: + lMatchesUniq.append( match ) + else: + nbDoublons = 0 + for m in lMatchesUniq: + if match.isDoublonWith( m ): + nbDoublons += 1 + if nbDoublons == 0: + lMatchesUniq.append( match ) + + for match1 in lMatchesUniq: + for match2 in lMatchesUniq: + if match1.id != match2.id: + if match1.isDoublonWith( match2 ): + raise RepetException ( "*** Error: doublon not removed" ) + return lMatchesUniq + + rmvDuplicateMatches = staticmethod(rmvDuplicateMatches) + + ## Return the list of queries 'included' in subjects when two different databanks are used. + ##replace old pyRepet.MatchDB.filterDiffQrySbj() + # + # @param iBioseqDB bioseqDB databank of queries + # + # @param thresIdentity float identity threshold + # + # @param thresLength float length threshold + # + # @param verbose int verbosity + # + # @return lMatches match list to keep according to length and identity thresholds + #TODO: don't take into account match for sequence against itself. To do ? + def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity=0.95, thresLength=0.98, verbose=0 ): + if verbose > 0: + print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % ( thresIdentity, thresLength ); sys.stdout.flush() + + thresIdentityPerc = math.floor( thresIdentity*100 ) + lQryToKeep = [] + dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile)) + + for seqH in iBioseqDB.idx.keys(): + # keep it if it has no match + if not dQry2Matches.has_key( seqH ): + if seqH not in lQryToKeep: + lQryToKeep.append( seqH ) + else: + isConditionsMet = False + for match in dQry2Matches[ seqH ]: + # check if they are above the thresholds + if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength: + isConditionsMet = True + break + if not isConditionsMet and seqH not in lQryToKeep: + lQryToKeep.append( seqH ) + return lQryToKeep + + filterDiffQrySbj = staticmethod(filterDiffQrySbj) + + ## Count the number of distinct matches involved in at least one match above the thresholds. + ##replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() + # @param thresIdentity float identity threshold + # + # @param thresLength float length threshold + # + def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity=0.95, thresLength=0.98, whatToCount="query" ): + thresIdentityPerc = math.floor( thresIdentity*100 ) + countSbj = 0 + if whatToCount.lower() == "query": + dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches) + else: + dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches) + + for qry in dMatches.keys(): + countMatch = 0 + for match in dMatches[ qry ]: + + if match.identity >= thresIdentityPerc and getattr(match,whatToCount.lower() +"_length_perc") >= thresLength: + countMatch += 1 + if countMatch > 0: + countSbj += 1 + return countSbj + + getNbDistinctSequencesInsideMatchesWithThresh = staticmethod(getNbDistinctSequencesInsideMatchesWithThresh) + + ## Convert a 'match' file (output from Matcher) into an 'align' file + ## replace old parser.tab2align + # + # @param inFileName a string input file name + # + def convertMatchFileToAlignFile(inFileName): + basename = os.path.splitext(inFileName)[0] + outFileName = "%s.align" % basename + outFile = open(outFileName, "w") + + lMatches = MatchUtils.getMatchListFromFile(inFileName) + + for match in lMatches: + string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity() ) + outFile.write( string ) + + outFile.close() + + convertMatchFileToAlignFile = staticmethod(convertMatchFileToAlignFile) + + ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) + # Use coverage on query for arc value + # + # @param matchFileName string input match file name + # @param outFileName string output abc file name + # @param coverage float query coverage filter threshold + # + @staticmethod + def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0): + with open(matchFileName) as inF: + with open(outFileName, "w") as outF: + inF.readline() + inLine = inF.readline() + while inLine: + splittedLine = inLine.split("\t") + if float(splittedLine[4]) >= coverage: + outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]]) + outLine += "\n" + outF.write(outLine) + inLine = inF.readline() + + ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines. + ## replace old parser.tabnum2id + # + # @param fileName a string input file name + # @param outputFileName a string output file name (optional) + # + def generateMatchFileWithNewPathId(fileName, outputFileName=None): + if outputFileName is None: + outFile = open(fileName, "w") + else: + outFile = open(outputFileName, "w") + outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n") + + lMatches = MatchUtils.getMatchListFromFile(fileName) + count = 1 + dMatchKeyIdcount = {} + + for match in lMatches: + key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName() + if not key_id in dMatchKeyIdcount.keys(): + newPath = count + count += 1 + dMatchKeyIdcount[ key_id ] = newPath + else: + newPath = dMatchKeyIdcount[ key_id ] + + match.id = newPath + outFile.write( match.toString()+"\n" ) + outFile.close() + + generateMatchFileWithNewPathId = staticmethod(generateMatchFileWithNewPathId) + \ No newline at end of file