Mercurial > repos > yufei-luo > s_mart
view commons/core/coord/MatchUtils.py @ 69:1473ab954708 draft
Corrected bug in "CollapsedReads" XML file.
author | m-zytnicki |
---|---|
date | Wed, 18 Nov 2015 10:59:02 -0500 |
parents | 769e306b7933 |
children |
line wrap: on
line source
# Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. import math import os import sys from commons.core.coord.Match import Match from commons.core.checker.RepetException import RepetException ## Static methods for the manipulation of Match instances # class MatchUtils ( object ): ## Return a list with Match instances from the given file # # @param inFile name of a file in the Match format # @return a list of Match instances # def getMatchListFromFile(inFile ): lMatchInstances = [] inFileHandler = open( inFile, "r" ) while True: line = inFileHandler.readline() if line == "": break if line[0:10] == "query.name": continue m = Match() m.setFromString( line ) lMatchInstances.append( m ) inFileHandler.close() return lMatchInstances getMatchListFromFile = staticmethod( getMatchListFromFile ) ## Split a Match list in several Match lists according to the subject # # @param lMatches a list of Match instances # @return a dictionary which keys are subject names and values Match lists # def getDictOfListsWithSubjectAsKey( lMatches ): dSubject2MatchList = {} for iMatch in lMatches: if not dSubject2MatchList.has_key( iMatch.range_subject.seqname ): dSubject2MatchList[ iMatch.range_subject.seqname ] = [] dSubject2MatchList[ iMatch.range_subject.seqname ].append( iMatch ) return dSubject2MatchList getDictOfListsWithSubjectAsKey = staticmethod( getDictOfListsWithSubjectAsKey ) ## Split a Match list in several Match lists according to the query # # @param lMatches a list of Match instances # @return a dictionary which keys are query names and values Match lists # def getDictOfListsWithQueryAsKey ( lMatches ): dQuery2MatchList = {} for iMatch in lMatches: if not dQuery2MatchList.has_key( iMatch.range_query.seqname ): dQuery2MatchList[ iMatch.range_query.seqname ] = [] dQuery2MatchList[ iMatch.range_query.seqname ].append( iMatch ) return dQuery2MatchList getDictOfListsWithQueryAsKey = staticmethod( getDictOfListsWithQueryAsKey ) ## Write Match instances contained in the given list # # @param lMatches a list of Match instances # @param fileName name of the file to write the Match instances # @param mode the open mode of the file ""w"" or ""a"" # def writeListInFile( lMatches, fileName, mode="w", header=None ): fileHandler = open( fileName, mode ) if header: fileHandler.write( header ) for iMatch in lMatches: iMatch.write( fileHandler ) fileHandler.close() writeListInFile = staticmethod( writeListInFile ) ## Give path id list from a list of Match instances # # @param lMatch list of Match instances # # @return lId integer list # def getIdListFromMatchList(lMatch): lId = [] for iMatch in lMatch: lId.append(iMatch.id) return lId getIdListFromMatchList = staticmethod(getIdListFromMatchList) ## Remove duplicated matches in a match list # ## replace old PyRepet.MatchDB.rmvDoublons() # @param lMatch list of Match instances # # @return lMatchesUniq match unique list # def rmvDuplicateMatches(lMatch): lMatchesUniq = [] for match in lMatch: if len(lMatchesUniq) == 0: lMatchesUniq.append( match ) else: nbDoublons = 0 for m in lMatchesUniq: if match.isDoublonWith( m ): nbDoublons += 1 if nbDoublons == 0: lMatchesUniq.append( match ) for match1 in lMatchesUniq: for match2 in lMatchesUniq: if match1.id != match2.id: if match1.isDoublonWith( match2 ): raise RepetException ( "*** Error: doublon not removed" ) return lMatchesUniq rmvDuplicateMatches = staticmethod(rmvDuplicateMatches) ## Return the list of queries 'included' in subjects when two different databanks are used. ##replace old pyRepet.MatchDB.filterDiffQrySbj() # # @param iBioseqDB bioseqDB databank of queries # # @param thresIdentity float identity threshold # # @param thresLength float length threshold # # @param verbose int verbosity # # @return lMatches match list to keep according to length and identity thresholds #TODO: don't take into account match for sequence against itself. To do ? def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity=0.95, thresLength=0.98, verbose=0 ): if verbose > 0: print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % ( thresIdentity, thresLength ); sys.stdout.flush() thresIdentityPerc = math.floor( thresIdentity*100 ) lQryToKeep = [] dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile)) for seqH in iBioseqDB.idx.keys(): # keep it if it has no match if not dQry2Matches.has_key( seqH ): if seqH not in lQryToKeep: lQryToKeep.append( seqH ) else: isConditionsMet = False for match in dQry2Matches[ seqH ]: # check if they are above the thresholds if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength: isConditionsMet = True break if not isConditionsMet and seqH not in lQryToKeep: lQryToKeep.append( seqH ) return lQryToKeep filterDiffQrySbj = staticmethod(filterDiffQrySbj) ## Count the number of distinct matches involved in at least one match above the thresholds. ##replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() # @param thresIdentity float identity threshold # # @param thresLength float length threshold # def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity=0.95, thresLength=0.98, whatToCount="query" ): thresIdentityPerc = math.floor( thresIdentity*100 ) countSbj = 0 if whatToCount.lower() == "query": dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches) else: dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches) for qry in dMatches.keys(): countMatch = 0 for match in dMatches[ qry ]: if match.identity >= thresIdentityPerc and getattr(match,whatToCount.lower() +"_length_perc") >= thresLength: countMatch += 1 if countMatch > 0: countSbj += 1 return countSbj getNbDistinctSequencesInsideMatchesWithThresh = staticmethod(getNbDistinctSequencesInsideMatchesWithThresh) ## Convert a 'match' file (output from Matcher) into an 'align' file ## replace old parser.tab2align # # @param inFileName a string input file name # def convertMatchFileToAlignFile(inFileName): basename = os.path.splitext(inFileName)[0] outFileName = "%s.align" % basename outFile = open(outFileName, "w") lMatches = MatchUtils.getMatchListFromFile(inFileName) for match in lMatches: string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity() ) outFile.write( string ) outFile.close() convertMatchFileToAlignFile = staticmethod(convertMatchFileToAlignFile) ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) # Use coverage on query for arc value # # @param matchFileName string input match file name # @param outFileName string output abc file name # @param coverage float query coverage filter threshold # @staticmethod def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0): with open(matchFileName) as inF: with open(outFileName, "w") as outF: inF.readline() inLine = inF.readline() while inLine: splittedLine = inLine.split("\t") if float(splittedLine[4]) >= coverage: outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]]) outLine += "\n" outF.write(outLine) inLine = inF.readline() ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines. ## replace old parser.tabnum2id # # @param fileName a string input file name # @param outputFileName a string output file name (optional) # def generateMatchFileWithNewPathId(fileName, outputFileName=None): if outputFileName is None: outFile = open(fileName, "w") else: outFile = open(outputFileName, "w") outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n") lMatches = MatchUtils.getMatchListFromFile(fileName) count = 1 dMatchKeyIdcount = {} for match in lMatches: key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName() if not key_id in dMatchKeyIdcount.keys(): newPath = count count += 1 dMatchKeyIdcount[ key_id ] = newPath else: newPath = dMatchKeyIdcount[ key_id ] match.id = newPath outFile.write( match.toString()+"\n" ) outFile.close() generateMatchFileWithNewPathId = staticmethod(generateMatchFileWithNewPathId)