TEisotools-1.1.a/commons/core/coord/MatchUtils.py @ 16:836ce3d9d47a (draft, default, tip)

author:      urgi-team
date:        Thu, 21 Jul 2016 07:42:47 -0400
parents:     255c852351c5
children:    (none)
description: Uploaded

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import os
import sys
import math
from commons.core.coord.Match import Match
from commons.core.utils.FileUtils import FileUtils
from commons.core.checker.RepetException import RepetException

## Static methods for the manipulation of Match instances
#
class MatchUtils (object):

    ## Return a list with Match instances from the given file
    #
    # @param inFile name of a file in the Match format
    # @return a list of Match instances
    #
    @staticmethod
    def getMatchListFromFile(inFile):
        lMatchInstances = []
        inFileHandler = open(inFile, "r")
        while True:
            line = inFileHandler.readline()
            if line == "":
                break
            if line[0:10] == "query.name":
                continue
            m = Match()
            m.setFromString(line)
            lMatchInstances.append(m)
        inFileHandler.close()
        return lMatchInstances

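    # Minimal usage sketch for getMatchListFromFile; the file name
    # 'blast.match.tab' is an assumption for illustration. The parser skips
    # the 'query.name ...' header line if one is present.
    #
    #   lMatches = MatchUtils.getMatchListFromFile("blast.match.tab")
    #   for iMatch in lMatches:
    #       print iMatch.getQueryName(), iMatch.getSubjectName(), iMatch.getIdentity()
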
    ## Split a Match list into several Match lists according to the subject
    #
    # @param lMatches a list of Match instances
    # @return a dictionary whose keys are subject names and whose values are Match lists
    #
    @staticmethod
    def getDictOfListsWithSubjectAsKey(lMatches):
        dSubject2MatchList = {}
        for iMatch in lMatches:
            if not dSubject2MatchList.has_key(iMatch.range_subject.seqname):
                dSubject2MatchList[ iMatch.range_subject.seqname ] = []
            dSubject2MatchList[ iMatch.range_subject.seqname ].append(iMatch)
        return dSubject2MatchList

    ## Split a Match list into several Match lists according to the query
    #
    # @param lMatches a list of Match instances
    # @return a dictionary whose keys are query names and whose values are Match lists
    #
    @staticmethod
    def getDictOfListsWithQueryAsKey(lMatches):
        dQuery2MatchList = {}
        for iMatch in lMatches:
            if not dQuery2MatchList.has_key(iMatch.range_query.seqname):
                dQuery2MatchList[ iMatch.range_query.seqname ] = []
            dQuery2MatchList[ iMatch.range_query.seqname ].append(iMatch)
        return dQuery2MatchList

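    # Usage sketch for the two grouping helpers above; the input list would
    # typically come from getMatchListFromFile, and 'chr1' is just an assumed
    # query name used for illustration.
    #
    #   dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
    #   dSbj2Matches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
    #   for iMatch in dQry2Matches.get("chr1", []):
    #       print iMatch.getSubjectName()
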
    ## Write the Match instances contained in the given list
    #
    # @param lMatches a list of Match instances
    # @param fileName name of the file to write the Match instances
    # @param mode the open mode of the file, "w" or "a"
    # @param header optional header string written before the matches
    #
    @staticmethod
    def writeListInFile(lMatches, fileName, mode = "w", header = None):
        fileHandler = open(fileName, mode)
        if header:
            fileHandler.write(header)
        for iMatch in lMatches:
            iMatch.write(fileHandler)
        fileHandler.close()

    ## Return the list of path ids from a list of Match instances
    #
    # @param lMatch list of Match instances
    # @return lId list of integer ids
    #
    @staticmethod
    def getIdListFromMatchList(lMatch):
        lId = []
        for iMatch in lMatch:
            lId.append(iMatch.id)
        return lId

    ## Remove duplicated matches in a match list
    # replace old PyRepet.MatchDB.rmvDoublons()
    #
    # @param lMatch list of Match instances
    # @return lMatchesUniq list of unique Match instances
    #
    @staticmethod
    def rmvDuplicateMatches(lMatch):
        lMatchesUniq = []
        for match in lMatch:
            if len(lMatchesUniq) == 0:
                lMatchesUniq.append(match)
            else:
                nbDoublons = 0
                for m in lMatchesUniq:
                    if match.isDoublonWith(m):
                        nbDoublons += 1
                if nbDoublons == 0:
                    lMatchesUniq.append(match)

        for match1 in lMatchesUniq:
            for match2 in lMatchesUniq:
                if match1.id != match2.id:
                    if match1.isDoublonWith(match2):
                        raise RepetException ("*** Error: doublon not removed")
        return lMatchesUniq

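    # Hypothetical usage sketch: deduplicate a list parsed from a Match file
    # and write the result back out (both file names are assumptions).
    #
    #   lMatches = MatchUtils.getMatchListFromFile("all.match.tab")
    #   lUniq = MatchUtils.rmvDuplicateMatches(lMatches)
    #   MatchUtils.writeListInFile(lUniq, "all.uniq.match.tab")
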
    ## Return the list of query names to keep when two different databanks are used,
    # i.e. queries that are not 'included' in any subject according to the thresholds.
    # replace old pyRepet.MatchDB.filterDiffQrySbj()
    #
    # @param iBioseqDB bioseqDB databank of queries
    # @param matchFile name of a file in the Match format
    # @param thresIdentity float identity threshold
    # @param thresLength float length threshold
    # @param verbose int verbosity
    #
    # @return lQryToKeep list of query names to keep according to the identity and length thresholds
    # TODO: do not take into account matches of a sequence against itself
    @staticmethod
    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0):
        if verbose > 0:
            print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength); sys.stdout.flush()

        thresIdentityPerc = math.floor(thresIdentity * 100)
        lQryToKeep = []
        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))

        for seqH in iBioseqDB.idx.keys():
            # keep it if it has no match
            if not dQry2Matches.has_key(seqH):
                if seqH not in lQryToKeep:
                    lQryToKeep.append(seqH)
            else:
                isConditionsMet = False
                for match in dQry2Matches[ seqH ]:
                    # check if they are above the thresholds
                    if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
                        isConditionsMet = True
                        break
                if not isConditionsMet and seqH not in lQryToKeep:
                    lQryToKeep.append(seqH)
        return lQryToKeep

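    # Hypothetical usage sketch: the match file name is an assumption, and
    # iBioseqDB stands for a BioseqDB-like object exposing an 'idx' dictionary
    # keyed by query name, as used by the loop above.
    #
    #   lNamesToKeep = MatchUtils.filterDiffQrySbj(iBioseqDB, "qry_vs_sbj.match.tab",
    #                                              thresIdentity = 0.95, thresLength = 0.98, verbose = 1)
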
    ## Count the number of distinct sequences involved in at least one match above the thresholds.
    # replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres()
    #
    # @param lMatches a list of Match instances
    # @param thresIdentity float identity threshold
    # @param thresLength float length threshold
    # @param whatToCount "query" to count distinct queries, "subject" to count distinct subjects
    #
    @staticmethod
    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"):
        thresIdentityPerc = math.floor(thresIdentity * 100)
        countSbj = 0
        if whatToCount.lower() == "query":
            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
        else:
            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)

        for qry in dMatches.keys():
            countMatch = 0
            for match in dMatches[ qry ]:
                if match.identity >= thresIdentityPerc and getattr(match, whatToCount.lower() + "_length_perc") >= thresLength:
                    countMatch += 1
            if countMatch > 0:
                countSbj += 1
        return countSbj

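    # Hypothetical usage sketch (the match file name is an assumption):
    #
    #   lMatches = MatchUtils.getMatchListFromFile("qry_vs_sbj.match.tab")
    #   nbQry = MatchUtils.getNbDistinctSequencesInsideMatchesWithThresh(lMatches, 0.95, 0.98, "query")
    #   nbSbj = MatchUtils.getNbDistinctSequencesInsideMatchesWithThresh(lMatches, 0.95, 0.98, "subject")
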
    ## Convert a 'match' file (output from Matcher) into an 'align' file
    # replace old parser.tab2align
    #
    # @param inFileName a string input file name
    #
    @staticmethod
    def convertMatchFileToAlignFile(inFileName):
        basename = os.path.splitext(inFileName)[0]
        outFileName = "%s.align" % basename
        outFile = open(outFileName, "w")

        lMatches = MatchUtils.getMatchListFromFile(inFileName)

        for match in lMatches:
            string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity())
            outFile.write(string)

        outFile.close()

    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file).
    # For each match, compute the coverage on the smallest sequence using:
    #
    #              Min(|QMatchEnd - QMatchStart + 1|, |SMatchEnd - SMatchStart + 1|)
    #   coverage = -----------------------------------------------------------------
    #                                 Min(QLength, SLength)
    #
    # Use this coverage as the arc value. Matches of a sequence against itself are filtered out.
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverageThreshold float coverage filter threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0):
        sSeqNames = set()
        sSeqNamesUsed = set()
        with open(outFileName, "w") as outF:
            with open(matchFileName) as inF:
                inF.readline()
                for inLine in inF:
                    splittedLine = inLine.split("\t")
                    qName = splittedLine[0]
                    sName = splittedLine[6]
                    sSeqNames.add(qName)
                    sSeqNames.add(sName)
                    if qName != sName:
                        matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1))
                        seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10]))
                        coverage = round(matchMin / seqMin, 2)
                        if coverage >= coverageThreshold:
                            outF.write("%s\n" % "\t".join([qName, sName, str(coverage)]))
                            sSeqNamesUsed.add(qName)
                            sSeqNamesUsed.add(sName)

        with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF:
            for seqName in sorted(sSeqNames - sSeqNamesUsed):
                outSeqNameOfUnusedMatchesF.write("%s\n" % seqName)

        if FileUtils.isEmpty(outFileName):
            print "WARNING: '%s' is empty." % outFileName

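    # Worked example of the coverage expression above, with made-up column
    # values (query.start=1, query.end=400, query.length=400, query.length.%=0.80,
    # subject.start=101, subject.end=400, subject.length=300, subject.length.%=0.25):
    #
    #   matchMin = min(|400 - 1 + 1|, |400 - 101 + 1|) = min(400, 300) = 300
    #   seqMin   = min(400 / 0.80, 300 / 0.25)         = min(500, 1200) = 500
    #   coverage = round(300 / 500, 2)                 = 0.6
    #
    # so the arc "qName sName 0.6" is written only if coverageThreshold <= 0.6.
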
    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file).
    # Use coverage on query for arc value.
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverage float query coverage filter threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
        with open(matchFileName) as inF:
            with open(outFileName, "w") as outF:
                inF.readline()
                for inLine in inF:
                    splittedLine = inLine.split("\t")
                    if float(splittedLine[4]) >= coverage:
                        outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
                        outLine += "\n"
                        outF.write(outLine)

        if FileUtils.isEmpty(outFileName):
            print "WARNING: '%s' is empty." % outFileName

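    # Hypothetical usage sketch for the two ABC converters (file names are
    # assumptions); the resulting .abc files can then be fed to MCL:
    #
    #   MatchUtils.convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq("self.match.tab", "self.cov.abc", 0.5)
    #   MatchUtils.convertMatchFileIntoABCFileOnQueryCoverage("self.match.tab", "self.qcov.abc", 0.8)
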
    ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines.
    # replace old parser.tabnum2id
    #
    # @param fileName a string input file name
    # @param outputFileName a string output file name (optional; by default the input file is rewritten in place)
    #
    @staticmethod
    def generateMatchFileWithNewPathId(fileName, outputFileName = None):
        # read the matches before opening the output, otherwise rewriting the
        # input file in place would truncate it before it is parsed
        lMatches = MatchUtils.getMatchListFromFile(fileName)

        if outputFileName is None:
            outFile = open(fileName, "w")
        else:
            outFile = open(outputFileName, "w")
        outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")

        count = 1
        dMatchKeyIdcount = {}

        for match in lMatches:
            key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
            if not key_id in dMatchKeyIdcount.keys():
                newPath = count
                count += 1
                dMatchKeyIdcount[ key_id ] = newPath
            else:
                newPath = dMatchKeyIdcount[ key_id ]

            match.id = newPath
            outFile.write(match.toString() + "\n")
        outFile.close()
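
    # Hypothetical usage sketch (file names are assumptions): renumber the
    # 'path' column of a concatenated match file, then convert the result to
    # an 'align' file:
    #
    #   MatchUtils.generateMatchFileWithNewPathId("concatenated.match.tab", "renumbered.match.tab")
    #   MatchUtils.convertMatchFileToAlignFile("renumbered.match.tab")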