TEisotools-1.1.a/commons/core/coord/MatchUtils.py @ 13:feef9a0db09d (draft)

Uploaded

author:   urgi-team
date:     Wed, 20 Jul 2016 09:04:42 -0400
parents:
children:

comparison: 12:22b0494ec883 vs 13:feef9a0db09d
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import os
import sys
import math
from commons.core.coord.Match import Match
from commons.core.utils.FileUtils import FileUtils
from commons.core.checker.RepetException import RepetException

## Static methods for the manipulation of Match instances
#
class MatchUtils(object):

    ## Return a list of Match instances read from the given file
    #
    # @param inFile name of a file in the Match format
    # @return a list of Match instances
    #
    @staticmethod
    def getMatchListFromFile(inFile):
        lMatchInstances = []
        inFileHandler = open(inFile, "r")
        while True:
            line = inFileHandler.readline()
            if line == "":
                break
            # skip header lines
            if line[0:10] == "query.name":
                continue
            m = Match()
            m.setFromString(line)
            lMatchInstances.append(m)
        inFileHandler.close()
        return lMatchInstances

    ## Split a Match list into several Match lists according to the subject
    #
    # @param lMatches a list of Match instances
    # @return a dictionary whose keys are subject names and whose values are Match lists
    #
    @staticmethod
    def getDictOfListsWithSubjectAsKey(lMatches):
        dSubject2MatchList = {}
        for iMatch in lMatches:
            if not dSubject2MatchList.has_key(iMatch.range_subject.seqname):
                dSubject2MatchList[ iMatch.range_subject.seqname ] = []
            dSubject2MatchList[ iMatch.range_subject.seqname ].append(iMatch)
        return dSubject2MatchList

    ## Split a Match list into several Match lists according to the query
    #
    # @param lMatches a list of Match instances
    # @return a dictionary whose keys are query names and whose values are Match lists
    #
    @staticmethod
    def getDictOfListsWithQueryAsKey(lMatches):
        dQuery2MatchList = {}
        for iMatch in lMatches:
            if not dQuery2MatchList.has_key(iMatch.range_query.seqname):
                dQuery2MatchList[ iMatch.range_query.seqname ] = []
            dQuery2MatchList[ iMatch.range_query.seqname ].append(iMatch)
        return dQuery2MatchList

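    ## Example usage (illustrative only; "matches.tab" is a hypothetical file in the Match format):
    #     lMatches = MatchUtils.getMatchListFromFile("matches.tab")
    #     dQuery2Matches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
    #     dSubject2Matches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
    #
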
    ## Write the Match instances contained in the given list
    #
    # @param lMatches a list of Match instances
    # @param fileName name of the file to write the Match instances
    # @param mode the open mode of the file, "w" or "a"
    # @param header optional header string written before the matches
    #
    @staticmethod
    def writeListInFile(lMatches, fileName, mode = "w", header = None):
        fileHandler = open(fileName, mode)
        if header:
            fileHandler.write(header)
        for iMatch in lMatches:
            iMatch.write(fileHandler)
        fileHandler.close()

    ## Return the list of path ids from a list of Match instances
    #
    # @param lMatch list of Match instances
    # @return lId list of integer path ids
    #
    @staticmethod
    def getIdListFromMatchList(lMatch):
        lId = []
        for iMatch in lMatch:
            lId.append(iMatch.id)
        return lId

    ## Remove duplicated matches in a match list
    ## replace old PyRepet.MatchDB.rmvDoublons()
    #
    # @param lMatch list of Match instances
    # @return lMatchesUniq list of unique Match instances
    #
    @staticmethod
    def rmvDuplicateMatches(lMatch):
        lMatchesUniq = []
        for match in lMatch:
            if len(lMatchesUniq) == 0:
                lMatchesUniq.append(match)
            else:
                nbDoublons = 0
                for m in lMatchesUniq:
                    if match.isDoublonWith(m):
                        nbDoublons += 1
                if nbDoublons == 0:
                    lMatchesUniq.append(match)

        for match1 in lMatchesUniq:
            for match2 in lMatchesUniq:
                if match1.id != match2.id:
                    if match1.isDoublonWith(match2):
                        raise RepetException("*** Error: doublon not removed")
        return lMatchesUniq

    ## Filter the queries 'included' in subjects when two different databanks are used, and return the names of the queries to keep
    ## replace old pyRepet.MatchDB.filterDiffQrySbj()
    #
    # @param iBioseqDB bioseqDB databank of queries
    # @param matchFile name of a file in the Match format
    # @param thresIdentity float identity threshold
    # @param thresLength float length threshold
    # @param verbose int verbosity
    #
    # @return lQryToKeep list of query names to keep, i.e. queries without any match above both thresholds
    # TODO: do not take matches of a sequence against itself into account
    @staticmethod
    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0):
        if verbose > 0:
            print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength); sys.stdout.flush()

        thresIdentityPerc = math.floor(thresIdentity * 100)
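        # NB: match identities are compared as percentages (hence the * 100 above),
        # whereas query_length_perc is compared directly to thresLength as a fraction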
        lQryToKeep = []
        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))

        for seqH in iBioseqDB.idx.keys():
            # keep it if it has no match
            if not dQry2Matches.has_key(seqH):
                if seqH not in lQryToKeep:
                    lQryToKeep.append(seqH)
            else:
                isConditionsMet = False
                for match in dQry2Matches[ seqH ]:
                    # check if they are above the thresholds
                    if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
                        isConditionsMet = True
                        break
                if not isConditionsMet and seqH not in lQryToKeep:
                    lQryToKeep.append(seqH)
        return lQryToKeep

    ## Count the number of distinct sequences (queries or subjects) involved in at least one match above the thresholds
    ## replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres()
    #
    # @param lMatches list of Match instances
    # @param thresIdentity float identity threshold
    # @param thresLength float length threshold
    # @param whatToCount "query" or "subject", the side on which distinct sequences are counted
    #
    @staticmethod
    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"):
        thresIdentityPerc = math.floor(thresIdentity * 100)
        countSbj = 0
        if whatToCount.lower() == "query":
            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
        else:
            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)

        for qry in dMatches.keys():
            countMatch = 0
            for match in dMatches[ qry ]:
                if match.identity >= thresIdentityPerc and getattr(match, whatToCount.lower() + "_length_perc") >= thresLength:
                    countMatch += 1
            if countMatch > 0:
                countSbj += 1
        return countSbj

    ## Convert a 'match' file (output from Matcher) into an 'align' file
    ## replace old parser.tab2align
    #
    # @param inFileName a string input file name
    #
    @staticmethod
    def convertMatchFileToAlignFile(inFileName):
        basename = os.path.splitext(inFileName)[0]
        outFileName = "%s.align" % basename
        outFile = open(outFileName, "w")

        lMatches = MatchUtils.getMatchListFromFile(inFileName)

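        # one 'align' line per match: query name, start, end, subject name, start, end, E-value, score, identity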
        for match in lMatches:
            string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity())
            outFile.write(string)

        outFile.close()

    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
    #  For each match, compute the coverage on the smallest sequence as:
    #
    #      coverage = Min(|QMatchEnd - QMatchStart + 1|, |SMatchEnd - SMatchStart + 1|) / Min(QLength, SLength)
    #
    #  Use this coverage as the arc value. Matches of a sequence on itself are filtered out.
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverageThreshold float coverage filter threshold
    #
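    #  Worked example (hypothetical values): a match covering positions 1..300 on the query and
    #  1..290 on the subject, with full lengths QLength = 1000 and SLength = 300, gives
    #  coverage = min(300, 290) / min(1000, 300) = 290 / 300 ~ 0.97
    #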
    @staticmethod
    def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0):
        sSeqNames = set()
        sSeqNamesUsed = set()
        with open(outFileName, "w") as outF:
            with open(matchFileName) as inF:
                # skip the header line
                inF.readline()
                for inLine in inF:
                    splittedLine = inLine.split("\t")
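                    # column layout of a Match line (same order as the header written by
                    # generateMatchFileWithNewPathId): 0 query.name, 1 query.start, 2 query.end,
                    # 3 query.length, 4 query.length.%, 5 match.length.%, 6 subject.name,
                    # 7 subject.start, 8 subject.end, 9 subject.length, 10 subject.length.%,
                    # 11 E.value, 12 Score, 13 Identity, 14 path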
                    qName = splittedLine[0]
                    sName = splittedLine[6]
                    sSeqNames.add(qName)
                    sSeqNames.add(sName)
                    if qName != sName:
                        matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1))
                        # Min(QLength, SLength): full sequence lengths recovered as matched length / length fraction
                        seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10]))
                        coverage = round(matchMin / seqMin, 2)
                        if coverage >= coverageThreshold:
                            outF.write("%s\n" % "\t".join([qName, sName, str(coverage)]))
                            sSeqNamesUsed.add(qName)
                            sSeqNamesUsed.add(sName)

        with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF:
            for seqName in sorted(sSeqNames - sSeqNamesUsed):
                outSeqNameOfUnusedMatchesF.write("%s\n" % seqName)

        if FileUtils.isEmpty(outFileName):
            print "WARNING: '%s' is empty." % outFileName

    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
    #  Use the coverage on the query as the arc value.
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverage float query coverage filter threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
        with open(matchFileName) as inF:
            with open(outFileName, "w") as outF:
                # skip the header line
                inF.readline()
                for inLine in inF:
                    splittedLine = inLine.split("\t")
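                    # keep query.name (column 0) and subject.name (column 6),
                    # with query.length.% (column 4) as the arc value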
                    if float(splittedLine[4]) >= coverage:
                        outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
                        outLine += "\n"
                        outF.write(outLine)

        if FileUtils.isEmpty(outFileName):
            print "WARNING: '%s' is empty." % outFileName

    ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines
    ## replace old parser.tabnum2id
    #
    # @param fileName a string input file name
    # @param outputFileName a string output file name (optional; by default the input file is rewritten in place)
    #
    @staticmethod
    def generateMatchFileWithNewPathId(fileName, outputFileName = None):
        # read all matches first, so that the input file can safely be rewritten in place
        lMatches = MatchUtils.getMatchListFromFile(fileName)

        if outputFileName is None:
            outFile = open(fileName, "w")
        else:
            outFile = open(outputFileName, "w")
        outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")

        count = 1
        dMatchKeyIdcount = {}

        for match in lMatches:
            key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
            if key_id not in dMatchKeyIdcount:
                newPath = count
                count += 1
                dMatchKeyIdcount[ key_id ] = newPath
            else:
                newPath = dMatchKeyIdcount[ key_id ]

            match.id = newPath
            outFile.write(match.toString() + "\n")
        outFile.close()
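
## Example end-to-end usage (illustrative only; file names are hypothetical):
#     MatchUtils.generateMatchFileWithNewPathId("concatenated.tab", "renumbered.tab")
#     MatchUtils.convertMatchFileToAlignFile("renumbered.tab")
#     MatchUtils.convertMatchFileIntoABCFileOnQueryCoverage("renumbered.tab", "out.abc", coverage = 0.8)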