Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.1.a/commons/core/coord/MatchUtils.py @ 13:feef9a0db09d draft
Uploaded
| author | urgi-team | 
|---|---|
| date | Wed, 20 Jul 2016 09:04:42 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 12:22b0494ec883 | 13:feef9a0db09d | 
|---|---|
| 1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 2 # http://www.inra.fr | |
| 3 # http://urgi.versailles.inra.fr | |
| 4 # | |
| 5 # This software is governed by the CeCILL license under French law and | |
| 6 # abiding by the rules of distribution of free software. You can use, | |
| 7 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 9 # "http://www.cecill.info". | |
| 10 # | |
| 11 # As a counterpart to the access to the source code and rights to copy, | |
| 12 # modify and redistribute granted by the license, users are provided only | |
| 13 # with a limited warranty and the software's author, the holder of the | |
| 14 # economic rights, and the successive licensors have only limited | |
| 15 # liability. | |
| 16 # | |
| 17 # In this respect, the user's attention is drawn to the risks associated | |
| 18 # with loading, using, modifying and/or developing or reproducing the | |
| 19 # software by the user in light of its specific status of free software, | |
| 20 # that may mean that it is complicated to manipulate, and that also | |
| 21 # therefore means that it is reserved for developers and experienced | |
| 22 # professionals having in-depth computer knowledge. Users are therefore | |
| 23 # encouraged to load and test the software's suitability as regards their | |
| 24 # requirements in conditions enabling the security of their systems and/or | |
| 25 # data to be ensured and, more generally, to use and operate it in the | |
| 26 # same conditions as regards security. | |
| 27 # | |
| 28 # The fact that you are presently reading this means that you have had | |
| 29 # knowledge of the CeCILL license and that you accept its terms. | |
| 30 | |
| 31 import os | |
| 32 import sys | |
| 33 import math | |
| 34 from commons.core.coord.Match import Match | |
| 35 from commons.core.utils.FileUtils import FileUtils | |
| 36 from commons.core.checker.RepetException import RepetException | |
| 37 | |
| 38 # # Static methods for the manipulation of Match instances | |
| 39 # | |
| 40 class MatchUtils (object): | |
| 41 | |
| 42 # # Return a list with Match instances from the given file | |
| 43 # | |
| 44 # @param inFile name of a file in the Match format | |
| 45 # @return a list of Match instances | |
| 46 # | |
| 47 @staticmethod | |
| 48 def getMatchListFromFile(inFile): | |
| 49 lMatchInstances = [] | |
| 50 inFileHandler = open(inFile, "r") | |
| 51 while True: | |
| 52 line = inFileHandler.readline() | |
| 53 if line == "": | |
| 54 break | |
| 55 if line[0:10] == "query.name": | |
| 56 continue | |
| 57 m = Match() | |
| 58 m.setFromString(line) | |
| 59 lMatchInstances.append(m) | |
| 60 inFileHandler.close() | |
| 61 return lMatchInstances | |
| 62 | |
| 63 # # Split a Match list in several Match lists according to the subject | |
| 64 # | |
| 65 # @param lMatches a list of Match instances | |
| 66 # @return a dictionary which keys are subject names and values Match lists | |
| 67 # | |
| 68 @staticmethod | |
| 69 def getDictOfListsWithSubjectAsKey(lMatches): | |
| 70 dSubject2MatchList = {} | |
| 71 for iMatch in lMatches: | |
| 72 if not dSubject2MatchList.has_key(iMatch.range_subject.seqname): | |
| 73 dSubject2MatchList[ iMatch.range_subject.seqname ] = [] | |
| 74 dSubject2MatchList[ iMatch.range_subject.seqname ].append(iMatch) | |
| 75 return dSubject2MatchList | |
| 76 | |
| 77 # # Split a Match list in several Match lists according to the query | |
| 78 # | |
| 79 # @param lMatches a list of Match instances | |
| 80 # @return a dictionary which keys are query names and values Match lists | |
| 81 # | |
| 82 @staticmethod | |
| 83 def getDictOfListsWithQueryAsKey (lMatches): | |
| 84 dQuery2MatchList = {} | |
| 85 for iMatch in lMatches: | |
| 86 if not dQuery2MatchList.has_key(iMatch.range_query.seqname): | |
| 87 dQuery2MatchList[ iMatch.range_query.seqname ] = [] | |
| 88 dQuery2MatchList[ iMatch.range_query.seqname ].append(iMatch) | |
| 89 return dQuery2MatchList | |
| 90 | |
| 91 # # Write Match instances contained in the given list | |
| 92 # | |
| 93 # @param lMatches a list of Match instances | |
| 94 # @param fileName name of the file to write the Match instances | |
| 95 # @param mode the open mode of the file ""w"" or ""a"" | |
| 96 # | |
| 97 @staticmethod | |
| 98 def writeListInFile(lMatches, fileName, mode = "w", header = None): | |
| 99 fileHandler = open(fileName, mode) | |
| 100 if header: | |
| 101 fileHandler.write(header) | |
| 102 for iMatch in lMatches: | |
| 103 iMatch.write(fileHandler) | |
| 104 fileHandler.close() | |
| 105 | |
| 106 # # Give path id list from a list of Match instances | |
| 107 # | |
| 108 # @param lMatch list of Match instances | |
| 109 # @return lId integer list | |
| 110 # | |
| 111 @staticmethod | |
| 112 def getIdListFromMatchList(lMatch): | |
| 113 lId = [] | |
| 114 for iMatch in lMatch: | |
| 115 lId.append(iMatch.id) | |
| 116 return lId | |
| 117 | |
| 118 # # Remove duplicated matches in a match list | |
| 119 # ## replace old PyRepet.MatchDB.rmvDoublons() | |
| 120 # | |
| 121 # @param lMatch list of Match instances | |
| 122 # @return lMatchesUniq match unique list | |
| 123 # | |
| 124 @staticmethod | |
| 125 def rmvDuplicateMatches(lMatch): | |
| 126 lMatchesUniq = [] | |
| 127 for match in lMatch: | |
| 128 if len(lMatchesUniq) == 0: | |
| 129 lMatchesUniq.append(match) | |
| 130 else: | |
| 131 nbDoublons = 0 | |
| 132 for m in lMatchesUniq: | |
| 133 if match.isDoublonWith(m): | |
| 134 nbDoublons += 1 | |
| 135 if nbDoublons == 0: | |
| 136 lMatchesUniq.append(match) | |
| 137 | |
| 138 for match1 in lMatchesUniq: | |
| 139 for match2 in lMatchesUniq: | |
| 140 if match1.id != match2.id: | |
| 141 if match1.isDoublonWith(match2): | |
| 142 raise RepetException ("*** Error: doublon not removed") | |
| 143 return lMatchesUniq | |
| 144 | |
| 145 # # Return the list of queries 'included' in subjects when two different databanks are used. | |
| 146 # #replace old pyRepet.MatchDB.filterDiffQrySbj() | |
| 147 # | |
| 148 # @param iBioseqDB bioseqDB databank of queries | |
| 149 # @param thresIdentity float identity threshold | |
| 150 # @param thresLength float length threshold | |
| 151 # @param verbose int verbosity | |
| 152 # | |
| 153 # @return lMatches match list to keep according to length and identity thresholds | |
| 154 # TODO: don't take into account match for sequence against itself. To do ? | |
| 155 @staticmethod | |
| 156 def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0): | |
| 157 if verbose > 0: | |
| 158 print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength); sys.stdout.flush() | |
| 159 | |
| 160 thresIdentityPerc = math.floor(thresIdentity * 100) | |
| 161 lQryToKeep = [] | |
| 162 dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile)) | |
| 163 | |
| 164 for seqH in iBioseqDB.idx.keys(): | |
| 165 # keep it if it has no match | |
| 166 if not dQry2Matches.has_key(seqH): | |
| 167 if seqH not in lQryToKeep: | |
| 168 lQryToKeep.append(seqH) | |
| 169 else: | |
| 170 isConditionsMet = False | |
| 171 for match in dQry2Matches[ seqH ]: | |
| 172 # check if they are above the thresholds | |
| 173 if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength: | |
| 174 isConditionsMet = True | |
| 175 break | |
| 176 if not isConditionsMet and seqH not in lQryToKeep: | |
| 177 lQryToKeep.append(seqH) | |
| 178 return lQryToKeep | |
| 179 | |
| 180 # # Count the number of distinct matches involved in at least one match above the thresholds. | |
| 181 # #replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() | |
| 182 # | |
| 183 # @param thresIdentity float identity threshold | |
| 184 # @param thresLength float length threshold | |
| 185 # | |
| 186 @staticmethod | |
| 187 def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"): | |
| 188 thresIdentityPerc = math.floor(thresIdentity * 100) | |
| 189 countSbj = 0 | |
| 190 if whatToCount.lower() == "query": | |
| 191 dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches) | |
| 192 else: | |
| 193 dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches) | |
| 194 | |
| 195 for qry in dMatches.keys(): | |
| 196 countMatch = 0 | |
| 197 for match in dMatches[ qry ]: | |
| 198 | |
| 199 if match.identity >= thresIdentityPerc and getattr(match, whatToCount.lower() + "_length_perc") >= thresLength: | |
| 200 countMatch += 1 | |
| 201 if countMatch > 0: | |
| 202 countSbj += 1 | |
| 203 return countSbj | |
| 204 | |
| 205 # # Convert a 'match' file (output from Matcher) into an 'align' file | |
| 206 # # replace old parser.tab2align | |
| 207 # | |
| 208 # @param inFileName a string input file name | |
| 209 # | |
| 210 @staticmethod | |
| 211 def convertMatchFileToAlignFile(inFileName): | |
| 212 basename = os.path.splitext(inFileName)[0] | |
| 213 outFileName = "%s.align" % basename | |
| 214 outFile = open(outFileName, "w") | |
| 215 | |
| 216 lMatches = MatchUtils.getMatchListFromFile(inFileName) | |
| 217 | |
| 218 for match in lMatches: | |
| 219 string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity()) | |
| 220 outFile.write(string) | |
| 221 | |
| 222 outFile.close() | |
| 223 | |
| 224 # # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) | |
| 225 # for each match, compute coverage on the smallest seq using this expression | |
| 226 # Min (|QMatchEnd - QMatchStart + 1|,|SMatchEnd - SMatchStart + 1|) | |
| 227 #coverage = ----------------------------------------------------------------- | |
| 228 # Min (QLength,SLength) | |
| 229 # | |
| 230 # Use this coverage for arc value. Matches of a sequence on itself are filtered. | |
| 231 # | |
| 232 # @param matchFileName string input match file name | |
| 233 # @param outFileName string output abc file name | |
| 234 # @param coverage float coverage filter threshold | |
| 235 # | |
| 236 @staticmethod | |
| 237 def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0): | |
| 238 sSeqNames = set() | |
| 239 sSeqNamesUsed = set() | |
| 240 with open(outFileName, "w") as outF: | |
| 241 with open(matchFileName) as inF: | |
| 242 inF.readline() | |
| 243 for inLine in inF: | |
| 244 splittedLine = inLine.split("\t") | |
| 245 qName = splittedLine[0] | |
| 246 sName = splittedLine[6] | |
| 247 sSeqNames.add(qName) | |
| 248 sSeqNames.add(sName) | |
| 249 if qName != sName: | |
| 250 matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1)) | |
| 251 seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10])) | |
| 252 coverage = round(matchMin / seqMin, 2) | |
| 253 if coverage >= coverageThreshold: | |
| 254 outF.write("%s\n" % "\t".join([qName, sName, str(coverage)])) | |
| 255 sSeqNamesUsed.add(qName) | |
| 256 sSeqNamesUsed.add(sName) | |
| 257 | |
| 258 with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF: | |
| 259 for seqName in sorted(sSeqNames - sSeqNamesUsed): | |
| 260 outSeqNameOfUnusedMatchesF.write("%s\n" % seqName) | |
| 261 | |
| 262 if FileUtils.isEmpty(outFileName): | |
| 263 print "WARNING: '%s' is empty." % outFileName | |
| 264 | |
| 265 # # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) | |
| 266 # Use coverage on query for arc value | |
| 267 # | |
| 268 # @param matchFileName string input match file name | |
| 269 # @param outFileName string output abc file name | |
| 270 # @param coverage float query coverage filter threshold | |
| 271 # | |
| 272 @staticmethod | |
| 273 def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0): | |
| 274 with open(matchFileName) as inF: | |
| 275 with open(outFileName, "w") as outF: | |
| 276 inF.readline() | |
| 277 for inLine in inF: | |
| 278 splittedLine = inLine.split("\t") | |
| 279 if float(splittedLine[4]) >= coverage: | |
| 280 outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]]) | |
| 281 outLine += "\n" | |
| 282 outF.write(outLine) | |
| 283 | |
| 284 if FileUtils.isEmpty(outFileName): | |
| 285 print "WARNING: '%s' is empty." % outFileName | |
| 286 | |
| 287 # # Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines. | |
| 288 # # replace old parser.tabnum2id | |
| 289 # | |
| 290 # @param fileName a string input file name | |
| 291 # @param outputFileName a string output file name (optional) | |
| 292 # | |
| 293 @staticmethod | |
| 294 def generateMatchFileWithNewPathId(fileName, outputFileName = None): | |
| 295 if outputFileName is None: | |
| 296 outFile = open(fileName, "w") | |
| 297 else: | |
| 298 outFile = open(outputFileName, "w") | |
| 299 outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n") | |
| 300 | |
| 301 lMatches = MatchUtils.getMatchListFromFile(fileName) | |
| 302 count = 1 | |
| 303 dMatchKeyIdcount = {} | |
| 304 | |
| 305 for match in lMatches: | |
| 306 key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName() | |
| 307 if not key_id in dMatchKeyIdcount.keys(): | |
| 308 newPath = count | |
| 309 count += 1 | |
| 310 dMatchKeyIdcount[ key_id ] = newPath | |
| 311 else: | |
| 312 newPath = dMatchKeyIdcount[ key_id ] | |
| 313 | |
| 314 match.id = newPath | |
| 315 outFile.write(match.toString() + "\n") | |
| 316 outFile.close() | 
