comparison TEisotools-1.1.a/commons/core/coord/MatchUtils.py @ 16:836ce3d9d47a draft default tip

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:42:47 -0400
parents 255c852351c5
children
comparison
equal deleted inserted replaced
15:255c852351c5 16:836ce3d9d47a
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31 import os
32 import sys
33 import math
34 from commons.core.coord.Match import Match
35 from commons.core.utils.FileUtils import FileUtils
36 from commons.core.checker.RepetException import RepetException
37
38 # # Static methods for the manipulation of Match instances
39 #
class MatchUtils(object):
    """Static methods for the manipulation of Match instances."""

    @staticmethod
    def getMatchListFromFile(inFile):
        """Return a list with Match instances read from the given file.

        @param inFile: name of a file in the Match format
        @return: a list of Match instances
        """
        lMatchInstances = []
        inFileHandler = open(inFile, "r")
        try:
            for line in inFileHandler:
                # skip the tabulated header line produced by Matcher
                if line[0:10] == "query.name":
                    continue
                iMatch = Match()
                iMatch.setFromString(line)
                lMatchInstances.append(iMatch)
        finally:
            # close the handle even if Match parsing raises
            inFileHandler.close()
        return lMatchInstances

    @staticmethod
    def getDictOfListsWithSubjectAsKey(lMatches):
        """Split a Match list in several Match lists according to the subject.

        @param lMatches: a list of Match instances
        @return: a dictionary which keys are subject names and values Match lists
        """
        dSubject2MatchList = {}
        for iMatch in lMatches:
            # setdefault replaces dict.has_key(), which was removed in Python 3
            dSubject2MatchList.setdefault(iMatch.range_subject.seqname, []).append(iMatch)
        return dSubject2MatchList

    @staticmethod
    def getDictOfListsWithQueryAsKey(lMatches):
        """Split a Match list in several Match lists according to the query.

        @param lMatches: a list of Match instances
        @return: a dictionary which keys are query names and values Match lists
        """
        dQuery2MatchList = {}
        for iMatch in lMatches:
            dQuery2MatchList.setdefault(iMatch.range_query.seqname, []).append(iMatch)
        return dQuery2MatchList

    @staticmethod
    def writeListInFile(lMatches, fileName, mode = "w", header = None):
        """Write Match instances contained in the given list.

        @param lMatches: a list of Match instances
        @param fileName: name of the file to write the Match instances
        @param mode: the open mode of the file ("w" or "a")
        @param header: optional header string written before the matches
        """
        fileHandler = open(fileName, mode)
        try:
            if header:
                fileHandler.write(header)
            for iMatch in lMatches:
                iMatch.write(fileHandler)
        finally:
            fileHandler.close()

    @staticmethod
    def getIdListFromMatchList(lMatch):
        """Give the path id list from a list of Match instances.

        @param lMatch: list of Match instances
        @return: list of integer identifiers
        """
        return [iMatch.id for iMatch in lMatch]

    @staticmethod
    def rmvDuplicateMatches(lMatch):
        """Remove duplicated matches in a match list.

        Replaces the old PyRepet.MatchDB.rmvDoublons().

        @param lMatch: list of Match instances
        @return: list of unique Match instances
        @raise RepetException: if a duplicate survives filtering (sanity check)
        """
        lMatchesUniq = []
        for match in lMatch:
            # keep the match only if it duplicates none of the kept ones
            if not any(match.isDoublonWith(m) for m in lMatchesUniq):
                lMatchesUniq.append(match)

        # sanity check: no pair of distinct kept matches may still be duplicates
        for match1 in lMatchesUniq:
            for match2 in lMatchesUniq:
                if match1.id != match2.id and match1.isDoublonWith(match2):
                    raise RepetException ("*** Error: doublon not removed")
        return lMatchesUniq

    @staticmethod
    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0):
        """Return the list of queries 'included' in subjects when two different databanks are used.

        Replaces the old pyRepet.MatchDB.filterDiffQrySbj().

        @param iBioseqDB: bioseqDB databank of queries
        @param matchFile: name of a file in the Match format
        @param thresIdentity: float identity threshold
        @param thresLength: float length threshold
        @param verbose: int verbosity
        @return: list of query names to keep according to both thresholds
        """
        # TODO: don't take into account match for sequence against itself. To do ?
        if verbose > 0:
            print("filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength))
            sys.stdout.flush()

        thresIdentityPerc = math.floor(thresIdentity * 100)
        lQryToKeep = []
        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))

        for seqH in iBioseqDB.idx.keys():
            # keep it if it has no match
            if seqH not in dQry2Matches:
                if seqH not in lQryToKeep:
                    lQryToKeep.append(seqH)
            else:
                # keep it unless at least one match reaches both thresholds
                isConditionsMet = False
                for match in dQry2Matches[seqH]:
                    if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
                        isConditionsMet = True
                        break
                if not isConditionsMet and seqH not in lQryToKeep:
                    lQryToKeep.append(seqH)
        return lQryToKeep

    @staticmethod
    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"):
        """Count the number of distinct sequences involved in at least one match above the thresholds.

        Replaces the old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres().

        @param lMatches: a list of Match instances
        @param thresIdentity: float identity threshold
        @param thresLength: float length threshold
        @param whatToCount: "query" or "subject", the side whose sequences are counted
        @return: int number of distinct sequences
        """
        thresIdentityPerc = math.floor(thresIdentity * 100)
        if whatToCount.lower() == "query":
            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
        else:
            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)

        lengthAttrName = whatToCount.lower() + "_length_perc"
        countSbj = 0
        for seqName in dMatches.keys():
            # one qualifying match is enough to count the sequence: break early
            for match in dMatches[seqName]:
                if match.identity >= thresIdentityPerc and getattr(match, lengthAttrName) >= thresLength:
                    countSbj += 1
                    break
        return countSbj

    @staticmethod
    def convertMatchFileToAlignFile(inFileName):
        """Convert a 'match' file (output from Matcher) into an 'align' file.

        Replaces the old parser.tab2align. The output file takes the input
        basename with an '.align' extension.

        @param inFileName: a string input file name
        """
        outFileName = "%s.align" % os.path.splitext(inFileName)[0]
        lMatches = MatchUtils.getMatchListFromFile(inFileName)

        outFile = open(outFileName, "w")
        try:
            for match in lMatches:
                string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity())
                outFile.write(string)
        finally:
            outFile.close()

    @staticmethod
    def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0):
        """Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file).

        For each match, compute coverage on the smallest seq using this expression:

                       min(|QMatchEnd - QMatchStart + 1|, |SMatchEnd - SMatchStart + 1|)
            coverage = -----------------------------------------------------------------
                       min(QLength, SLength)

        Use this coverage for arc value. Matches of a sequence on itself are
        filtered. Sequence names involved only in filtered matches are written
        to '<outFileName>.unused'.

        @param matchFileName: string input match file name
        @param outFileName: string output abc file name
        @param coverageThreshold: float coverage filter threshold
        """
        sSeqNames = set()
        sSeqNamesUsed = set()
        with open(outFileName, "w") as outF:
            with open(matchFileName) as inF:
                inF.readline()  # skip the header line
                for inLine in inF:
                    splittedLine = inLine.split("\t")
                    qName = splittedLine[0]
                    sName = splittedLine[6]
                    sSeqNames.add(qName)
                    sSeqNames.add(sName)
                    if qName == sName:
                        continue  # self-match: filtered out
                    matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1))
                    # full sequence length = matched length / matched length percentage
                    seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10]))
                    coverage = round(matchMin / seqMin, 2)
                    if coverage >= coverageThreshold:
                        outF.write("%s\n" % "\t".join([qName, sName, str(coverage)]))
                        sSeqNamesUsed.add(qName)
                        sSeqNamesUsed.add(sName)

        with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF:
            for seqName in sorted(sSeqNames - sSeqNamesUsed):
                outSeqNameOfUnusedMatchesF.write("%s\n" % seqName)

        if FileUtils.isEmpty(outFileName):
            print("WARNING: '%s' is empty." % outFileName)

    @staticmethod
    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
        """Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file).

        Use coverage on query for arc value.

        @param matchFileName: string input match file name
        @param outFileName: string output abc file name
        @param coverage: float query coverage filter threshold
        """
        with open(matchFileName) as inF:
            with open(outFileName, "w") as outF:
                inF.readline()  # skip the header line
                for inLine in inF:
                    splittedLine = inLine.split("\t")
                    if float(splittedLine[4]) >= coverage:
                        outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
                        outLine += "\n"
                        outF.write(outLine)

        if FileUtils.isEmpty(outFileName):
            print("WARNING: '%s' is empty." % outFileName)

    @staticmethod
    def generateMatchFileWithNewPathId(fileName, outputFileName = None):
        """Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines.

        Replaces the old parser.tabnum2id.

        @param fileName: a string input file name
        @param outputFileName: a string output file name; when None the input
               file is rewritten in place
        """
        # Read all matches BEFORE opening the output: when outputFileName is
        # None the output IS the input file, and opening it in "w" mode first
        # truncated it before it could be read (bug in the previous version).
        lMatches = MatchUtils.getMatchListFromFile(fileName)

        if outputFileName is None:
            outFile = open(fileName, "w")
        else:
            outFile = open(outputFileName, "w")
        try:
            # NOTE(review): the original's header-write placement is ambiguous;
            # it is written unconditionally here since getMatchListFromFile
            # skips it on re-read — confirm against callers.
            outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")

            count = 1
            dMatchKeyIdcount = {}
            for match in lMatches:
                key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
                if key_id not in dMatchKeyIdcount:
                    # first occurrence of this (id, query, subject) triplet:
                    # assign the next fresh path id
                    dMatchKeyIdcount[ key_id ] = count
                    count += 1
                match.id = dMatchKeyIdcount[ key_id ]
                outFile.write(match.toString() + "\n")
        finally:
            outFile.close()