Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.0/commons/core/coord/MatchUtils.py @ 6:20ec0d14798e draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 05:00:24 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:4093a2fb58be | 6:20ec0d14798e |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 import os | |
32 import sys | |
33 import math | |
34 from commons.core.coord.Match import Match | |
35 from commons.core.utils.FileUtils import FileUtils | |
36 from commons.core.checker.RepetException import RepetException | |
37 | |
38 # # Static methods for the manipulation of Match instances | |
39 # | |
40 class MatchUtils (object): | |
41 | |
42 # # Return a list with Match instances from the given file | |
43 # | |
44 # @param inFile name of a file in the Match format | |
45 # @return a list of Match instances | |
46 # | |
47 @staticmethod | |
48 def getMatchListFromFile(inFile): | |
49 lMatchInstances = [] | |
50 inFileHandler = open(inFile, "r") | |
51 while True: | |
52 line = inFileHandler.readline() | |
53 if line == "": | |
54 break | |
55 if line[0:10] == "query.name": | |
56 continue | |
57 m = Match() | |
58 m.setFromString(line) | |
59 lMatchInstances.append(m) | |
60 inFileHandler.close() | |
61 return lMatchInstances | |
62 | |
63 # # Split a Match list in several Match lists according to the subject | |
64 # | |
65 # @param lMatches a list of Match instances | |
66 # @return a dictionary which keys are subject names and values Match lists | |
67 # | |
68 @staticmethod | |
69 def getDictOfListsWithSubjectAsKey(lMatches): | |
70 dSubject2MatchList = {} | |
71 for iMatch in lMatches: | |
72 if not dSubject2MatchList.has_key(iMatch.range_subject.seqname): | |
73 dSubject2MatchList[ iMatch.range_subject.seqname ] = [] | |
74 dSubject2MatchList[ iMatch.range_subject.seqname ].append(iMatch) | |
75 return dSubject2MatchList | |
76 | |
77 # # Split a Match list in several Match lists according to the query | |
78 # | |
79 # @param lMatches a list of Match instances | |
80 # @return a dictionary which keys are query names and values Match lists | |
81 # | |
82 @staticmethod | |
83 def getDictOfListsWithQueryAsKey (lMatches): | |
84 dQuery2MatchList = {} | |
85 for iMatch in lMatches: | |
86 if not dQuery2MatchList.has_key(iMatch.range_query.seqname): | |
87 dQuery2MatchList[ iMatch.range_query.seqname ] = [] | |
88 dQuery2MatchList[ iMatch.range_query.seqname ].append(iMatch) | |
89 return dQuery2MatchList | |
90 | |
91 # # Write Match instances contained in the given list | |
92 # | |
93 # @param lMatches a list of Match instances | |
94 # @param fileName name of the file to write the Match instances | |
95 # @param mode the open mode of the file ""w"" or ""a"" | |
96 # | |
97 @staticmethod | |
98 def writeListInFile(lMatches, fileName, mode = "w", header = None): | |
99 fileHandler = open(fileName, mode) | |
100 if header: | |
101 fileHandler.write(header) | |
102 for iMatch in lMatches: | |
103 iMatch.write(fileHandler) | |
104 fileHandler.close() | |
105 | |
106 # # Give path id list from a list of Match instances | |
107 # | |
108 # @param lMatch list of Match instances | |
109 # @return lId integer list | |
110 # | |
111 @staticmethod | |
112 def getIdListFromMatchList(lMatch): | |
113 lId = [] | |
114 for iMatch in lMatch: | |
115 lId.append(iMatch.id) | |
116 return lId | |
117 | |
118 # # Remove duplicated matches in a match list | |
119 # ## replace old PyRepet.MatchDB.rmvDoublons() | |
120 # | |
121 # @param lMatch list of Match instances | |
122 # @return lMatchesUniq match unique list | |
123 # | |
124 @staticmethod | |
125 def rmvDuplicateMatches(lMatch): | |
126 lMatchesUniq = [] | |
127 for match in lMatch: | |
128 if len(lMatchesUniq) == 0: | |
129 lMatchesUniq.append(match) | |
130 else: | |
131 nbDoublons = 0 | |
132 for m in lMatchesUniq: | |
133 if match.isDoublonWith(m): | |
134 nbDoublons += 1 | |
135 if nbDoublons == 0: | |
136 lMatchesUniq.append(match) | |
137 | |
138 for match1 in lMatchesUniq: | |
139 for match2 in lMatchesUniq: | |
140 if match1.id != match2.id: | |
141 if match1.isDoublonWith(match2): | |
142 raise RepetException ("*** Error: doublon not removed") | |
143 return lMatchesUniq | |
144 | |
145 # # Return the list of queries 'included' in subjects when two different databanks are used. | |
146 # #replace old pyRepet.MatchDB.filterDiffQrySbj() | |
147 # | |
148 # @param iBioseqDB bioseqDB databank of queries | |
149 # @param thresIdentity float identity threshold | |
150 # @param thresLength float length threshold | |
151 # @param verbose int verbosity | |
152 # | |
153 # @return lMatches match list to keep according to length and identity thresholds | |
154 # TODO: don't take into account match for sequence against itself. To do ? | |
155 @staticmethod | |
156 def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0): | |
157 if verbose > 0: | |
158 print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength); sys.stdout.flush() | |
159 | |
160 thresIdentityPerc = math.floor(thresIdentity * 100) | |
161 lQryToKeep = [] | |
162 dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile)) | |
163 | |
164 for seqH in iBioseqDB.idx.keys(): | |
165 # keep it if it has no match | |
166 if not dQry2Matches.has_key(seqH): | |
167 if seqH not in lQryToKeep: | |
168 lQryToKeep.append(seqH) | |
169 else: | |
170 isConditionsMet = False | |
171 for match in dQry2Matches[ seqH ]: | |
172 # check if they are above the thresholds | |
173 if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength: | |
174 isConditionsMet = True | |
175 break | |
176 if not isConditionsMet and seqH not in lQryToKeep: | |
177 lQryToKeep.append(seqH) | |
178 return lQryToKeep | |
179 | |
180 # # Count the number of distinct matches involved in at least one match above the thresholds. | |
181 # #replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() | |
182 # | |
183 # @param thresIdentity float identity threshold | |
184 # @param thresLength float length threshold | |
185 # | |
186 @staticmethod | |
187 def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"): | |
188 thresIdentityPerc = math.floor(thresIdentity * 100) | |
189 countSbj = 0 | |
190 if whatToCount.lower() == "query": | |
191 dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches) | |
192 else: | |
193 dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches) | |
194 | |
195 for qry in dMatches.keys(): | |
196 countMatch = 0 | |
197 for match in dMatches[ qry ]: | |
198 | |
199 if match.identity >= thresIdentityPerc and getattr(match, whatToCount.lower() + "_length_perc") >= thresLength: | |
200 countMatch += 1 | |
201 if countMatch > 0: | |
202 countSbj += 1 | |
203 return countSbj | |
204 | |
205 # # Convert a 'match' file (output from Matcher) into an 'align' file | |
206 # # replace old parser.tab2align | |
207 # | |
208 # @param inFileName a string input file name | |
209 # | |
210 @staticmethod | |
211 def convertMatchFileToAlignFile(inFileName): | |
212 basename = os.path.splitext(inFileName)[0] | |
213 outFileName = "%s.align" % basename | |
214 outFile = open(outFileName, "w") | |
215 | |
216 lMatches = MatchUtils.getMatchListFromFile(inFileName) | |
217 | |
218 for match in lMatches: | |
219 string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity()) | |
220 outFile.write(string) | |
221 | |
222 outFile.close() | |
223 | |
224 # # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) | |
225 # for each match, compute coverage on the smallest seq using this expression | |
226 # Min (|QMatchEnd - QMatchStart + 1|,|SMatchEnd - SMatchStart + 1|) | |
227 #coverage = ----------------------------------------------------------------- | |
228 # Min (QLength,SLength) | |
229 # | |
230 # Use this coverage for arc value. Matches of a sequence on itself are filtered. | |
231 # | |
232 # @param matchFileName string input match file name | |
233 # @param outFileName string output abc file name | |
234 # @param coverage float coverage filter threshold | |
235 # | |
236 @staticmethod | |
237 def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0): | |
238 sSeqNames = set() | |
239 sSeqNamesUsed = set() | |
240 with open(outFileName, "w") as outF: | |
241 with open(matchFileName) as inF: | |
242 inF.readline() | |
243 for inLine in inF: | |
244 splittedLine = inLine.split("\t") | |
245 qName = splittedLine[0] | |
246 sName = splittedLine[6] | |
247 sSeqNames.add(qName) | |
248 sSeqNames.add(sName) | |
249 if qName != sName: | |
250 matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1)) | |
251 seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10])) | |
252 coverage = round(matchMin / seqMin, 2) | |
253 if coverage >= coverageThreshold: | |
254 outF.write("%s\n" % "\t".join([qName, sName, str(coverage)])) | |
255 sSeqNamesUsed.add(qName) | |
256 sSeqNamesUsed.add(sName) | |
257 | |
258 with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF: | |
259 for seqName in sorted(sSeqNames - sSeqNamesUsed): | |
260 outSeqNameOfUnusedMatchesF.write("%s\n" % seqName) | |
261 | |
262 if FileUtils.isEmpty(outFileName): | |
263 print "WARNING: '%s' is empty." % outFileName | |
264 | |
265 # # Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file) | |
266 # Use coverage on query for arc value | |
267 # | |
268 # @param matchFileName string input match file name | |
269 # @param outFileName string output abc file name | |
270 # @param coverage float query coverage filter threshold | |
271 # | |
272 @staticmethod | |
273 def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0): | |
274 with open(matchFileName) as inF: | |
275 with open(outFileName, "w") as outF: | |
276 inF.readline() | |
277 for inLine in inF: | |
278 splittedLine = inLine.split("\t") | |
279 if float(splittedLine[4]) >= coverage: | |
280 outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]]) | |
281 outLine += "\n" | |
282 outF.write(outLine) | |
283 | |
284 if FileUtils.isEmpty(outFileName): | |
285 print "WARNING: '%s' is empty." % outFileName | |
286 | |
287 # # Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines. | |
288 # # replace old parser.tabnum2id | |
289 # | |
290 # @param fileName a string input file name | |
291 # @param outputFileName a string output file name (optional) | |
292 # | |
293 @staticmethod | |
294 def generateMatchFileWithNewPathId(fileName, outputFileName = None): | |
295 if outputFileName is None: | |
296 outFile = open(fileName, "w") | |
297 else: | |
298 outFile = open(outputFileName, "w") | |
299 outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n") | |
300 | |
301 lMatches = MatchUtils.getMatchListFromFile(fileName) | |
302 count = 1 | |
303 dMatchKeyIdcount = {} | |
304 | |
305 for match in lMatches: | |
306 key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName() | |
307 if not key_id in dMatchKeyIdcount.keys(): | |
308 newPath = count | |
309 count += 1 | |
310 dMatchKeyIdcount[ key_id ] = newPath | |
311 else: | |
312 newPath = dMatchKeyIdcount[ key_id ] | |
313 | |
314 match.id = newPath | |
315 outFile.write(match.toString() + "\n") | |
316 outFile.close() |