comparison commons/core/coord/MatchUtils.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31 import math
32 import os
33 import sys
34 from commons.core.coord.Match import Match
35 from commons.core.checker.RepetException import RepetException
36
37 ## Static methods for the manipulation of Match instances
38 #
39 class MatchUtils ( object ):
40
41 ## Return a list with Match instances from the given file
42 #
43 # @param inFile name of a file in the Match format
44 # @return a list of Match instances
45 #
46 def getMatchListFromFile(inFile ):
47 lMatchInstances = []
48 inFileHandler = open( inFile, "r" )
49 while True:
50 line = inFileHandler.readline()
51 if line == "":
52 break
53 if line[0:10] == "query.name":
54 continue
55 m = Match()
56 m.setFromString( line )
57 lMatchInstances.append( m )
58 inFileHandler.close()
59 return lMatchInstances
60
61 getMatchListFromFile = staticmethod( getMatchListFromFile )
62
63 ## Split a Match list in several Match lists according to the subject
64 #
65 # @param lMatches a list of Match instances
66 # @return a dictionary which keys are subject names and values Match lists
67 #
68 def getDictOfListsWithSubjectAsKey( lMatches ):
69 dSubject2MatchList = {}
70 for iMatch in lMatches:
71 if not dSubject2MatchList.has_key( iMatch.range_subject.seqname ):
72 dSubject2MatchList[ iMatch.range_subject.seqname ] = []
73 dSubject2MatchList[ iMatch.range_subject.seqname ].append( iMatch )
74 return dSubject2MatchList
75
76 getDictOfListsWithSubjectAsKey = staticmethod( getDictOfListsWithSubjectAsKey )
77
78 ## Split a Match list in several Match lists according to the query
79 #
80 # @param lMatches a list of Match instances
81 # @return a dictionary which keys are query names and values Match lists
82 #
83 def getDictOfListsWithQueryAsKey ( lMatches ):
84 dQuery2MatchList = {}
85 for iMatch in lMatches:
86 if not dQuery2MatchList.has_key( iMatch.range_query.seqname ):
87 dQuery2MatchList[ iMatch.range_query.seqname ] = []
88 dQuery2MatchList[ iMatch.range_query.seqname ].append( iMatch )
89 return dQuery2MatchList
90
91 getDictOfListsWithQueryAsKey = staticmethod( getDictOfListsWithQueryAsKey )
92
93 ## Write Match instances contained in the given list
94 #
95 # @param lMatches a list of Match instances
96 # @param fileName name of the file to write the Match instances
97 # @param mode the open mode of the file ""w"" or ""a""
98 #
99 def writeListInFile( lMatches, fileName, mode="w", header=None ):
100 fileHandler = open( fileName, mode )
101 if header:
102 fileHandler.write( header )
103 for iMatch in lMatches:
104 iMatch.write( fileHandler )
105 fileHandler.close()
106
107 writeListInFile = staticmethod( writeListInFile )
108
109 ## Give path id list from a list of Match instances
110 #
111 # @param lMatch list of Match instances
112 #
113 # @return lId integer list
114 #
115 def getIdListFromMatchList(lMatch):
116 lId = []
117 for iMatch in lMatch:
118 lId.append(iMatch.id)
119 return lId
120
121 getIdListFromMatchList = staticmethod(getIdListFromMatchList)
122
123 ## Remove duplicated matches in a match list
124 # ## replace old PyRepet.MatchDB.rmvDoublons()
125 # @param lMatch list of Match instances
126 #
127 # @return lMatchesUniq match unique list
128 #
129 def rmvDuplicateMatches(lMatch):
130 lMatchesUniq = []
131 for match in lMatch:
132 if len(lMatchesUniq) == 0:
133 lMatchesUniq.append( match )
134 else:
135 nbDoublons = 0
136 for m in lMatchesUniq:
137 if match.isDoublonWith( m ):
138 nbDoublons += 1
139 if nbDoublons == 0:
140 lMatchesUniq.append( match )
141
142 for match1 in lMatchesUniq:
143 for match2 in lMatchesUniq:
144 if match1.id != match2.id:
145 if match1.isDoublonWith( match2 ):
146 raise RepetException ( "*** Error: doublon not removed" )
147 return lMatchesUniq
148
149 rmvDuplicateMatches = staticmethod(rmvDuplicateMatches)
150
151 ## Return the list of queries 'included' in subjects when two different databanks are used.
152 ##replace old pyRepet.MatchDB.filterDiffQrySbj()
153 #
154 # @param iBioseqDB bioseqDB databank of queries
155 #
156 # @param thresIdentity float identity threshold
157 #
158 # @param thresLength float length threshold
159 #
160 # @param verbose int verbosity
161 #
162 # @return lMatches match list to keep according to length and identity thresholds
163 #TODO: don't take into account match for sequence against itself. To do ?
164 def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity=0.95, thresLength=0.98, verbose=0 ):
165 if verbose > 0:
166 print "filtering matches (id>=%.2f,qlgth>=%.2f)..." % ( thresIdentity, thresLength ); sys.stdout.flush()
167
168 thresIdentityPerc = math.floor( thresIdentity*100 )
169 lQryToKeep = []
170 dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))
171
172 for seqH in iBioseqDB.idx.keys():
173 # keep it if it has no match
174 if not dQry2Matches.has_key( seqH ):
175 if seqH not in lQryToKeep:
176 lQryToKeep.append( seqH )
177 else:
178 isConditionsMet = False
179 for match in dQry2Matches[ seqH ]:
180 # check if they are above the thresholds
181 if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
182 isConditionsMet = True
183 break
184 if not isConditionsMet and seqH not in lQryToKeep:
185 lQryToKeep.append( seqH )
186 return lQryToKeep
187
188 filterDiffQrySbj = staticmethod(filterDiffQrySbj)
189
190 ## Count the number of distinct matches involved in at least one match above the thresholds.
191 ##replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres()
192 # @param thresIdentity float identity threshold
193 #
194 # @param thresLength float length threshold
195 #
196 def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity=0.95, thresLength=0.98, whatToCount="query" ):
197 thresIdentityPerc = math.floor( thresIdentity*100 )
198 countSbj = 0
199 if whatToCount.lower() == "query":
200 dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
201 else:
202 dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
203
204 for qry in dMatches.keys():
205 countMatch = 0
206 for match in dMatches[ qry ]:
207
208 if match.identity >= thresIdentityPerc and getattr(match,whatToCount.lower() +"_length_perc") >= thresLength:
209 countMatch += 1
210 if countMatch > 0:
211 countSbj += 1
212 return countSbj
213
214 getNbDistinctSequencesInsideMatchesWithThresh = staticmethod(getNbDistinctSequencesInsideMatchesWithThresh)
215
216 ## Convert a 'match' file (output from Matcher) into an 'align' file
217 ## replace old parser.tab2align
218 #
219 # @param inFileName a string input file name
220 #
221 def convertMatchFileToAlignFile(inFileName):
222 basename = os.path.splitext(inFileName)[0]
223 outFileName = "%s.align" % basename
224 outFile = open(outFileName, "w")
225
226 lMatches = MatchUtils.getMatchListFromFile(inFileName)
227
228 for match in lMatches:
229 string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity() )
230 outFile.write( string )
231
232 outFile.close()
233
234 convertMatchFileToAlignFile = staticmethod(convertMatchFileToAlignFile)
235
236 ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
237 # Use coverage on query for arc value
238 #
239 # @param matchFileName string input match file name
240 # @param outFileName string output abc file name
241 # @param coverage float query coverage filter threshold
242 #
243 @staticmethod
244 def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
245 with open(matchFileName) as inF:
246 with open(outFileName, "w") as outF:
247 inF.readline()
248 inLine = inF.readline()
249 while inLine:
250 splittedLine = inLine.split("\t")
251 if float(splittedLine[4]) >= coverage:
252 outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
253 outLine += "\n"
254 outF.write(outLine)
255 inLine = inF.readline()
256
257 ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines.
258 ## replace old parser.tabnum2id
259 #
260 # @param fileName a string input file name
261 # @param outputFileName a string output file name (optional)
262 #
263 def generateMatchFileWithNewPathId(fileName, outputFileName=None):
264 if outputFileName is None:
265 outFile = open(fileName, "w")
266 else:
267 outFile = open(outputFileName, "w")
268 outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")
269
270 lMatches = MatchUtils.getMatchListFromFile(fileName)
271 count = 1
272 dMatchKeyIdcount = {}
273
274 for match in lMatches:
275 key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
276 if not key_id in dMatchKeyIdcount.keys():
277 newPath = count
278 count += 1
279 dMatchKeyIdcount[ key_id ] = newPath
280 else:
281 newPath = dMatchKeyIdcount[ key_id ]
282
283 match.id = newPath
284 outFile.write( match.toString()+"\n" )
285 outFile.close()
286
287 generateMatchFileWithNewPathId = staticmethod(generateMatchFileWithNewPathId)
288