13
|
1 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
2 # http://www.inra.fr
|
|
3 # http://urgi.versailles.inra.fr
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30
|
|
31
|
|
32 import os
|
|
33 import sys
|
|
34 import copy
|
|
35 from commons.core.coord.Map import Map
|
|
36 from commons.core.coord.Path import Path
|
|
37 from commons.core.coord.Align import Align
|
|
38 from commons.core.coord.Range import Range
|
|
39 from commons.core.coord.SetUtils import SetUtils
|
|
40 from commons.core.coord.AlignUtils import AlignUtils
|
|
41 from commons.core.checker.RepetException import RepetDataException
|
|
42
|
|
43 ## Static methods for the manipulation of Path instances
|
|
44 #
|
|
45 class PathUtils ( object ):
|
|
46
|
|
47 ## Change the identifier of each Set instance in the given list
|
|
48 #
|
|
49 # @param lPaths list of Path instances
|
|
50 # @param newId new identifier
|
|
51 #
|
|
52 @staticmethod
|
|
53 def changeIdInList(lPaths, newId):
|
|
54 for iPath in lPaths:
|
|
55 iPath.id = newId
|
|
56
|
|
57
|
|
58 ## Return a list of Set instances containing the query range from a list of Path instances
|
|
59 #
|
|
60 # @param lPaths a list of Path instances
|
|
61 #
|
|
62 @staticmethod
|
|
63 def getSetListFromQueries(lPaths):
|
|
64 lSets = []
|
|
65 for iPath in lPaths:
|
|
66 lSets.append( iPath.getSubjectAsSetOfQuery() )
|
|
67 return lSets
|
|
68
|
|
69
|
|
70 ## Return a list of Set instances containing the subject range from a list of Path instances
|
|
71 #
|
|
72 # @param lPaths a list of Path instances
|
|
73 #
|
|
74 @staticmethod
|
|
75 def getSetListFromSubjects(lPaths):
|
|
76 lSets = []
|
|
77 for iPath in lPaths:
|
|
78 lSets.append( iPath.getQuerySetOfSubject() )
|
|
79 return lSets
|
|
80
|
|
81
|
|
82 ## Return a sorted list of Range instances containing the subjects from a list of Path instances
|
|
83 #
|
|
84 # @param lPaths a list of Path instances
|
|
85 # @note meaningful only if all Path instances have same identifier
|
|
86 #
|
|
87 @staticmethod
|
|
88 def getRangeListFromSubjects( lPaths ):
|
|
89 lRanges = []
|
|
90 for iPath in lPaths:
|
|
91 lRanges.append( iPath.range_subject )
|
|
92 if lRanges[0].isOnDirectStrand():
|
|
93 return sorted( lRanges, key=lambda iRange: ( iRange.getMin(), iRange.getMax() ) )
|
|
94 else:
|
|
95 return sorted( lRanges, key=lambda iRange: ( iRange.getMax(), iRange.getMin() ) )
|
|
96
|
|
97
|
|
98 ## Return a tuple with min and max of query coordinates from Path instances in the given list
|
|
99 #
|
|
100 # @param lPaths a list of Path instances
|
|
101 #
|
|
102 @staticmethod
|
|
103 def getQueryMinMaxFromPathList(lPaths):
|
|
104 qmin = -1
|
|
105 qmax = -1
|
|
106 for iPath in lPaths:
|
|
107 if qmin == -1:
|
|
108 qmin = iPath.range_query.start
|
|
109 qmin = min(qmin, iPath.range_query.getMin())
|
|
110 qmax = max(qmax, iPath.range_query.getMax())
|
|
111 return (qmin, qmax)
|
|
112
|
|
113
|
|
114 ## Return a tuple with min and max of subject coordinates from Path instances in the given list
|
|
115 #
|
|
116 # @param lPaths lists of Path instances
|
|
117 #
|
|
118 @staticmethod
|
|
119 def getSubjectMinMaxFromPathList(lPaths):
|
|
120 smin = -1
|
|
121 smax = -1
|
|
122 for iPath in lPaths:
|
|
123 if smin == -1:
|
|
124 smin = iPath.range_subject.start
|
|
125 smin = min(smin, iPath.range_subject.getMin())
|
|
126 smax = max(smax, iPath.range_subject.getMax())
|
|
127 return (smin, smax)
|
|
128
|
|
129
|
|
130 ## Returns a Path objects list where Paths query coordinates overlapping with
|
|
131 # any Path in a list are removed.
|
|
132 #
|
|
133 # WARNING: input Path lists are modified (sort)
|
|
134 #
|
|
135 # @param lRefPaths list of paths to check overlaps
|
|
136 # @param lPathsToClean list of paths to remove overlapping Paths on query coordinates
|
|
137 # @return path list
|
|
138 @staticmethod
|
|
139 def removeOverlappingPathsOnQueriesBetweenPathLists(lRefPaths, lPathsToClean):
|
|
140 if not lRefPaths:
|
|
141 print "WARNING: empty reference Paths list"
|
|
142 return lPathsToClean
|
|
143
|
|
144 lRefQueries = PathUtils.getListOfDistinctQueryNames(lRefPaths)
|
|
145 lToCleanQueries = PathUtils.getListOfDistinctQueryNames(lPathsToClean)
|
|
146
|
|
147 lCommonQueries = list(set(lRefQueries) & set(lToCleanQueries))
|
|
148 lCommonQueries.sort()
|
|
149 lSpecificToCleanQueries = list(set(lToCleanQueries) - set(lCommonQueries))
|
|
150 lSpecificToCleanQueries.sort()
|
|
151
|
|
152 lRefPaths.sort(key=lambda iPath: (iPath.getQueryName(), iPath.getIdentifier(), iPath.getQueryMin(), iPath.getQueryMax()))
|
|
153 lPathsToClean.sort(key=lambda iPath: (iPath.getQueryName(), iPath.getIdentifier(), iPath.getQueryMin(), iPath.getQueryMax()))
|
|
154
|
|
155 lCleanedPaths = []
|
|
156 lSpecificToCleanQueries = list(set(lToCleanQueries) - set(lCommonQueries))
|
|
157 lCleanedPaths.extend(PathUtils.extractPathsFromQueryNameList(lPathsToClean, lSpecificToCleanQueries))
|
|
158
|
|
159 dRefQueryToPathList = PathUtils.getDictOfListsWithQueryNameAsKey(lRefPaths)
|
|
160 dToCleanQueryToPathList = PathUtils.getDictOfListsWithQueryNameAsKey(lPathsToClean)
|
|
161
|
|
162 for queryName in lCommonQueries:
|
|
163
|
|
164 refQueryHash = PathUtils.getDictOfListsWithIdAsKey(dRefQueryToPathList[queryName])
|
|
165 toCleanQueryHash = PathUtils.getDictOfListsWithIdAsKey(dToCleanQueryToPathList[queryName])
|
|
166
|
|
167 for lCleanPathById in toCleanQueryHash.values():
|
|
168 isOverlapping = False
|
|
169
|
|
170 for lRefPathById in refQueryHash.values():
|
|
171 if PathUtils.areQueriesOverlappingBetweenPathLists(lRefPathById, lCleanPathById, areListsAlreadySort = True):
|
|
172 isOverlapping = True
|
|
173 break
|
|
174
|
|
175 if not isOverlapping:
|
|
176 lCleanedPaths.extend(lCleanPathById)
|
|
177
|
|
178 return lCleanedPaths
|
|
179
|
|
180
|
|
181 ## Return True if the query range of any Path instance from the first list overlaps with the query range of any Path instance from the second list
|
|
182 #
|
|
183 # @param lPaths1: list of Path instances
|
|
184 # @param lPaths2: list of Path instances
|
|
185 # @return boolean
|
|
186 #
|
|
187 @staticmethod
|
|
188 def areQueriesOverlappingBetweenPathLists( lPaths1, lPaths2, areListsAlreadySort = False):
|
|
189 if not areListsAlreadySort:
|
|
190 lSortedPaths1 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths1 )
|
|
191 lSortedPaths2 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths2 )
|
|
192 else:
|
|
193 lSortedPaths1 = lPaths1
|
|
194 lSortedPaths2 = lPaths2
|
|
195 i = 0
|
|
196 j = 0
|
|
197 while i != len(lSortedPaths1):
|
|
198 j = 0
|
|
199 while j != len(lSortedPaths2):
|
|
200 if not lSortedPaths1[i].range_query.isOverlapping( lSortedPaths2[j].range_query ):
|
|
201 j += 1
|
|
202 else:
|
|
203 return True
|
|
204 i += 1
|
|
205 return False
|
|
206
|
|
207
|
|
208 ## Show Path instances contained in the given list
|
|
209 #
|
|
210 # @param lPaths a list of Path instances
|
|
211 #
|
|
212 @staticmethod
|
|
213 def showList(lPaths):
|
|
214 for iPath in lPaths:
|
|
215 iPath.show()
|
|
216
|
|
217
|
|
218 ## Write Path instances contained in the given list
|
|
219 #
|
|
220 # @param lPaths a list of Path instances
|
|
221 # @param fileName name of the file to write the Path instances
|
|
222 # @param mode the open mode of the file ""w"" or ""a""
|
|
223 #
|
|
224 @staticmethod
|
|
225 def writeListInFile(lPaths, fileName, mode="w"):
|
|
226 AlignUtils.writeListInFile(lPaths, fileName, mode)
|
|
227
|
|
228
|
|
229 ## Return new list of Path instances with no duplicate
|
|
230 #
|
|
231 # @param lPaths a list of Path instances
|
|
232 # @param useOnlyCoord boolean if True, check only coordinates and sequence names
|
|
233 # @return lUniqPaths a path instances list
|
|
234 #
|
|
235 @staticmethod
|
|
236 def getPathListWithoutDuplicates(lPaths, useOnlyCoord = False):
|
|
237 if len(lPaths) < 2:
|
|
238 return lPaths
|
|
239 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
|
|
240 lUniqPaths = [ lSortedPaths[0] ]
|
|
241 if useOnlyCoord:
|
|
242 for iPath in lSortedPaths[1:]:
|
|
243 if iPath.range_query.start != lUniqPaths[-1].range_query.start \
|
|
244 or iPath.range_query.end != lUniqPaths[-1].range_query.end \
|
|
245 or iPath.range_query.seqname != lUniqPaths[-1].range_query.seqname \
|
|
246 or iPath.range_subject.start != lUniqPaths[-1].range_subject.start \
|
|
247 or iPath.range_subject.end != lUniqPaths[-1].range_subject.end \
|
|
248 or iPath.range_subject.seqname != lUniqPaths[-1].range_subject.seqname:
|
|
249 lUniqPaths.append( iPath )
|
|
250 else:
|
|
251 for iPath in lSortedPaths[1:]:
|
|
252 if iPath != lUniqPaths[-1]:
|
|
253 lUniqPaths.append( iPath )
|
|
254 return lUniqPaths
|
|
255
|
|
256
|
|
257 @staticmethod
|
|
258 def getPathListWithoutDuplicatesOnQueryCoord(lPaths):
|
|
259 if len(lPaths) < 2:
|
|
260 return lPaths
|
|
261 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
|
|
262 lUniqPaths = [ lSortedPaths[0] ]
|
|
263 for iPath in lSortedPaths[1:]:
|
|
264 if iPath.range_query.start != lUniqPaths[-1].range_query.start \
|
|
265 or iPath.range_query.end != lUniqPaths[-1].range_query.end \
|
|
266 or iPath.range_query.seqname != lUniqPaths[-1].range_query.seqname:
|
|
267 lUniqPaths.append( iPath )
|
|
268 return lUniqPaths
|
|
269
|
|
270
|
|
271 ## Split a Path list in several Path lists according to the identifier
|
|
272 #
|
|
273 # @param lPaths a list of Path instances
|
|
274 # @return a dictionary which keys are identifiers and values Path lists
|
|
275 #
|
|
276 @staticmethod
|
|
277 def getDictOfListsWithIdAsKey(lPaths):
|
|
278 dId2PathList = dict((ident, []) for ident in PathUtils.getListOfDistinctIdentifiers(lPaths))
|
|
279 for iPath in lPaths:
|
|
280 dId2PathList[iPath.id].append(iPath)
|
|
281 return dId2PathList
|
|
282
|
|
283
|
|
284 ## Split a Path list in several Path lists according to the query name
|
|
285 #
|
|
286 # @param lPaths a list of Path instances
|
|
287 # @return a dictionary which keys are query_names and values Path lists
|
|
288 #
|
|
289 @staticmethod
|
|
290 def getDictOfListsWithQueryNameAsKey(lPaths):
|
|
291 dId2PathList = dict((qn, []) for qn in PathUtils.getListOfDistinctQueryNames(lPaths))
|
|
292 for iPath in lPaths:
|
|
293 dId2PathList[iPath.getQueryName()].append(iPath)
|
|
294 return dId2PathList
|
|
295
|
|
296
|
|
297 ## Split a Path file in several Path lists according to the identifier
|
|
298 #
|
|
299 # @param pathFile name of the input Path file
|
|
300 # @return a dictionary which keys are identifiers and values Path lists
|
|
301 #
|
|
302 @staticmethod
|
|
303 def getDictOfListsWithIdAsKeyFromFile( pathFile ):
|
|
304 dId2PathList = {}
|
|
305 pathFileHandler = open(pathFile, "r")
|
|
306 for line in pathFileHandler:
|
|
307 iPath = Path()
|
|
308 iPath.setFromString(line)
|
|
309 if dId2PathList.has_key(iPath.id):
|
|
310 dId2PathList[ iPath.id ].append(iPath)
|
|
311 else:
|
|
312 dId2PathList[ iPath.id ] = [ iPath ]
|
|
313 pathFileHandler.close()
|
|
314 return dId2PathList
|
|
315
|
|
316
|
|
317 ## Return a list of Path list(s) obtained while splitting a list of connected Path instances according to another based on query coordinates
|
|
318 # Only the path instance of lToKeep between path instance of lToUnjoin are used to split lToUnjoin
|
|
319 # @param lToKeep: a list of Path instances to keep (reference)
|
|
320 # @param lToUnjoin: a list of Path instances to unjoin
|
|
321 # @return: list of Path list(s) (can be empty if one of the input lists is empty)
|
|
322 # @warning: all the path instances in a given list MUST be connected (i.e. same identifier)
|
|
323 # @warning: if the path instances in a given list overlap neither within each other nor with the Path instances of the other list, these path instances are not used to split the lToUnjoin
|
|
324 #
|
|
325 @staticmethod
|
|
326 def getPathListUnjoinedBasedOnQuery( lToKeep, lToUnjoin ):
|
|
327 lSortedToKeep = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToKeep )
|
|
328 length_lSortedToKeep = len(lSortedToKeep)
|
|
329 # PathUtils.showList(lSortedToKeep)
|
|
330 lSortedToUnjoin = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToUnjoin )
|
|
331 # PathUtils.showList(lSortedToUnjoin)
|
|
332 length_lSortedToUnjoin = len(lSortedToUnjoin)
|
|
333 if lToUnjoin == []:
|
|
334 return []
|
|
335 if lToKeep == []:
|
|
336 return [ lToUnjoin ]
|
|
337
|
|
338 lLists = []
|
|
339 k = 0
|
|
340 while k < length_lSortedToKeep:
|
|
341 j1 = 0
|
|
342 while j1 < length_lSortedToUnjoin and lSortedToKeep[k].range_query.getMin() > lSortedToUnjoin[j1].range_query.getMax():
|
|
343 j1 += 1
|
|
344 if j1 == length_lSortedToUnjoin:
|
|
345 break
|
|
346 if j1 != 0:
|
|
347 lLists.append( lSortedToUnjoin[:j1] )
|
|
348 del lSortedToUnjoin[:j1]
|
|
349 j1 = 0
|
|
350 if k+1 == len(lSortedToKeep):
|
|
351 break
|
|
352 j2 = j1
|
|
353 minQueryOf_lSortedToKeepKplus1 = lSortedToKeep[k+1].range_query.getMin()
|
|
354 maxQueryOf_lSortedToUnjoinJ2 = lSortedToUnjoin[j2].range_query.getMax()
|
|
355 if j2 < length_lSortedToUnjoin and minQueryOf_lSortedToKeepKplus1 > maxQueryOf_lSortedToUnjoinJ2:
|
|
356 while j2 < len(lSortedToUnjoin) and minQueryOf_lSortedToKeepKplus1 > maxQueryOf_lSortedToUnjoinJ2:
|
|
357 j2 += 1
|
|
358 maxQueryOf_lSortedToUnjoinJ2 = lSortedToUnjoin[j2].range_query.getMax()
|
|
359 lLists.append( lSortedToUnjoin[j1:j2] )
|
|
360 del lSortedToUnjoin[j1:j2]
|
|
361 k += 1
|
|
362
|
|
363 if lLists != [] or k == 0:
|
|
364 lLists.append( lSortedToUnjoin )
|
|
365 else:
|
|
366 lLists = lSortedToUnjoin
|
|
367
|
|
368 return lLists
|
|
369
|
|
370
|
|
371 ## Return the identity of the Path list, the identity of each instance being weighted by the length of each query range
|
|
372 # All Paths should have the same query and subject.
|
|
373 # The Paths are merged using query coordinates only.
|
|
374 #
|
|
375 # @param lPaths list of Path instances
|
|
376 #
|
|
377 @staticmethod
|
|
378 def getIdentityFromPathList( lPaths, checkSubjects=True ):
|
|
379 if len( PathUtils.getListOfDistinctQueryNames( lPaths ) ) > 1:
|
|
380 msg = "ERROR: try to compute identity from Paths with different queries"
|
|
381 sys.stderr.write( "%s\n" % msg )
|
|
382 sys.stderr.flush()
|
|
383 raise Exception
|
|
384 if checkSubjects and len( PathUtils.getListOfDistinctSubjectNames( lPaths ) ) > 1:
|
|
385 msg = "ERROR: try to compute identity from Paths with different subjects"
|
|
386 sys.stderr.write( "%s\n" % msg )
|
|
387 sys.stderr.flush()
|
|
388 raise Exception
|
|
389 identity = 0
|
|
390 lMergedPaths = PathUtils.mergePathsInListUsingQueryCoordsOnly( lPaths )
|
|
391 lQuerySets = PathUtils.getSetListFromQueries( lMergedPaths )
|
|
392 lMergedQuerySets = SetUtils.mergeSetsInList( lQuerySets )
|
|
393 totalLengthOnQry = SetUtils.getCumulLength( lMergedQuerySets )
|
|
394 for iPath in lMergedPaths:
|
|
395 identity += iPath.identity * iPath.getLengthOnQuery()
|
|
396 weightedIdentity = identity / float(totalLengthOnQry)
|
|
397 if weightedIdentity < 0:
|
|
398 msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
|
|
399 sys.stderr.write("%s\n" % msg)
|
|
400 sys.stderr.flush()
|
|
401 raise Exception
|
|
402 elif weightedIdentity > 100:
|
|
403 msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
|
|
404 sys.stderr.write("%s\n" % msg)
|
|
405 sys.stderr.flush()
|
|
406 raise RepetDataException(msg)
|
|
407 return weightedIdentity
|
|
408
|
|
409
|
|
410 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, and finally their initial order.
|
|
411 #
|
|
412 # @param lPaths list of Path instances
|
|
413 #
|
|
414 @staticmethod
|
|
415 def getPathListSortedByIncreasingMinQueryThenMaxQuery(lPaths):
|
|
416 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax() ) )
|
|
417
|
|
418
|
|
419 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then their identifier, and finally their initial order.
|
|
420 #
|
|
421 # @param lPaths list of Path instances
|
|
422 #
|
|
423 @staticmethod
|
|
424 def getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths):
|
|
425 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax(), iPath.getIdentifier() ) )
|
|
426
|
|
427
|
|
428 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then the min of the subject, then the max of the subject and finally their initial order.
|
|
429 #
|
|
430 # @param lPaths list of Path instances
|
|
431 #
|
|
432 @staticmethod
|
|
433 def getPathListSortedByIncreasingMinQueryThenMaxQueryThenMinSubjectThenMaxSubject(lPaths):
|
|
434 return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax(), iPath.getSubjectMin(), iPath.getSubjectMax()))
|
|
435
|
|
436
|
|
437 ## Return a list of Path instances sorted in increasing order according to the min, then the inverse of the query length, and finally their initial order
|
|
438 #
|
|
439 # @param lPaths: list of Path instances
|
|
440 #
|
|
441 @staticmethod
|
|
442 def getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ):
|
|
443 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), 1 / float(iPath.getLengthOnQuery()) ) )
|
|
444
|
|
445
|
|
446 ## Return a list of the distinct identifiers
|
|
447 #
|
|
448 # @param lPaths list of Path instances
|
|
449 #
|
|
450 @staticmethod
|
|
451 def getListOfDistinctIdentifiers( lPaths ):
|
|
452 sDistinctIdentifiers = set([iPath.id for iPath in lPaths])
|
|
453 return list(sDistinctIdentifiers)
|
|
454
|
|
455
|
|
456 ## Return a list of the distinct query names present in the collection
|
|
457 #
|
|
458 # @param lPaths list of Path instances
|
|
459 #
|
|
460 @staticmethod
|
|
461 def getListOfDistinctQueryNames( lPaths ):
|
|
462 sDistinctQueryNames = set([iPath.range_query.seqname for iPath in lPaths])
|
|
463 return list(sDistinctQueryNames)
|
|
464
|
|
465
|
|
466 ## Return a list of the distinct subject names present in the collection
|
|
467 #
|
|
468 # @param lPaths list of Path instances
|
|
469 #
|
|
470 @staticmethod
|
|
471 def getListOfDistinctSubjectNames( lPaths ):
|
|
472 sDistinctSubjectNames = set([iPath.range_subject.seqname for iPath in lPaths])
|
|
473 return list(sDistinctSubjectNames)
|
|
474
|
|
475
|
|
476 ## Return a list of paths with matching query names
|
|
477 #
|
|
478 # @param lPaths list of Path instances
|
|
479 # @param queryName query name to extract
|
|
480 @staticmethod
|
|
481 def extractPathsFromQueryName(lPaths, queryName):
|
|
482 return [iPath for iPath in lPaths if iPath.getQueryName() == queryName]
|
|
483
|
|
484
|
|
485 ## Return a list of paths with matching query names
|
|
486 #
|
|
487 # @param lPaths list of Path instances
|
|
488 # @param lQueryName query name list to extract
|
|
489 @staticmethod
|
|
490 def extractPathsFromQueryNameList(lPaths, lQueryNames):
|
|
491 d = dict.fromkeys(lQueryNames)
|
|
492 return [iPath for iPath in lPaths if iPath.getQueryName() in d]
|
|
493
|
|
494
|
|
495 ## Return a list of paths with matching subject names
|
|
496 #
|
|
497 # @param lPaths list of Path instances
|
|
498 # @param subjectName subject name to extract
|
|
499 @staticmethod
|
|
500 def extractPathsFromSubjectName(lPaths, subjectName):
|
|
501 return [iPath for iPath in lPaths if iPath.getSubjectName() == subjectName]
|
|
502
|
|
503
|
|
504 ## Return a list of paths with coordinates overlap a given range
|
|
505 #
|
|
506 # @param lPaths list of Path instances
|
|
507 # @param queryName query name to extract
|
|
508 # @param start starting position
|
|
509 # @param end ending position
|
|
510 # @return list of Path instance
|
|
511 @staticmethod
|
|
512 def extractPathsFromQueryCoord(lPaths, queryName, start, end):
|
|
513 lExtractedPaths = []
|
|
514 iAlign = Align(range_q = Range(queryName, start, end))
|
|
515
|
|
516 for path in PathUtils.extractPathsFromQueryName(lPaths, queryName):
|
|
517 if path.isQueryOverlapping(iAlign):
|
|
518 lExtractedPaths.append(path)
|
|
519
|
|
520 return lExtractedPaths
|
|
521
|
|
522
|
|
523 ## Return a list of lists containing query coordinates of the connections sorted in increasing order.
|
|
524 #
|
|
525 # @param lConnectedPaths: list of Path instances having the same identifier
|
|
526 # @param minLength: threshold below which connections are not reported (default= 0 bp)
|
|
527 # @note: return only connections longer than threshold
|
|
528 # @note: if coordinate on query ends at 100, return 101
|
|
529 # @warning: Path instances MUST be sorted in increasing order according to query coordinates
|
|
530 # @warning: Path instances MUST be on direct query strand (and maybe on reverse subject strand)
|
|
531 #
|
|
532 @staticmethod
|
|
533 def getListOfJoinCoordinatesOnQuery(lConnectedPaths, minLength=0):
|
|
534 lJoinCoordinates = []
|
|
535 for i in xrange(1,len(lConnectedPaths)):
|
|
536 startJoin = lConnectedPaths[i-1].range_query.end
|
|
537 endJoin = lConnectedPaths[i].range_query.start
|
|
538 if endJoin - startJoin + 1 > minLength:
|
|
539 lJoinCoordinates.append( [ startJoin + 1, endJoin - 1 ] )
|
|
540 return lJoinCoordinates
|
|
541
|
|
542
|
|
543 ## Return the length on the query of all Path instance in the given list
|
|
544 #
|
|
545 # @param lPaths list of Path instances
|
|
546 # @note overlapping ranges are not summed but truncated.
|
|
547 #
|
|
548 @staticmethod
|
|
549 def getLengthOnQueryFromPathList( lPaths ):
|
|
550 lSets = PathUtils.getSetListFromQueries( lPaths )
|
|
551 lMergedSets = SetUtils.mergeSetsInList( lSets )
|
|
552 length = SetUtils.getCumulLength( lMergedSets )
|
|
553 return length
|
|
554
|
|
555
|
|
556 ## Convert a Path file into an Align file
|
|
557 #
|
|
558 # @param pathFile: name of the input Path file
|
|
559 # @param alignFile: name of the output Align file
|
|
560 #
|
|
561 @staticmethod
|
|
562 def convertPathFileIntoAlignFile(pathFile, alignFile):
|
|
563 pathFileHandler = open(pathFile, "r")
|
|
564 alignFileHandler = open(alignFile, "w")
|
|
565 iPath = Path()
|
|
566 for line in pathFileHandler:
|
|
567 iPath.setFromString(line)
|
|
568 iAlign = iPath.getAlignInstance()
|
|
569 iAlign.write(alignFileHandler)
|
|
570 pathFileHandler.close()
|
|
571 alignFileHandler.close()
|
|
572
|
|
573
|
|
574 #TODO: duplicated method => to rename with the name of the next method (which is called) ?
|
|
575 ## Convert a Path File into a Map file with query coordinates only
|
|
576 #
|
|
577 # @param pathFile: name of the input Path file
|
|
578 # @param mapFile: name of the output Map file
|
|
579 #
|
|
580 @staticmethod
|
|
581 def convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile ):
|
|
582 pathFileHandler = open(pathFile, "r")
|
|
583 mapFileHandler = open(mapFile, "w")
|
|
584 p = Path()
|
|
585 for line in pathFileHandler:
|
|
586 p.reset()
|
|
587 p.setFromTuple(line.split("\t"))
|
|
588 p.writeSubjectAsMapOfQuery(mapFileHandler)
|
|
589 pathFileHandler.close()
|
|
590 mapFileHandler.close()
|
|
591
|
|
592
|
|
593 ## for each line of a given Path file, write the coordinates of the subject on the query as one line in a Map file
|
|
594 #
|
|
595 # @param pathFile: name of the input Path file
|
|
596 # @param mapFile: name of the output Map file
|
|
597 #
|
|
598 @staticmethod
|
|
599 def convertPathFileIntoMapFileWithSubjectsOnQueries( pathFile, mapFile ):
|
|
600 PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile )
|
|
601
|
|
602
|
|
603 ## Merge matches on queries
|
|
604 #
|
|
605 # @param inFile: name of the input Path file
|
|
606 # @param outFile: name of the output Path file
|
|
607 #
|
|
608 @staticmethod
|
|
609 def mergeMatchesOnQueries(inFile, outFile):
|
|
610 mapFile = "%s.map" % inFile
|
|
611 PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly(inFile, mapFile)
|
|
612 cmd = "mapOp"
|
|
613 cmd += " -q %s" % mapFile
|
|
614 cmd += " -m"
|
|
615 cmd += " 2>&1 > /dev/null"
|
|
616 exitStatus = os.system(cmd)
|
|
617 if exitStatus != 0:
|
|
618 print "ERROR: mapOp returned %i" % exitStatus
|
|
619 sys.exit(1)
|
|
620 os.remove(mapFile)
|
|
621 mergeFile = "%s.merge" % mapFile
|
|
622 mergeFileHandler = open(mergeFile, "r")
|
|
623 outFileHandler = open(outFile, "w")
|
|
624 m = Map()
|
|
625 for line in mergeFileHandler:
|
|
626 m.reset()
|
|
627 m.setFromString(line, "\t")
|
|
628 m.writeAsQueryOfPath(outFileHandler)
|
|
629 mergeFileHandler.close()
|
|
630 os.remove(mergeFile)
|
|
631 outFileHandler.close()
|
|
632
|
|
633
|
|
634 ## Filter chains of Path(s) which length is below a given threshold
|
|
635 #
|
|
636 # @param lPaths: list of Path instances
|
|
637 # @param minLengthChain: minimum length of a chain to be kept
|
|
638 # @note: a chain may contain a single Path instance
|
|
639 # @return: a list of Path instances
|
|
640 #
|
|
641 @staticmethod
|
|
642 def filterPathListOnChainLength( lPaths, minLengthChain ):
|
|
643 lFilteredPaths = []
|
|
644 dPathnum2Paths = PathUtils.getDictOfListsWithIdAsKey( lPaths )
|
|
645 for pathnum in dPathnum2Paths.keys():
|
|
646 length = PathUtils.getLengthOnQueryFromPathList( dPathnum2Paths[ pathnum ] )
|
|
647 if length >= minLengthChain:
|
|
648 lFilteredPaths += dPathnum2Paths[ pathnum ]
|
|
649 return lFilteredPaths
|
|
650
|
|
651
|
|
652 ## Return a Path list from a Path file
|
|
653 #
|
|
654 # @param pathFile string name of a Path file
|
|
655 # @return a list of Path instances
|
|
656 #
|
|
657 @staticmethod
|
|
658 def getPathListFromFile( pathFile ):
|
|
659 lPaths = []
|
|
660 with open(pathFile, "r") as pathFileHandler:
|
|
661 for line in pathFileHandler:
|
|
662 iPath = Path()
|
|
663 iPath.setFromString(line)
|
|
664 lPaths.append(iPath)
|
|
665 return lPaths
|
|
666
|
|
667
|
|
668 ## Convert a chain into a 'pathrange'
|
|
669 #
|
|
670 # @param lPaths a list of Path instances with the same identifier
|
|
671 # @note: the min and max of each Path is used
|
|
672 #
|
|
673 @staticmethod
|
|
674 def convertPathListToPathrange( lPaths ):
|
|
675 if len(lPaths) == 0:
|
|
676 return
|
|
677 if len(lPaths) == 1:
|
|
678 return lPaths[0]
|
|
679 iPathrange = copy.deepcopy( lPaths[0] )
|
|
680 iPathrange.identity = lPaths[0].identity * lPaths[0].getLengthOnQuery()
|
|
681 cumulQueryLength = iPathrange.getLengthOnQuery()
|
|
682 for iPath in lPaths[1:]:
|
|
683 if iPath.id != iPathrange.id:
|
|
684 msg = "ERROR: two Path instances in the chain have different identifiers"
|
|
685 sys.stderr.write( "%s\n" % ( msg ) )
|
|
686 sys.exit(1)
|
|
687 if iPathrange.range_subject.isOnDirectStrand() != iPath.range_subject.isOnDirectStrand():
|
|
688 msg = "ERROR: two Path instances in the chain are on different strands"
|
|
689 sys.stderr.write( "%s\n" % ( msg ) )
|
|
690 sys.exit(1)
|
|
691 iPathrange.range_query.start = min( iPathrange.range_query.start, iPath.range_query.start )
|
|
692 iPathrange.range_query.end = max( iPathrange.range_query.end, iPath.range_query.end )
|
|
693 if iPathrange.range_subject.isOnDirectStrand():
|
|
694 iPathrange.range_subject.start = min( iPathrange.range_subject.start, iPath.range_subject.start )
|
|
695 iPathrange.range_subject.end = max( iPathrange.range_subject.end, iPath.range_subject.end )
|
|
696 else:
|
|
697 iPathrange.range_subject.start = max( iPathrange.range_subject.start, iPath.range_subject.start )
|
|
698 iPathrange.range_subject.end = min( iPathrange.range_subject.end, iPath.range_subject.end )
|
|
699 iPathrange.e_value = min( iPathrange.e_value, iPath.e_value )
|
|
700 iPathrange.score += iPath.score
|
|
701 iPathrange.identity += iPath.identity * iPath.getLengthOnQuery()
|
|
702 cumulQueryLength += iPath.getLengthOnQuery()
|
|
703 iPathrange.identity = iPathrange.identity / float(cumulQueryLength)
|
|
704 return iPathrange
|
|
705
|
|
706
|
|
707 ## Convert a Path file into an Align file via 'pathrange'
|
|
708 #
|
|
709 # @param pathFile: name of the input Path file
|
|
710 # @param alignFile: name of the output Align file
|
|
711 # @param verbose integer verbosity level
|
|
712 # @note: the min and max of each Path is used
|
|
713 #
|
|
714 @staticmethod
|
|
715 def convertPathFileIntoAlignFileViaPathrange( pathFile, alignFile, verbose=0 ):
|
|
716 lPaths = PathUtils.getPathListFromFile( pathFile )
|
|
717 dId2PathList = PathUtils.getDictOfListsWithIdAsKey( lPaths )
|
|
718 lIds = dId2PathList.keys()
|
|
719 lIds.sort()
|
|
720 if verbose > 0:
|
|
721 msg = "number of chains: %i" % ( len(lIds) )
|
|
722 sys.stdout.write( "%s\n" % ( msg ) )
|
|
723 sys.stdout.flush()
|
|
724 alignFileHandler = open( alignFile, "w" )
|
|
725 for identifier in lIds:
|
|
726 iPath = PathUtils.convertPathListToPathrange( dId2PathList[ identifier ] )
|
|
727 iAlign = iPath.getAlignInstance()
|
|
728 iAlign.write( alignFileHandler )
|
|
729 alignFileHandler.close()
|
|
730
|
|
731
|
|
732 ## Split a list of Path instances according to the name of the query
|
|
733 #
|
|
734 # @param lInPaths list of align instances
|
|
735 # @return lOutPathLists list of align instances lists
|
|
736 #
|
|
737 @staticmethod
|
|
738 def splitPathListByQueryName( lInPaths ):
|
|
739 lInSortedPaths = sorted( lInPaths, key=lambda o: o.range_query.seqname )
|
|
740 lOutPathLists = []
|
|
741 if len(lInSortedPaths) != 0 :
|
|
742 lPathsForCurrentQuery = []
|
|
743 previousQuery = lInSortedPaths[0].range_query.seqname
|
|
744 for iPath in lInSortedPaths :
|
|
745 currentQuery = iPath.range_query.seqname
|
|
746 if previousQuery != currentQuery :
|
|
747 lOutPathLists.append( lPathsForCurrentQuery )
|
|
748 previousQuery = currentQuery
|
|
749 lPathsForCurrentQuery = []
|
|
750 lPathsForCurrentQuery.append( iPath )
|
|
751
|
|
752 lOutPathLists.append(lPathsForCurrentQuery)
|
|
753
|
|
754 return lOutPathLists
|
|
755
|
|
756
|
|
757 ## Create an Path file from each list of Path instances in the input list
|
|
758 #
|
|
759 # @param lPathList list of lists with Path instances
|
|
760 # @param pattern string
|
|
761 # @param dirName string
|
|
762 #
|
|
763 @staticmethod
|
|
764 def createPathFiles( lPathList, pattern, dirName="" ):
|
|
765 nbFiles = len(lPathList)
|
|
766 countFile = 1
|
|
767 if dirName != "" :
|
|
768 if dirName[-1] != "/":
|
|
769 dirName = dirName + '/'
|
|
770 os.mkdir( dirName )
|
|
771
|
|
772 for lPath in lPathList:
|
|
773 fileName = dirName + pattern + "_%s.path" % ( str(countFile).zfill( len(str(nbFiles)) ) )
|
|
774 PathUtils.writeListInFile( lPath, fileName )
|
|
775 countFile += 1
|
|
776
|
|
777
|
|
778 ## Merge all overlapping Path instances in a list without considering the identifiers
|
|
779 # Start by sorting the Path instances by their increasing min coordinate
|
|
780 #
|
|
781 # @return: a new list with the merged Path instances
|
|
782 #
|
|
783 @staticmethod
|
|
784 def mergePathsInList( lPaths ):
|
|
785 lMergedPaths = []
|
|
786 if len(lPaths)==0:
|
|
787 return lMergedPaths
|
|
788
|
|
789 lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths )
|
|
790
|
|
791 prev_count = 0
|
|
792 for iPath in lSortedPaths[0:]:
|
|
793 if prev_count != len(lSortedPaths):
|
|
794 for i in lSortedPaths[ prev_count + 1: ]:
|
|
795 if iPath.isOverlapping( i ):
|
|
796 iPath.merge( i )
|
|
797 isAlreadyInList = False
|
|
798 for newPath in lMergedPaths:
|
|
799 if newPath.isOverlapping( iPath ):
|
|
800 isAlreadyInList = True
|
|
801 newPath.merge( iPath )
|
|
802 lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath
|
|
803 if not isAlreadyInList:
|
|
804 lMergedPaths.append( iPath )
|
|
805 prev_count += 1
|
|
806 return lMergedPaths
|
|
807
|
|
808
|
|
809 ## Merge all overlapping Path instances in a list without considering if subjects are overlapping.
|
|
810 # Start by sorting the Path instances by their increasing min coordinate.
|
|
811 #
|
|
812 # @return: a new list with the merged Path instances
|
|
813 #
|
|
814 @staticmethod
|
|
815 def mergePathsInListUsingQueryCoordsOnly( lPaths ):
|
|
816 lMergedPaths = []
|
|
817 if len(lPaths)==0:
|
|
818 return lMergedPaths
|
|
819
|
|
820 lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths )
|
|
821
|
|
822 prev_count = 0
|
|
823 for iPath in lSortedPaths[0:]:
|
|
824 if prev_count != len(lSortedPaths):
|
|
825 for i in lSortedPaths[ prev_count + 1: ]:
|
|
826 if iPath.isQueryOverlapping( i ):
|
|
827 iPath.merge( i )
|
|
828 isAlreadyInList = False
|
|
829 for newPath in lMergedPaths:
|
|
830 if newPath.isQueryOverlapping( iPath ):
|
|
831 isAlreadyInList = True
|
|
832 newPath.merge( iPath )
|
|
833 lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath
|
|
834 if not isAlreadyInList:
|
|
835 lMergedPaths.append( iPath )
|
|
836 prev_count += 1
|
|
837 return lMergedPaths
|
|
838
|
|
839
|
|
840 ## Convert a Path file into a GFF file
|
|
841 #
|
|
842 # @param pathFile: name of the input Path file
|
|
843 # @param gffFile: name of the output GFF file
|
|
844 # @param source: source to write in the GFF file (column 2)
|
|
845 #
|
|
846 # @note the 'path' query is supposed to correspond to the 'gff' first column
|
|
847 #
|
|
848 @staticmethod
|
|
849 def convertPathFileIntoGffFile( pathFile, gffFile, source="REPET", verbose=0 ):
|
|
850 dId2PathList = PathUtils.getDictOfListsWithIdAsKeyFromFile( pathFile )
|
|
851 if verbose > 0:
|
|
852 msg = "number of chains: %i" % ( len(dId2PathList.keys()) )
|
|
853 sys.stdout.write( "%s\n" % msg )
|
|
854 sys.stdout.flush()
|
|
855 gffFileHandler = open( gffFile, "w" )
|
|
856 for id in dId2PathList.keys():
|
|
857 if len( dId2PathList[ id ] ) == 1:
|
|
858 iPath = dId2PathList[ id ][0]
|
|
859 string = iPath.toStringAsGff( ID="%i" % iPath.getIdentifier(),
|
|
860 source=source )
|
|
861 gffFileHandler.write( "%s\n" % string )
|
|
862 else:
|
|
863 iPathrange = PathUtils.convertPathListToPathrange( dId2PathList[ id ] )
|
|
864 string = iPathrange.toStringAsGff( ID="ms%i" % iPathrange.getIdentifier(),
|
|
865 source=source )
|
|
866 gffFileHandler.write( "%s\n" % string )
|
|
867 count = 0
|
|
868 for iPath in dId2PathList[ id ]:
|
|
869 count += 1
|
|
870 string = iPath.toStringAsGff( type="match_part",
|
|
871 ID="mp%i-%i" % ( iPath.getIdentifier(), count ),
|
|
872 Parent="ms%i" % iPathrange.getIdentifier(),
|
|
873 source=source )
|
|
874 gffFileHandler.write( "%s\n" % string )
|
|
875 gffFileHandler.close()
|
|
876
|
|
877
|
|
878 ## Convert a Path file into a Set file
|
|
879 # replace old parser.pathrange2set
|
|
880 # @param pathFile: name of the input Path file
|
|
881 # @param setFile: name of the output Set file
|
|
882 #
|
|
883 @staticmethod
|
|
884 def convertPathFileIntoSetFile( pathFile, setFile ):
|
|
885 pathFileHandler = open(pathFile, "r")
|
|
886 setFileHandler = open(setFile, "w")
|
|
887 iPath = Path()
|
|
888 for line in pathFileHandler:
|
|
889 iPath.setFromString(line)
|
|
890 iSet = iPath.getSubjectAsSetOfQuery()
|
|
891 iSet.write(setFileHandler)
|
|
892 pathFileHandler.close()
|
|
893 setFileHandler.close()
|
|
894
|
|
895 ## Write Path File without duplicated Path (same query, same subject and same coordinate)
|
|
896 #
|
|
897 # @param inputFile: name of the input Path file
|
|
898 # @param outputFile: name of the output Path file
|
|
899 #
|
|
900 @staticmethod
|
|
901 def removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName(inputFile, outputFile):
|
|
902 f = open(inputFile, "r")
|
|
903 line = f.readline()
|
|
904 previousQuery = ""
|
|
905 previousSubject = ""
|
|
906 lPaths = []
|
|
907 while line:
|
|
908 iPath = Path()
|
|
909 iPath.setFromString(line)
|
|
910 query = iPath.getQueryName()
|
|
911 subject = iPath.getSubjectName()
|
|
912 if (query != previousQuery or subject != previousSubject) and lPaths != []:
|
|
913 lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)
|
|
914 PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")
|
|
915 lPaths = []
|
|
916 lPaths.append(iPath)
|
|
917 previousQuery = query
|
|
918 previousSubject = subject
|
|
919 line = f.readline()
|
|
920 lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)
|
|
921 PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")
|
|
922 f.close()
|