comparison TEisotools-1.1.a/commons/core/coord/PathUtils.py @ 13:feef9a0db09d draft

Uploaded
author urgi-team
date Wed, 20 Jul 2016 09:04:42 -0400
parents
children
comparison
equal deleted inserted replaced
12:22b0494ec883 13:feef9a0db09d
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import os
33 import sys
34 import copy
35 from commons.core.coord.Map import Map
36 from commons.core.coord.Path import Path
37 from commons.core.coord.Align import Align
38 from commons.core.coord.Range import Range
39 from commons.core.coord.SetUtils import SetUtils
40 from commons.core.coord.AlignUtils import AlignUtils
41 from commons.core.checker.RepetException import RepetDataException
42
43 ## Static methods for the manipulation of Path instances
44 #
45 class PathUtils ( object ):
46
47 ## Change the identifier of each Set instance in the given list
48 #
49 # @param lPaths list of Path instances
50 # @param newId new identifier
51 #
52 @staticmethod
53 def changeIdInList(lPaths, newId):
54 for iPath in lPaths:
55 iPath.id = newId
56
57
58 ## Return a list of Set instances containing the query range from a list of Path instances
59 #
60 # @param lPaths a list of Path instances
61 #
62 @staticmethod
63 def getSetListFromQueries(lPaths):
64 lSets = []
65 for iPath in lPaths:
66 lSets.append( iPath.getSubjectAsSetOfQuery() )
67 return lSets
68
69
70 ## Return a list of Set instances containing the subject range from a list of Path instances
71 #
72 # @param lPaths a list of Path instances
73 #
74 @staticmethod
75 def getSetListFromSubjects(lPaths):
76 lSets = []
77 for iPath in lPaths:
78 lSets.append( iPath.getQuerySetOfSubject() )
79 return lSets
80
81
82 ## Return a sorted list of Range instances containing the subjects from a list of Path instances
83 #
84 # @param lPaths a list of Path instances
85 # @note meaningful only if all Path instances have same identifier
86 #
87 @staticmethod
88 def getRangeListFromSubjects( lPaths ):
89 lRanges = []
90 for iPath in lPaths:
91 lRanges.append( iPath.range_subject )
92 if lRanges[0].isOnDirectStrand():
93 return sorted( lRanges, key=lambda iRange: ( iRange.getMin(), iRange.getMax() ) )
94 else:
95 return sorted( lRanges, key=lambda iRange: ( iRange.getMax(), iRange.getMin() ) )
96
97
98 ## Return a tuple with min and max of query coordinates from Path instances in the given list
99 #
100 # @param lPaths a list of Path instances
101 #
102 @staticmethod
103 def getQueryMinMaxFromPathList(lPaths):
104 qmin = -1
105 qmax = -1
106 for iPath in lPaths:
107 if qmin == -1:
108 qmin = iPath.range_query.start
109 qmin = min(qmin, iPath.range_query.getMin())
110 qmax = max(qmax, iPath.range_query.getMax())
111 return (qmin, qmax)
112
113
114 ## Return a tuple with min and max of subject coordinates from Path instances in the given list
115 #
116 # @param lPaths lists of Path instances
117 #
118 @staticmethod
119 def getSubjectMinMaxFromPathList(lPaths):
120 smin = -1
121 smax = -1
122 for iPath in lPaths:
123 if smin == -1:
124 smin = iPath.range_subject.start
125 smin = min(smin, iPath.range_subject.getMin())
126 smax = max(smax, iPath.range_subject.getMax())
127 return (smin, smax)
128
129
130 ## Returns a Path objects list where Paths query coordinates overlapping with
131 # any Path in a list are removed.
132 #
133 # WARNING: input Path lists are modified (sort)
134 #
135 # @param lRefPaths list of paths to check overlaps
136 # @param lPathsToClean list of paths to remove overlapping Paths on query coordinates
137 # @return path list
138 @staticmethod
139 def removeOverlappingPathsOnQueriesBetweenPathLists(lRefPaths, lPathsToClean):
140 if not lRefPaths:
141 print "WARNING: empty reference Paths list"
142 return lPathsToClean
143
144 lRefQueries = PathUtils.getListOfDistinctQueryNames(lRefPaths)
145 lToCleanQueries = PathUtils.getListOfDistinctQueryNames(lPathsToClean)
146
147 lCommonQueries = list(set(lRefQueries) & set(lToCleanQueries))
148 lCommonQueries.sort()
149 lSpecificToCleanQueries = list(set(lToCleanQueries) - set(lCommonQueries))
150 lSpecificToCleanQueries.sort()
151
152 lRefPaths.sort(key=lambda iPath: (iPath.getQueryName(), iPath.getIdentifier(), iPath.getQueryMin(), iPath.getQueryMax()))
153 lPathsToClean.sort(key=lambda iPath: (iPath.getQueryName(), iPath.getIdentifier(), iPath.getQueryMin(), iPath.getQueryMax()))
154
155 lCleanedPaths = []
156 lSpecificToCleanQueries = list(set(lToCleanQueries) - set(lCommonQueries))
157 lCleanedPaths.extend(PathUtils.extractPathsFromQueryNameList(lPathsToClean, lSpecificToCleanQueries))
158
159 dRefQueryToPathList = PathUtils.getDictOfListsWithQueryNameAsKey(lRefPaths)
160 dToCleanQueryToPathList = PathUtils.getDictOfListsWithQueryNameAsKey(lPathsToClean)
161
162 for queryName in lCommonQueries:
163
164 refQueryHash = PathUtils.getDictOfListsWithIdAsKey(dRefQueryToPathList[queryName])
165 toCleanQueryHash = PathUtils.getDictOfListsWithIdAsKey(dToCleanQueryToPathList[queryName])
166
167 for lCleanPathById in toCleanQueryHash.values():
168 isOverlapping = False
169
170 for lRefPathById in refQueryHash.values():
171 if PathUtils.areQueriesOverlappingBetweenPathLists(lRefPathById, lCleanPathById, areListsAlreadySort = True):
172 isOverlapping = True
173 break
174
175 if not isOverlapping:
176 lCleanedPaths.extend(lCleanPathById)
177
178 return lCleanedPaths
179
180
181 ## Return True if the query range of any Path instance from the first list overlaps with the query range of any Path instance from the second list
182 #
183 # @param lPaths1: list of Path instances
184 # @param lPaths2: list of Path instances
185 # @return boolean
186 #
187 @staticmethod
188 def areQueriesOverlappingBetweenPathLists( lPaths1, lPaths2, areListsAlreadySort = False):
189 if not areListsAlreadySort:
190 lSortedPaths1 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths1 )
191 lSortedPaths2 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths2 )
192 else:
193 lSortedPaths1 = lPaths1
194 lSortedPaths2 = lPaths2
195 i = 0
196 j = 0
197 while i != len(lSortedPaths1):
198 j = 0
199 while j != len(lSortedPaths2):
200 if not lSortedPaths1[i].range_query.isOverlapping( lSortedPaths2[j].range_query ):
201 j += 1
202 else:
203 return True
204 i += 1
205 return False
206
207
208 ## Show Path instances contained in the given list
209 #
210 # @param lPaths a list of Path instances
211 #
212 @staticmethod
213 def showList(lPaths):
214 for iPath in lPaths:
215 iPath.show()
216
217
218 ## Write Path instances contained in the given list
219 #
220 # @param lPaths a list of Path instances
221 # @param fileName name of the file to write the Path instances
222 # @param mode the open mode of the file ""w"" or ""a""
223 #
224 @staticmethod
225 def writeListInFile(lPaths, fileName, mode="w"):
226 AlignUtils.writeListInFile(lPaths, fileName, mode)
227
228
229 ## Return new list of Path instances with no duplicate
230 #
231 # @param lPaths a list of Path instances
232 # @param useOnlyCoord boolean if True, check only coordinates and sequence names
233 # @return lUniqPaths a path instances list
234 #
235 @staticmethod
236 def getPathListWithoutDuplicates(lPaths, useOnlyCoord = False):
237 if len(lPaths) < 2:
238 return lPaths
239 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
240 lUniqPaths = [ lSortedPaths[0] ]
241 if useOnlyCoord:
242 for iPath in lSortedPaths[1:]:
243 if iPath.range_query.start != lUniqPaths[-1].range_query.start \
244 or iPath.range_query.end != lUniqPaths[-1].range_query.end \
245 or iPath.range_query.seqname != lUniqPaths[-1].range_query.seqname \
246 or iPath.range_subject.start != lUniqPaths[-1].range_subject.start \
247 or iPath.range_subject.end != lUniqPaths[-1].range_subject.end \
248 or iPath.range_subject.seqname != lUniqPaths[-1].range_subject.seqname:
249 lUniqPaths.append( iPath )
250 else:
251 for iPath in lSortedPaths[1:]:
252 if iPath != lUniqPaths[-1]:
253 lUniqPaths.append( iPath )
254 return lUniqPaths
255
256
257 @staticmethod
258 def getPathListWithoutDuplicatesOnQueryCoord(lPaths):
259 if len(lPaths) < 2:
260 return lPaths
261 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
262 lUniqPaths = [ lSortedPaths[0] ]
263 for iPath in lSortedPaths[1:]:
264 if iPath.range_query.start != lUniqPaths[-1].range_query.start \
265 or iPath.range_query.end != lUniqPaths[-1].range_query.end \
266 or iPath.range_query.seqname != lUniqPaths[-1].range_query.seqname:
267 lUniqPaths.append( iPath )
268 return lUniqPaths
269
270
271 ## Split a Path list in several Path lists according to the identifier
272 #
273 # @param lPaths a list of Path instances
274 # @return a dictionary which keys are identifiers and values Path lists
275 #
276 @staticmethod
277 def getDictOfListsWithIdAsKey(lPaths):
278 dId2PathList = dict((ident, []) for ident in PathUtils.getListOfDistinctIdentifiers(lPaths))
279 for iPath in lPaths:
280 dId2PathList[iPath.id].append(iPath)
281 return dId2PathList
282
283
284 ## Split a Path list in several Path lists according to the query name
285 #
286 # @param lPaths a list of Path instances
287 # @return a dictionary which keys are query_names and values Path lists
288 #
289 @staticmethod
290 def getDictOfListsWithQueryNameAsKey(lPaths):
291 dId2PathList = dict((qn, []) for qn in PathUtils.getListOfDistinctQueryNames(lPaths))
292 for iPath in lPaths:
293 dId2PathList[iPath.getQueryName()].append(iPath)
294 return dId2PathList
295
296
297 ## Split a Path file in several Path lists according to the identifier
298 #
299 # @param pathFile name of the input Path file
300 # @return a dictionary which keys are identifiers and values Path lists
301 #
302 @staticmethod
303 def getDictOfListsWithIdAsKeyFromFile( pathFile ):
304 dId2PathList = {}
305 pathFileHandler = open(pathFile, "r")
306 for line in pathFileHandler:
307 iPath = Path()
308 iPath.setFromString(line)
309 if dId2PathList.has_key(iPath.id):
310 dId2PathList[ iPath.id ].append(iPath)
311 else:
312 dId2PathList[ iPath.id ] = [ iPath ]
313 pathFileHandler.close()
314 return dId2PathList
315
316
317 ## Return a list of Path list(s) obtained while splitting a list of connected Path instances according to another based on query coordinates
318 # Only the path instance of lToKeep between path instance of lToUnjoin are used to split lToUnjoin
319 # @param lToKeep: a list of Path instances to keep (reference)
320 # @param lToUnjoin: a list of Path instances to unjoin
321 # @return: list of Path list(s) (can be empty if one of the input lists is empty)
322 # @warning: all the path instances in a given list MUST be connected (i.e. same identifier)
323 # @warning: if the path instances in a given list overlap neither within each other nor with the Path instances of the other list, these path instances are not used to split the lToUnjoin
324 #
325 @staticmethod
326 def getPathListUnjoinedBasedOnQuery( lToKeep, lToUnjoin ):
327 lSortedToKeep = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToKeep )
328 length_lSortedToKeep = len(lSortedToKeep)
329 # PathUtils.showList(lSortedToKeep)
330 lSortedToUnjoin = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToUnjoin )
331 # PathUtils.showList(lSortedToUnjoin)
332 length_lSortedToUnjoin = len(lSortedToUnjoin)
333 if lToUnjoin == []:
334 return []
335 if lToKeep == []:
336 return [ lToUnjoin ]
337
338 lLists = []
339 k = 0
340 while k < length_lSortedToKeep:
341 j1 = 0
342 while j1 < length_lSortedToUnjoin and lSortedToKeep[k].range_query.getMin() > lSortedToUnjoin[j1].range_query.getMax():
343 j1 += 1
344 if j1 == length_lSortedToUnjoin:
345 break
346 if j1 != 0:
347 lLists.append( lSortedToUnjoin[:j1] )
348 del lSortedToUnjoin[:j1]
349 j1 = 0
350 if k+1 == len(lSortedToKeep):
351 break
352 j2 = j1
353 minQueryOf_lSortedToKeepKplus1 = lSortedToKeep[k+1].range_query.getMin()
354 maxQueryOf_lSortedToUnjoinJ2 = lSortedToUnjoin[j2].range_query.getMax()
355 if j2 < length_lSortedToUnjoin and minQueryOf_lSortedToKeepKplus1 > maxQueryOf_lSortedToUnjoinJ2:
356 while j2 < len(lSortedToUnjoin) and minQueryOf_lSortedToKeepKplus1 > maxQueryOf_lSortedToUnjoinJ2:
357 j2 += 1
358 maxQueryOf_lSortedToUnjoinJ2 = lSortedToUnjoin[j2].range_query.getMax()
359 lLists.append( lSortedToUnjoin[j1:j2] )
360 del lSortedToUnjoin[j1:j2]
361 k += 1
362
363 if lLists != [] or k == 0:
364 lLists.append( lSortedToUnjoin )
365 else:
366 lLists = lSortedToUnjoin
367
368 return lLists
369
370
371 ## Return the identity of the Path list, the identity of each instance being weighted by the length of each query range
372 # All Paths should have the same query and subject.
373 # The Paths are merged using query coordinates only.
374 #
375 # @param lPaths list of Path instances
376 #
377 @staticmethod
378 def getIdentityFromPathList( lPaths, checkSubjects=True ):
379 if len( PathUtils.getListOfDistinctQueryNames( lPaths ) ) > 1:
380 msg = "ERROR: try to compute identity from Paths with different queries"
381 sys.stderr.write( "%s\n" % msg )
382 sys.stderr.flush()
383 raise Exception
384 if checkSubjects and len( PathUtils.getListOfDistinctSubjectNames( lPaths ) ) > 1:
385 msg = "ERROR: try to compute identity from Paths with different subjects"
386 sys.stderr.write( "%s\n" % msg )
387 sys.stderr.flush()
388 raise Exception
389 identity = 0
390 lMergedPaths = PathUtils.mergePathsInListUsingQueryCoordsOnly( lPaths )
391 lQuerySets = PathUtils.getSetListFromQueries( lMergedPaths )
392 lMergedQuerySets = SetUtils.mergeSetsInList( lQuerySets )
393 totalLengthOnQry = SetUtils.getCumulLength( lMergedQuerySets )
394 for iPath in lMergedPaths:
395 identity += iPath.identity * iPath.getLengthOnQuery()
396 weightedIdentity = identity / float(totalLengthOnQry)
397 if weightedIdentity < 0:
398 msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
399 sys.stderr.write("%s\n" % msg)
400 sys.stderr.flush()
401 raise Exception
402 elif weightedIdentity > 100:
403 msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
404 sys.stderr.write("%s\n" % msg)
405 sys.stderr.flush()
406 raise RepetDataException(msg)
407 return weightedIdentity
408
409
410 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, and finally their initial order.
411 #
412 # @param lPaths list of Path instances
413 #
414 @staticmethod
415 def getPathListSortedByIncreasingMinQueryThenMaxQuery(lPaths):
416 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax() ) )
417
418
419 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then their identifier, and finally their initial order.
420 #
421 # @param lPaths list of Path instances
422 #
423 @staticmethod
424 def getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths):
425 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax(), iPath.getIdentifier() ) )
426
427
428 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then the min of the subject, then the max of the subject and finally their initial order.
429 #
430 # @param lPaths list of Path instances
431 #
432 @staticmethod
433 def getPathListSortedByIncreasingMinQueryThenMaxQueryThenMinSubjectThenMaxSubject(lPaths):
434 return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax(), iPath.getSubjectMin(), iPath.getSubjectMax()))
435
436
437 ## Return a list of Path instances sorted in increasing order according to the min, then the inverse of the query length, and finally their initial order
438 #
439 # @param lPaths: list of Path instances
440 #
441 @staticmethod
442 def getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ):
443 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), 1 / float(iPath.getLengthOnQuery()) ) )
444
445
446 ## Return a list of the distinct identifiers
447 #
448 # @param lPaths list of Path instances
449 #
450 @staticmethod
451 def getListOfDistinctIdentifiers( lPaths ):
452 sDistinctIdentifiers = set([iPath.id for iPath in lPaths])
453 return list(sDistinctIdentifiers)
454
455
456 ## Return a list of the distinct query names present in the collection
457 #
458 # @param lPaths list of Path instances
459 #
460 @staticmethod
461 def getListOfDistinctQueryNames( lPaths ):
462 sDistinctQueryNames = set([iPath.range_query.seqname for iPath in lPaths])
463 return list(sDistinctQueryNames)
464
465
466 ## Return a list of the distinct subject names present in the collection
467 #
468 # @param lPaths list of Path instances
469 #
470 @staticmethod
471 def getListOfDistinctSubjectNames( lPaths ):
472 sDistinctSubjectNames = set([iPath.range_subject.seqname for iPath in lPaths])
473 return list(sDistinctSubjectNames)
474
475
476 ## Return a list of paths with matching query names
477 #
478 # @param lPaths list of Path instances
479 # @param queryName query name to extract
480 @staticmethod
481 def extractPathsFromQueryName(lPaths, queryName):
482 return [iPath for iPath in lPaths if iPath.getQueryName() == queryName]
483
484
485 ## Return a list of paths with matching query names
486 #
487 # @param lPaths list of Path instances
488 # @param lQueryName query name list to extract
489 @staticmethod
490 def extractPathsFromQueryNameList(lPaths, lQueryNames):
491 d = dict.fromkeys(lQueryNames)
492 return [iPath for iPath in lPaths if iPath.getQueryName() in d]
493
494
495 ## Return a list of paths with matching subject names
496 #
497 # @param lPaths list of Path instances
498 # @param subjectName subject name to extract
499 @staticmethod
500 def extractPathsFromSubjectName(lPaths, subjectName):
501 return [iPath for iPath in lPaths if iPath.getSubjectName() == subjectName]
502
503
504 ## Return a list of paths with coordinates overlap a given range
505 #
506 # @param lPaths list of Path instances
507 # @param queryName query name to extract
508 # @param start starting position
509 # @param end ending position
510 # @return list of Path instance
511 @staticmethod
512 def extractPathsFromQueryCoord(lPaths, queryName, start, end):
513 lExtractedPaths = []
514 iAlign = Align(range_q = Range(queryName, start, end))
515
516 for path in PathUtils.extractPathsFromQueryName(lPaths, queryName):
517 if path.isQueryOverlapping(iAlign):
518 lExtractedPaths.append(path)
519
520 return lExtractedPaths
521
522
523 ## Return a list of lists containing query coordinates of the connections sorted in increasing order.
524 #
525 # @param lConnectedPaths: list of Path instances having the same identifier
526 # @param minLength: threshold below which connections are not reported (default= 0 bp)
527 # @note: return only connections longer than threshold
528 # @note: if coordinate on query ends at 100, return 101
529 # @warning: Path instances MUST be sorted in increasing order according to query coordinates
530 # @warning: Path instances MUST be on direct query strand (and maybe on reverse subject strand)
531 #
532 @staticmethod
533 def getListOfJoinCoordinatesOnQuery(lConnectedPaths, minLength=0):
534 lJoinCoordinates = []
535 for i in xrange(1,len(lConnectedPaths)):
536 startJoin = lConnectedPaths[i-1].range_query.end
537 endJoin = lConnectedPaths[i].range_query.start
538 if endJoin - startJoin + 1 > minLength:
539 lJoinCoordinates.append( [ startJoin + 1, endJoin - 1 ] )
540 return lJoinCoordinates
541
542
543 ## Return the length on the query of all Path instance in the given list
544 #
545 # @param lPaths list of Path instances
546 # @note overlapping ranges are not summed but truncated.
547 #
548 @staticmethod
549 def getLengthOnQueryFromPathList( lPaths ):
550 lSets = PathUtils.getSetListFromQueries( lPaths )
551 lMergedSets = SetUtils.mergeSetsInList( lSets )
552 length = SetUtils.getCumulLength( lMergedSets )
553 return length
554
555
556 ## Convert a Path file into an Align file
557 #
558 # @param pathFile: name of the input Path file
559 # @param alignFile: name of the output Align file
560 #
561 @staticmethod
562 def convertPathFileIntoAlignFile(pathFile, alignFile):
563 pathFileHandler = open(pathFile, "r")
564 alignFileHandler = open(alignFile, "w")
565 iPath = Path()
566 for line in pathFileHandler:
567 iPath.setFromString(line)
568 iAlign = iPath.getAlignInstance()
569 iAlign.write(alignFileHandler)
570 pathFileHandler.close()
571 alignFileHandler.close()
572
573
574 #TODO: duplicated method => to rename with the name of the next method (which is called) ?
575 ## Convert a Path File into a Map file with query coordinates only
576 #
577 # @param pathFile: name of the input Path file
578 # @param mapFile: name of the output Map file
579 #
580 @staticmethod
581 def convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile ):
582 pathFileHandler = open(pathFile, "r")
583 mapFileHandler = open(mapFile, "w")
584 p = Path()
585 for line in pathFileHandler:
586 p.reset()
587 p.setFromTuple(line.split("\t"))
588 p.writeSubjectAsMapOfQuery(mapFileHandler)
589 pathFileHandler.close()
590 mapFileHandler.close()
591
592
593 ## for each line of a given Path file, write the coordinates of the subject on the query as one line in a Map file
594 #
595 # @param pathFile: name of the input Path file
596 # @param mapFile: name of the output Map file
597 #
598 @staticmethod
599 def convertPathFileIntoMapFileWithSubjectsOnQueries( pathFile, mapFile ):
600 PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile )
601
602
603 ## Merge matches on queries
604 #
605 # @param inFile: name of the input Path file
606 # @param outFile: name of the output Path file
607 #
608 @staticmethod
609 def mergeMatchesOnQueries(inFile, outFile):
610 mapFile = "%s.map" % inFile
611 PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly(inFile, mapFile)
612 cmd = "mapOp"
613 cmd += " -q %s" % mapFile
614 cmd += " -m"
615 cmd += " 2>&1 > /dev/null"
616 exitStatus = os.system(cmd)
617 if exitStatus != 0:
618 print "ERROR: mapOp returned %i" % exitStatus
619 sys.exit(1)
620 os.remove(mapFile)
621 mergeFile = "%s.merge" % mapFile
622 mergeFileHandler = open(mergeFile, "r")
623 outFileHandler = open(outFile, "w")
624 m = Map()
625 for line in mergeFileHandler:
626 m.reset()
627 m.setFromString(line, "\t")
628 m.writeAsQueryOfPath(outFileHandler)
629 mergeFileHandler.close()
630 os.remove(mergeFile)
631 outFileHandler.close()
632
633
634 ## Filter chains of Path(s) which length is below a given threshold
635 #
636 # @param lPaths: list of Path instances
637 # @param minLengthChain: minimum length of a chain to be kept
638 # @note: a chain may contain a single Path instance
639 # @return: a list of Path instances
640 #
641 @staticmethod
642 def filterPathListOnChainLength( lPaths, minLengthChain ):
643 lFilteredPaths = []
644 dPathnum2Paths = PathUtils.getDictOfListsWithIdAsKey( lPaths )
645 for pathnum in dPathnum2Paths.keys():
646 length = PathUtils.getLengthOnQueryFromPathList( dPathnum2Paths[ pathnum ] )
647 if length >= minLengthChain:
648 lFilteredPaths += dPathnum2Paths[ pathnum ]
649 return lFilteredPaths
650
651
652 ## Return a Path list from a Path file
653 #
654 # @param pathFile string name of a Path file
655 # @return a list of Path instances
656 #
657 @staticmethod
658 def getPathListFromFile( pathFile ):
659 lPaths = []
660 with open(pathFile, "r") as pathFileHandler:
661 for line in pathFileHandler:
662 iPath = Path()
663 iPath.setFromString(line)
664 lPaths.append(iPath)
665 return lPaths
666
667
668 ## Convert a chain into a 'pathrange'
669 #
670 # @param lPaths a list of Path instances with the same identifier
671 # @note: the min and max of each Path is used
672 #
673 @staticmethod
674 def convertPathListToPathrange( lPaths ):
675 if len(lPaths) == 0:
676 return
677 if len(lPaths) == 1:
678 return lPaths[0]
679 iPathrange = copy.deepcopy( lPaths[0] )
680 iPathrange.identity = lPaths[0].identity * lPaths[0].getLengthOnQuery()
681 cumulQueryLength = iPathrange.getLengthOnQuery()
682 for iPath in lPaths[1:]:
683 if iPath.id != iPathrange.id:
684 msg = "ERROR: two Path instances in the chain have different identifiers"
685 sys.stderr.write( "%s\n" % ( msg ) )
686 sys.exit(1)
687 if iPathrange.range_subject.isOnDirectStrand() != iPath.range_subject.isOnDirectStrand():
688 msg = "ERROR: two Path instances in the chain are on different strands"
689 sys.stderr.write( "%s\n" % ( msg ) )
690 sys.exit(1)
691 iPathrange.range_query.start = min( iPathrange.range_query.start, iPath.range_query.start )
692 iPathrange.range_query.end = max( iPathrange.range_query.end, iPath.range_query.end )
693 if iPathrange.range_subject.isOnDirectStrand():
694 iPathrange.range_subject.start = min( iPathrange.range_subject.start, iPath.range_subject.start )
695 iPathrange.range_subject.end = max( iPathrange.range_subject.end, iPath.range_subject.end )
696 else:
697 iPathrange.range_subject.start = max( iPathrange.range_subject.start, iPath.range_subject.start )
698 iPathrange.range_subject.end = min( iPathrange.range_subject.end, iPath.range_subject.end )
699 iPathrange.e_value = min( iPathrange.e_value, iPath.e_value )
700 iPathrange.score += iPath.score
701 iPathrange.identity += iPath.identity * iPath.getLengthOnQuery()
702 cumulQueryLength += iPath.getLengthOnQuery()
703 iPathrange.identity = iPathrange.identity / float(cumulQueryLength)
704 return iPathrange
705
706
707 ## Convert a Path file into an Align file via 'pathrange'
708 #
709 # @param pathFile: name of the input Path file
710 # @param alignFile: name of the output Align file
711 # @param verbose integer verbosity level
712 # @note: the min and max of each Path is used
713 #
714 @staticmethod
715 def convertPathFileIntoAlignFileViaPathrange( pathFile, alignFile, verbose=0 ):
716 lPaths = PathUtils.getPathListFromFile( pathFile )
717 dId2PathList = PathUtils.getDictOfListsWithIdAsKey( lPaths )
718 lIds = dId2PathList.keys()
719 lIds.sort()
720 if verbose > 0:
721 msg = "number of chains: %i" % ( len(lIds) )
722 sys.stdout.write( "%s\n" % ( msg ) )
723 sys.stdout.flush()
724 alignFileHandler = open( alignFile, "w" )
725 for identifier in lIds:
726 iPath = PathUtils.convertPathListToPathrange( dId2PathList[ identifier ] )
727 iAlign = iPath.getAlignInstance()
728 iAlign.write( alignFileHandler )
729 alignFileHandler.close()
730
731
732 ## Split a list of Path instances according to the name of the query
733 #
734 # @param lInPaths list of align instances
735 # @return lOutPathLists list of align instances lists
736 #
737 @staticmethod
738 def splitPathListByQueryName( lInPaths ):
739 lInSortedPaths = sorted( lInPaths, key=lambda o: o.range_query.seqname )
740 lOutPathLists = []
741 if len(lInSortedPaths) != 0 :
742 lPathsForCurrentQuery = []
743 previousQuery = lInSortedPaths[0].range_query.seqname
744 for iPath in lInSortedPaths :
745 currentQuery = iPath.range_query.seqname
746 if previousQuery != currentQuery :
747 lOutPathLists.append( lPathsForCurrentQuery )
748 previousQuery = currentQuery
749 lPathsForCurrentQuery = []
750 lPathsForCurrentQuery.append( iPath )
751
752 lOutPathLists.append(lPathsForCurrentQuery)
753
754 return lOutPathLists
755
756
757 ## Create an Path file from each list of Path instances in the input list
758 #
759 # @param lPathList list of lists with Path instances
760 # @param pattern string
761 # @param dirName string
762 #
763 @staticmethod
764 def createPathFiles( lPathList, pattern, dirName="" ):
765 nbFiles = len(lPathList)
766 countFile = 1
767 if dirName != "" :
768 if dirName[-1] != "/":
769 dirName = dirName + '/'
770 os.mkdir( dirName )
771
772 for lPath in lPathList:
773 fileName = dirName + pattern + "_%s.path" % ( str(countFile).zfill( len(str(nbFiles)) ) )
774 PathUtils.writeListInFile( lPath, fileName )
775 countFile += 1
776
777
778 ## Merge all overlapping Path instances in a list without considering the identifiers
779 # Start by sorting the Path instances by their increasing min coordinate
780 #
781 # @return: a new list with the merged Path instances
782 #
783 @staticmethod
784 def mergePathsInList( lPaths ):
785 lMergedPaths = []
786 if len(lPaths)==0:
787 return lMergedPaths
788
789 lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths )
790
791 prev_count = 0
792 for iPath in lSortedPaths[0:]:
793 if prev_count != len(lSortedPaths):
794 for i in lSortedPaths[ prev_count + 1: ]:
795 if iPath.isOverlapping( i ):
796 iPath.merge( i )
797 isAlreadyInList = False
798 for newPath in lMergedPaths:
799 if newPath.isOverlapping( iPath ):
800 isAlreadyInList = True
801 newPath.merge( iPath )
802 lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath
803 if not isAlreadyInList:
804 lMergedPaths.append( iPath )
805 prev_count += 1
806 return lMergedPaths
807
808
809 ## Merge all overlapping Path instances in a list without considering if subjects are overlapping.
810 # Start by sorting the Path instances by their increasing min coordinate.
811 #
812 # @return: a new list with the merged Path instances
813 #
814 @staticmethod
815 def mergePathsInListUsingQueryCoordsOnly( lPaths ):
816 lMergedPaths = []
817 if len(lPaths)==0:
818 return lMergedPaths
819
820 lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths )
821
822 prev_count = 0
823 for iPath in lSortedPaths[0:]:
824 if prev_count != len(lSortedPaths):
825 for i in lSortedPaths[ prev_count + 1: ]:
826 if iPath.isQueryOverlapping( i ):
827 iPath.merge( i )
828 isAlreadyInList = False
829 for newPath in lMergedPaths:
830 if newPath.isQueryOverlapping( iPath ):
831 isAlreadyInList = True
832 newPath.merge( iPath )
833 lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath
834 if not isAlreadyInList:
835 lMergedPaths.append( iPath )
836 prev_count += 1
837 return lMergedPaths
838
839
840 ## Convert a Path file into a GFF file
841 #
842 # @param pathFile: name of the input Path file
843 # @param gffFile: name of the output GFF file
844 # @param source: source to write in the GFF file (column 2)
845 #
846 # @note the 'path' query is supposed to correspond to the 'gff' first column
847 #
848 @staticmethod
849 def convertPathFileIntoGffFile( pathFile, gffFile, source="REPET", verbose=0 ):
850 dId2PathList = PathUtils.getDictOfListsWithIdAsKeyFromFile( pathFile )
851 if verbose > 0:
852 msg = "number of chains: %i" % ( len(dId2PathList.keys()) )
853 sys.stdout.write( "%s\n" % msg )
854 sys.stdout.flush()
855 gffFileHandler = open( gffFile, "w" )
856 for id in dId2PathList.keys():
857 if len( dId2PathList[ id ] ) == 1:
858 iPath = dId2PathList[ id ][0]
859 string = iPath.toStringAsGff( ID="%i" % iPath.getIdentifier(),
860 source=source )
861 gffFileHandler.write( "%s\n" % string )
862 else:
863 iPathrange = PathUtils.convertPathListToPathrange( dId2PathList[ id ] )
864 string = iPathrange.toStringAsGff( ID="ms%i" % iPathrange.getIdentifier(),
865 source=source )
866 gffFileHandler.write( "%s\n" % string )
867 count = 0
868 for iPath in dId2PathList[ id ]:
869 count += 1
870 string = iPath.toStringAsGff( type="match_part",
871 ID="mp%i-%i" % ( iPath.getIdentifier(), count ),
872 Parent="ms%i" % iPathrange.getIdentifier(),
873 source=source )
874 gffFileHandler.write( "%s\n" % string )
875 gffFileHandler.close()
876
877
878 ## Convert a Path file into a Set file
879 # replace old parser.pathrange2set
880 # @param pathFile: name of the input Path file
881 # @param setFile: name of the output Set file
882 #
883 @staticmethod
884 def convertPathFileIntoSetFile( pathFile, setFile ):
885 pathFileHandler = open(pathFile, "r")
886 setFileHandler = open(setFile, "w")
887 iPath = Path()
888 for line in pathFileHandler:
889 iPath.setFromString(line)
890 iSet = iPath.getSubjectAsSetOfQuery()
891 iSet.write(setFileHandler)
892 pathFileHandler.close()
893 setFileHandler.close()
894
895 ## Write Path File without duplicated Path (same query, same subject and same coordinate)
896 #
897 # @param inputFile: name of the input Path file
898 # @param outputFile: name of the output Path file
899 #
900 @staticmethod
901 def removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName(inputFile, outputFile):
902 f = open(inputFile, "r")
903 line = f.readline()
904 previousQuery = ""
905 previousSubject = ""
906 lPaths = []
907 while line:
908 iPath = Path()
909 iPath.setFromString(line)
910 query = iPath.getQueryName()
911 subject = iPath.getSubjectName()
912 if (query != previousQuery or subject != previousSubject) and lPaths != []:
913 lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)
914 PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")
915 lPaths = []
916 lPaths.append(iPath)
917 previousQuery = query
918 previousSubject = subject
919 line = f.readline()
920 lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)
921 PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")
922 f.close()