comparison commons/core/coord/SetUtils.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 from commons.core.coord.Set import Set
33
34 ## Static methods for the manipulation of Path instances
35 #
36 class SetUtils( object ):
37
38 ## Change the identifier of each Set instance in the given list
39 #
40 # @param lSets list of Set instances
41 # @param newId new identifier
42 #
43 def changeIdInList(lSets, newId):
44 for iSet in lSets:
45 iSet.id = newId
46
47 changeIdInList = staticmethod( changeIdInList )
48
49 ## Return the length of the overlap between two lists of Set instances
50 #
51 # @param lSets1 list of Set instances
52 # @param lSets2 list of Set instances
53 # @return length of overlap
54 # @warning sequence names are supposed to be identical
55 #
56 def getOverlapLengthBetweenLists(lSets1, lSets2):
57 lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1)
58 lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2)
59 osize = 0
60 i = 0
61 j = 0
62 while i!= len(lSet1Sorted):
63 while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\
64 and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])):
65 j+=1
66 jj=j
67 while jj!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[jj]):
68 osize+=lSet1Sorted[i].getOverlapLength(lSet2Sorted[jj])
69 jj+=1
70 i+=1
71 return osize
72
73 getOverlapLengthBetweenLists = staticmethod( getOverlapLengthBetweenLists )
74
75 ## Return True if the two lists of Set instances overlap, False otherwise
76 #
77 # @param lSets1 list of Set instances
78 # @param lSets2 list of Set instances
79 #
80 def areSetsOverlappingBetweenLists( lSets1, lSets2 ):
81 lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1)
82 lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2)
83 i=0
84 j=0
85 while i!= len(lSet1Sorted):
86 while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\
87 and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])):
88 j+=1
89 if j!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[j]):
90 return True
91 i+=1
92 return False
93
94 areSetsOverlappingBetweenLists = staticmethod( areSetsOverlappingBetweenLists )
95
96 ## Merge all overlapping Set instances between two lists of Set and give the next identifier
97 #
98 # @param lSets1 list of Set instances
99 # @param lSets2 list of Set instances
100 # @param max_id start id value for inserting new Set
101 # @return a new list of the merged Set instances and the next identifier
102 #
103 def getListOfMergedSetsAndNextId(lSets1, lSets2, max_id=0):
104 lSets_merged = []
105 list2merge = SetUtils.getListOfIdListOfOverlappingSets ( lSets1,lSets2 )
106 idlist1 = SetUtils.getDictOfListsWithIdAsKey(lSets1)
107 idlist2 = SetUtils.getDictOfListsWithIdAsKey(lSets2)
108 if max_id == 0:
109 max_id = max(idlist1.keys()) + 1
110 for i in list2merge:
111 if i == []:
112 continue
113 l = []
114 min_id = max(i)
115 for j in i:
116 if j>0:
117 if min_id>j:
118 min_id=j
119 l.extend(idlist1[j])
120 del idlist1[j]
121 else:
122 l.extend(idlist2[j*-1])
123 del idlist2[j*-1]
124 l = SetUtils.mergeSetsInList(l)
125 SetUtils.changeIdInList(l, min_id)
126 lSets_merged.extend(l)
127 for id, alist in idlist1.items():
128 lSets_merged.extend(alist)
129 for id,alist in idlist2.items():
130 SetUtils.changeIdInList(alist,max_id)
131 lSets_merged.extend(alist)
132 max_id+=1
133 return lSets_merged, max_id
134
135 getListOfMergedSetsAndNextId = staticmethod ( getListOfMergedSetsAndNextId )
136
137 # ## Concatenate two Set instance lists and give the next identifier
138 # #
139 # # @param lSets1 list of Set instances
140 # # @param lSets2 list of Set instances
141 # # @param maxId start id value for inserting new Set
142 # # @return a new list of Set instances and the next identifier
143 # #
144 # @staticmethod
145 # def getSetsListOfTwoConcatenatedSetsListAndNextId(lSets1, lSets2, maxId = 0):
146 # lOutSets = lSets1
147 # dId2SetsList2 = SetUtils.getDictOfListsWithIdAsKey(lSets2)
148 # if maxId == 0:
149 # dId2SetsList1 = SetUtils.getDictOfListsWithIdAsKey(lSets1)
150 # maxId = max(dId2SetsList1.keys())
151 # for lSets in dId2SetsList2.values():
152 # SetUtils.changeIdInList(lSets, maxId)
153 # lOutSets.extend(lSets)
154 # maxId += 1
155 # return lOutSets, maxId
156
157 ## Return the sum of the length of each Set instance in the given list
158 #
159 # @param lSets: list of Set instances
160 #
161 def getCumulLength(lSets):
162 length = 0
163 for i in lSets:
164 length += i.getLength()
165 return length
166
167 getCumulLength = staticmethod( getCumulLength )
168
169 ## Return a tuple with min and max coordinates of Set instances in the given list
170 #
171 # @param lSets list of Set instances
172 #
173 def getListBoundaries(lSets):
174 qmin = -1
175 qmax = -1
176 for iSet in lSets:
177 if qmin == -1:
178 qmin = iSet.start
179 qmin = min(qmin, iSet.getMin())
180 qmax = max(qmax, iSet.getMax())
181 return (qmin, qmax)
182
183 getListBoundaries = staticmethod( getListBoundaries )
184
185 ## Show Set instances contained in the given list
186 #
187 # @param lSets list of Set instances
188 #
189 def showList(lSets):
190 for iSet in lSets:
191 iSet.show()
192
193 showList = staticmethod( showList )
194
195 ## Write Set instances contained in the given list
196 #
197 # @param lSets list of Set instances
198 # @param fileName a file name
199 # @param mode the open mode of the file '"w"' or '"a"'
200 #
201 def writeListInFile(lSets, fileName, mode="w"):
202 fileHandler = open(fileName, mode)
203 for iSet in lSets:
204 iSet.write(fileHandler)
205 fileHandler.close()
206
207 writeListInFile = staticmethod( writeListInFile )
208
209 ## Split a Set list in several Set lists according to the identifier
210 #
211 # @param lSets list of Set instances
212 # @return a dictionary which keys are identifiers and values Set lists
213 #
214 def getDictOfListsWithIdAsKey(lSets):
215 dId2SetList = {}
216 for iSet in lSets:
217 if dId2SetList.has_key(iSet.id):
218 dId2SetList[iSet.id].append(iSet)
219 else:
220 dId2SetList[iSet.id] = [iSet]
221 return dId2SetList
222
223 getDictOfListsWithIdAsKey = staticmethod( getDictOfListsWithIdAsKey )
224
225
226 ## Split a Set list in several Set lists according to the identifier
227 #
228 # @param lSets list of Set instances
229 # @return a dictionary which keys are identifiers and values Set lists
230 #
231 def getDictOfListsWithIdAsKeyFromFile( setFile ):
232 dId2SetList = {}
233 setFileHandler = open( setFile, "r" )
234 while True:
235 line = setFileHandler.readline()
236 if line == "":
237 break
238 iSet = Set()
239 iSet.setFromTuple( line[:-1].split("\t") )
240 if not dId2SetList.has_key( iSet.id ):
241 dId2SetList[ iSet.id ] = []
242 dId2SetList[ iSet.id ].append( iSet )
243 setFileHandler.close()
244 return dId2SetList
245
246 getDictOfListsWithIdAsKeyFromFile = staticmethod( getDictOfListsWithIdAsKeyFromFile )
247
248
249 ## Return a Map list from the given Set List
250 #
251 # @param lSets list of Set instances
252 #
253 def getMapListFromSetList(lSets):
254 lMaps = []
255 for iSet in lSets:
256 lMaps.append(iSet.set2map())
257 return lMaps
258
259 getMapListFromSetList = staticmethod( getMapListFromSetList )
260
261 ## Construct a Set list from a Map list
262 #
263 # @param lMaps list of Map instances
264 #
265 def getSetListFromMapList(lMaps):
266 lSets = []
267 c = 0
268 for iMap in lMaps:
269 c += 1
270 lSets.append( Set(c, iMap.name, iMap.seqname, iMap.start, iMap.end) )
271 return lSets
272
273 getSetListFromMapList = staticmethod( getSetListFromMapList )
274
275 ## Merge all overlapping Set instances in a list without considering the identifiers.
276 # Start by sorting Set instances by their increasing Min coordinate.
277 #
278 # @return: a new list of the merged Set instances
279 #
280 def mergeSetsInList(lSets):
281 l=[]
282 if len(lSets)==0:
283 return l
284
285 lSortedSets = SetUtils.getSetListSortedByIncreasingMinThenInvLength( lSets )
286
287 prev_count = 0
288 for iSet in lSortedSets[0:]:
289 if prev_count != len(lSortedSets):
290 for i in lSortedSets[ prev_count + 1: ]:
291 if iSet.isOverlapping( i ):
292 iSet.merge( i )
293 IsAlreadyInList = False
294 for newSet in l:
295 if newSet.isOverlapping( iSet ):
296 IsAlreadyInList = True
297 newSet.merge( iSet )
298 l [ l.index( newSet ) ] = newSet
299 if not IsAlreadyInList:
300 l.append( iSet )
301 prev_count += 1
302 return l
303
304 mergeSetsInList = staticmethod( mergeSetsInList )
305
306 ## Unjoin a Set list according to another
307 #
308 # @param lToKeep: a list of Set instances to keep
309 # @param lToUnjoin: a list of Set instances to unjoin
310 # @return: lToUnjoin split in several list
311 #
312 def getSetListUnjoined(lToKeep, lToUnjoin):
313 lSortedToKeep = SetUtils.getSetListSortedByIncreasingMinThenMax( lToKeep )
314 lSortedToUnjoin = SetUtils.getSetListSortedByIncreasingMinThenMax( lToUnjoin )
315 if lSortedToUnjoin == []:
316 return []
317 if lSortedToKeep == []:
318 return [ lSortedToUnjoin ]
319
320 i=0
321 resultListSet=[]
322 while i<len(lSortedToKeep):
323 j1=0
324 while j1<len(lSortedToUnjoin) and lSortedToKeep[i].getMin() > lSortedToUnjoin[j1].getMax():
325 j1+=1
326 if j1==len(lSortedToUnjoin):
327 break
328 if j1!=0:
329 resultListSet.append(lSortedToUnjoin[:j1])
330 del lSortedToUnjoin[:j1]
331 j1=0
332 if i+1==len(lSortedToKeep):
333 break
334 j2=j1
335 if j2<len(lSortedToUnjoin) and lSortedToKeep[i+1].getMin() > lSortedToUnjoin[j2].getMax():
336 while j2<len(lSortedToUnjoin) and lSortedToKeep[i+1].getMin() > lSortedToUnjoin[j2].getMax():
337 j2+=1
338 resultListSet.append(lSortedToUnjoin[j1:j2])
339 del lSortedToUnjoin[j1:j2]
340 i+=1
341
342 if resultListSet!=[] or i == 0:
343 resultListSet.append(lSortedToUnjoin)
344 return resultListSet
345
346 getSetListUnjoined = staticmethod(getSetListUnjoined)
347
348 ## Return new list of Set instances with no duplicate
349 #
350 # @param lSets list of Set instances
351 #
352 def getSetListWithoutDuplicates( lSets ):
353 if len(lSets) < 2:
354 return lSets
355 lSortedSet = SetUtils.getSetListSortedByIncreasingMinThenMax( lSets )
356 lUniqSet = [ lSortedSet[0] ]
357 for iSet in lSortedSet[1:]:
358 if iSet != lUniqSet[-1]:
359 lUniqSet.append( iSet )
360 return lUniqSet
361
362 getSetListWithoutDuplicates = staticmethod( getSetListWithoutDuplicates )
363
364 ## Return a list of Set instances sorted in increasing order according to the Min, then the Max, and finally their initial order
365 #
366 # @param lSets: list of Set instances
367 #
368 def getSetListSortedByIncreasingMinThenMax( lSets ):
369 return sorted( lSets, key=lambda iSet: ( iSet.getMin(), iSet.getMax() ) )
370
371 getSetListSortedByIncreasingMinThenMax = staticmethod( getSetListSortedByIncreasingMinThenMax )
372
373 ## Return a list of Set instances sorted in increasing order according to the min, then the inverse of the length, and finally their initial order
374 #
375 # @param lSets: list of Set instances
376 #
377 def getSetListSortedByIncreasingMinThenInvLength( lSets ):
378 return sorted( lSets, key=lambda iSet: ( iSet.getMin(), 1 / float(iSet.getLength()) ) )
379
380 getSetListSortedByIncreasingMinThenInvLength = staticmethod( getSetListSortedByIncreasingMinThenInvLength )
381
382 ## Return a list of Set instances sorted in increasing order according to the SeqName, then the Name, then the Min, then the Max and finally their initial order
383 #
384 # @param lSets: list of Set instances
385 #
386 def getSetListSortedBySeqThenRegionThenMinThenMax(lSets):
387 return sorted(lSets, key=lambda iSet: (iSet.getSeqname(), iSet.getName(), iSet.getMin(), iSet.getMax()))
388
389 getSetListSortedBySeqThenRegionThenMinThenMax = staticmethod(getSetListSortedBySeqThenRegionThenMinThenMax)
390
391 ## Return a list of identifier lists of overlapping Sets from the subject list, according to the reference list
392 #
393 # @param lRef list of Set instances
394 # @param lSubject list of Set instances
395 #
396 def getListOfIdListOfOverlappingSets(lRef,lSubject):
397 lSortedRef = SetUtils.getSetListSortedByIncreasingMinThenMax( lRef )
398 lSortedSubject = SetUtils.getSetListSortedByIncreasingMinThenMax( lSubject )
399
400 lOverlappingSet = []
401 lOverlappingSetCounter = 0
402
403 id2LOverlappingSet_pos = {}
404
405 i = 0
406 j = 0
407 while i!= len(lSortedRef):
408 while j!= len(lSortedSubject) and lSortedRef[i].getMin()>lSortedSubject[j].getMax()\
409 and not(lSortedRef[i].isOverlapping(lSortedSubject[j])\
410 and lSortedRef[i].isOnDirectStrand()==lSortedSubject[j].isOnDirectStrand()):
411 j+=1
412 jj=j
413 while jj!= len(lSortedSubject) and lSortedRef[i].isOverlapping(lSortedSubject[jj])\
414 and lSortedRef[i].isOnDirectStrand()==lSortedSubject[jj].isOnDirectStrand():
415 id1=lSortedRef[i].id
416 id2=lSortedSubject[jj].id*-1
417 if id2LOverlappingSet_pos.has_key(id1) \
418 and not id2LOverlappingSet_pos.has_key(id2):
419 lOverlappingSet[id2LOverlappingSet_pos[id1]].append(id2)
420 id2LOverlappingSet_pos[id2]=id2LOverlappingSet_pos[id1]
421 if id2LOverlappingSet_pos.has_key(id2) \
422 and not id2LOverlappingSet_pos.has_key(id1):
423 lOverlappingSet[id2LOverlappingSet_pos[id2]].append(id1)
424 id2LOverlappingSet_pos[id1]=id2LOverlappingSet_pos[id2]
425 if not id2LOverlappingSet_pos.has_key(id2) \
426 and not id2LOverlappingSet_pos.has_key(id1):
427 lOverlappingSet.append([id1,id2])
428 id2LOverlappingSet_pos[id1]=lOverlappingSetCounter
429 id2LOverlappingSet_pos[id2]=lOverlappingSetCounter
430 lOverlappingSetCounter+=1
431 jj+=1
432 i+=1
433
434 return lOverlappingSet
435
436 getListOfIdListOfOverlappingSets = staticmethod (getListOfIdListOfOverlappingSets)
437
438 ## Return a list of sets without overlapping between two lists of sets
439 #
440 # @param lSet1 and lSet2
441 #
442 def getListOfSetWithoutOverlappingBetweenTwoListOfSet(lSet1, lSet2):
443 for i in lSet1:
444 for idx,j in enumerate(lSet2):
445 n=j.diff(i)
446 if not n.isEmpty() and n.getLength()>=20:
447 lSet2.append(n)
448 lSet2WithoutOverlaps=[]
449 for i in lSet2:
450 if not i.isEmpty() and i.getLength()>=20:
451 lSet2WithoutOverlaps.append(i)
452 return lSet2WithoutOverlaps
453
454 getListOfSetWithoutOverlappingBetweenTwoListOfSet = staticmethod (getListOfSetWithoutOverlappingBetweenTwoListOfSet)
455
456 ## Return a Set list from a Set file
457 #
458 # @param setFile string name of a Set file
459 # @return a list of Set instances
460 #
461 def getSetListFromFile( setFile ):
462 lSets = []
463 setFileHandler = open( setFile, "r" )
464 while True:
465 line = setFileHandler.readline()
466 if line == "":
467 break
468 iSet = Set()
469 iSet.setFromString( line )
470 lSets.append( iSet )
471 setFileHandler.close()
472 return lSets
473
474 getSetListFromFile = staticmethod( getSetListFromFile )
475
476
477 def convertSetFileIntoMapFile( setFile, mapFile ):
478 setFileHandler = open( setFile, "r" )
479 mapFileHandler = open( mapFile, "w" )
480 iSet = Set()
481 while True:
482 line = setFileHandler.readline()
483 if line == "":
484 break
485 iSet.setFromString( line )
486 iMap = iSet.getMapInstance()
487 iMap.write( mapFileHandler )
488 setFileHandler.close()
489 mapFileHandler.close()
490
491 convertSetFileIntoMapFile = staticmethod( convertSetFileIntoMapFile )
492
493
494 def getDictOfListsWithSeqnameAsKey( lSets ):
495 dSeqnamesToSetList = {}
496 for iSet in lSets:
497 if not dSeqnamesToSetList.has_key( iSet.seqname ):
498 dSeqnamesToSetList[ iSet.seqname ] = []
499 dSeqnamesToSetList[ iSet.seqname ].append( iSet )
500 return dSeqnamesToSetList
501
502 getDictOfListsWithSeqnameAsKey = staticmethod( getDictOfListsWithSeqnameAsKey )
503
504
505 def filterOnLength( lSets, minLength=0, maxLength=10000000000 ):
506 if minLength == 0 and maxLength == 0:
507 return lSets
508 lFiltered = []
509 for iSet in lSets:
510 if minLength <= iSet.getLength() <= maxLength:
511 lFiltered.append( iSet )
512 return lFiltered
513
514 filterOnLength = staticmethod( filterOnLength )
515
516
517 def getListOfNames( setFile ):
518 lNames = []
519 setFileHandler = open( setFile, "r" )
520 iSet = Set()
521 while True:
522 line = setFileHandler.readline()
523 if line == "":
524 break
525 iSet.setFromTuple( line[:-1].split("\t") )
526 if iSet.name not in lNames:
527 lNames.append( iSet.name )
528 setFileHandler.close()
529 return lNames
530
531 getListOfNames = staticmethod( getListOfNames )
532
533
534 def getDictOfDictsWithNamesThenIdAsKeyFromFile( setFile ):
535 dNames2DictsId = {}
536 setFileHandler = open( setFile, "r" )
537 while True:
538 line = setFileHandler.readline()
539 if line == "":
540 break
541 iSet = Set()
542 iSet.setFromTuple( line[:-1].split("\t") )
543 if not dNames2DictsId.has_key( iSet.name ):
544 dNames2DictsId[ iSet.name ] = { iSet.id: [ iSet ] }
545 else:
546 if not dNames2DictsId[ iSet.name ].has_key( iSet.id ):
547 dNames2DictsId[ iSet.name ][ iSet.id ] = [ iSet ]
548 else:
549 dNames2DictsId[ iSet.name ][ iSet.id ].append( iSet )
550 setFileHandler.close()
551 return dNames2DictsId
552
553 getDictOfDictsWithNamesThenIdAsKeyFromFile = staticmethod( getDictOfDictsWithNamesThenIdAsKeyFromFile )