Mercurial > repos > urgi-team > teiso
diff TEisotools-1.1.a/commons/core/coord/SetUtils.py @ 16:836ce3d9d47a draft default tip
Uploaded
author | urgi-team |
---|---|
date | Thu, 21 Jul 2016 07:42:47 -0400 |
parents | 255c852351c5 |
children |
line wrap: on
line diff
--- a/TEisotools-1.1.a/commons/core/coord/SetUtils.py Thu Jul 21 07:36:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,553 +0,0 @@ -# Copyright INRA (Institut National de la Recherche Agronomique) -# http://www.inra.fr -# http://urgi.versailles.inra.fr -# -# This software is governed by the CeCILL license under French law and -# abiding by the rules of distribution of free software. You can use, -# modify and/ or redistribute the software under the terms of the CeCILL -# license as circulated by CEA, CNRS and INRIA at the following URL -# "http://www.cecill.info". -# -# As a counterpart to the access to the source code and rights to copy, -# modify and redistribute granted by the license, users are provided only -# with a limited warranty and the software's author, the holder of the -# economic rights, and the successive licensors have only limited -# liability. -# -# In this respect, the user's attention is drawn to the risks associated -# with loading, using, modifying and/or developing or reproducing the -# software by the user in light of its specific status of free software, -# that may mean that it is complicated to manipulate, and that also -# therefore means that it is reserved for developers and experienced -# professionals having in-depth computer knowledge. Users are therefore -# encouraged to load and test the software's suitability as regards their -# requirements in conditions enabling the security of their systems and/or -# data to be ensured and, more generally, to use and operate it in the -# same conditions as regards security. -# -# The fact that you are presently reading this means that you have had -# knowledge of the CeCILL license and that you accept its terms. - - -from commons.core.coord.Set import Set - -## Static methods for the manipulation of Set instances -# -class SetUtils( object ): - - ## Change the identifier of each Set instance in the given list - # - # @param lSets list of Set instances - # @param newId new identifier - # - def changeIdInList(lSets, newId): - for iSet in lSets: - iSet.id = newId - - changeIdInList = staticmethod( changeIdInList ) - - ## Return the length of the overlap between two lists of Set instances - # - # @param lSets1 list of Set instances - # @param lSets2 list of Set instances - # @return length of overlap - # @warning sequence names are supposed to be identical - # - def getOverlapLengthBetweenLists(lSets1, lSets2): - lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1) - lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2) - osize = 0 - i = 0 - j = 0 - while i!= len(lSet1Sorted): - while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\ - and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])): - j+=1 - jj=j - while jj!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[jj]): - osize+=lSet1Sorted[i].getOverlapLength(lSet2Sorted[jj]) - jj+=1 - i+=1 - return osize - - getOverlapLengthBetweenLists = staticmethod( getOverlapLengthBetweenLists ) - - ## Return True if the two lists of Set instances overlap, False otherwise - # - # @param lSets1 list of Set instances - # @param lSets2 list of Set instances - # - def areSetsOverlappingBetweenLists( lSets1, lSets2 ): - lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1) - lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2) - i=0 - j=0 - while i!= len(lSet1Sorted): - while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\ - and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])): - j+=1 - if j!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[j]): - return True - i+=1 - return False - - areSetsOverlappingBetweenLists = staticmethod( areSetsOverlappingBetweenLists ) - - ## Merge all overlapping Set instances between two lists of Set and give the next identifier - # - # @param lSets1 list of Set instances - # @param lSets2 list of Set instances - # @param max_id start id value for inserting new Set - # @return a new list of the merged Set instances and the next identifier - # - def getListOfMergedSetsAndNextId(lSets1, lSets2, max_id=0): - lSets_merged = [] - list2merge = SetUtils.getListOfIdListOfOverlappingSets ( lSets1,lSets2 ) - idlist1 = SetUtils.getDictOfListsWithIdAsKey(lSets1) - idlist2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) - if max_id == 0: - max_id = max(idlist1.keys()) + 1 - for i in list2merge: - if i == []: - continue - l = [] - min_id = max(i) - for j in i: - if j>0: - if min_id>j: - min_id=j - l.extend(idlist1[j]) - del idlist1[j] - else: - l.extend(idlist2[j*-1]) - del idlist2[j*-1] - l = SetUtils.mergeSetsInList(l) - SetUtils.changeIdInList(l, min_id) - lSets_merged.extend(l) - for id, alist in idlist1.items(): - lSets_merged.extend(alist) - for id,alist in idlist2.items(): - SetUtils.changeIdInList(alist,max_id) - lSets_merged.extend(alist) - max_id+=1 - return lSets_merged, max_id - - getListOfMergedSetsAndNextId = staticmethod ( getListOfMergedSetsAndNextId ) - -# ## Concatenate two Set instance lists and give the next identifier -# # -# # @param lSets1 list of Set instances -# # @param lSets2 list of Set instances -# # @param maxId start id value for inserting new Set -# # @return a new list of Set instances and the next identifier -# # -# @staticmethod -# def getSetsListOfTwoConcatenatedSetsListAndNextId(lSets1, lSets2, maxId = 0): -# lOutSets = lSets1 -# dId2SetsList2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) -# if maxId == 0: -# dId2SetsList1 = SetUtils.getDictOfListsWithIdAsKey(lSets1) -# maxId = max(dId2SetsList1.keys()) -# for lSets in dId2SetsList2.values(): -# SetUtils.changeIdInList(lSets, maxId) -# lOutSets.extend(lSets) -# maxId += 1 -# return lOutSets, maxId - - ## Return the sum of the length of each Set instance in the given list - # - # @param lSets: list of Set instances - # - def getCumulLength(lSets): - length = 0 - for i in lSets: - length += i.getLength() - return length - - getCumulLength = staticmethod( getCumulLength ) - - ## Return a tuple with min and max coordinates of Set instances in the given list - # - # @param lSets list of Set instances - # - def getListBoundaries(lSets): - qmin = -1 - qmax = -1 - for iSet in lSets: - if qmin == -1: - qmin = iSet.start - qmin = min(qmin, iSet.getMin()) - qmax = max(qmax, iSet.getMax()) - return (qmin, qmax) - - getListBoundaries = staticmethod( getListBoundaries ) - - ## Show Set instances contained in the given list - # - # @param lSets list of Set instances - # - def showList(lSets): - for iSet in lSets: - iSet.show() - - showList = staticmethod( showList ) - - ## Write Set instances contained in the given list - # - # @param lSets list of Set instances - # @param fileName a file name - # @param mode the open mode of the file '"w"' or '"a"' - # - def writeListInFile(lSets, fileName, mode="w"): - fileHandler = open(fileName, mode) - for iSet in lSets: - iSet.write(fileHandler) - fileHandler.close() - - writeListInFile = staticmethod( writeListInFile ) - - ## Split a Set list in several Set lists according to the identifier - # - # @param lSets list of Set instances - # @return a dictionary which keys are identifiers and values Set lists - # - def getDictOfListsWithIdAsKey(lSets): - dId2SetList = {} - for iSet in lSets: - if dId2SetList.has_key(iSet.id): - dId2SetList[iSet.id].append(iSet) - else: - dId2SetList[iSet.id] = [iSet] - return dId2SetList - - getDictOfListsWithIdAsKey = staticmethod( getDictOfListsWithIdAsKey ) - - - ## Split a Set list in several Set lists according to the identifier - # - # @param lSets list of Set instances - # @return a dictionary which keys are identifiers and values Set lists - # - def getDictOfListsWithIdAsKeyFromFile( setFile ): - dId2SetList = {} - setFileHandler = open( setFile, "r" ) - while True: - line = setFileHandler.readline() - if line == "": - break - iSet = Set() - iSet.setFromTuple( line[:-1].split("\t") ) - if not dId2SetList.has_key( iSet.id ): - dId2SetList[ iSet.id ] = [] - dId2SetList[ iSet.id ].append( iSet ) - setFileHandler.close() - return dId2SetList - - getDictOfListsWithIdAsKeyFromFile = staticmethod( getDictOfListsWithIdAsKeyFromFile ) - - - ## Return a Map list from the given Set List - # - # @param lSets list of Set instances - # - def getMapListFromSetList(lSets): - lMaps = [] - for iSet in lSets: - lMaps.append(iSet.set2map()) - return lMaps - - getMapListFromSetList = staticmethod( getMapListFromSetList ) - - ## Construct a Set list from a Map list - # - # @param lMaps list of Map instances - # - def getSetListFromMapList(lMaps): - lSets = [] - c = 0 - for iMap in lMaps: - c += 1 - lSets.append( Set(c, iMap.name, iMap.seqname, iMap.start, iMap.end) ) - return lSets - - getSetListFromMapList = staticmethod( getSetListFromMapList ) - - ## Merge all overlapping Set instances in a list without considering the identifiers. - # Start by sorting Set instances by their increasing Min coordinate. - # - # @return: a new list of the merged Set instances - # - def mergeSetsInList(lSets): - l=[] - if len(lSets)==0: - return l - - lSortedSets = SetUtils.getSetListSortedByIncreasingMinThenInvLength( lSets ) - - prev_count = 0 - for iSet in lSortedSets[0:]: - if prev_count != len(lSortedSets): - for i in lSortedSets[ prev_count + 1: ]: - if iSet.isOverlapping( i ): - iSet.merge( i ) - IsAlreadyInList = False - for newSet in l: - if newSet.isOverlapping( iSet ): - IsAlreadyInList = True - newSet.merge( iSet ) - l [ l.index( newSet ) ] = newSet - if not IsAlreadyInList: - l.append( iSet ) - prev_count += 1 - return l - - mergeSetsInList = staticmethod( mergeSetsInList ) - - ## Unjoin a Set list according to another - # - # @param lToKeep: a list of Set instances to keep - # @param lToUnjoin: a list of Set instances to unjoin - # @return: lToUnjoin split in several list - # - def getSetListUnjoined(lToKeep, lToUnjoin): - lSortedToKeep = SetUtils.getSetListSortedByIncreasingMinThenMax( lToKeep ) - lSortedToUnjoin = SetUtils.getSetListSortedByIncreasingMinThenMax( lToUnjoin ) - if lSortedToUnjoin == []: - return [] - if lSortedToKeep == []: - return [ lSortedToUnjoin ] - - i=0 - resultListSet=[] - while i<len(lSortedToKeep): - j1=0 - while j1<len(lSortedToUnjoin) and lSortedToKeep[i].getMin() > lSortedToUnjoin[j1].getMax(): - j1+=1 - if j1==len(lSortedToUnjoin): - break - if j1!=0: - resultListSet.append(lSortedToUnjoin[:j1]) - del lSortedToUnjoin[:j1] - j1=0 - if i+1==len(lSortedToKeep): - break - j2=j1 - if j2<len(lSortedToUnjoin) and lSortedToKeep[i+1].getMin() > lSortedToUnjoin[j2].getMax(): - while j2<len(lSortedToUnjoin) and lSortedToKeep[i+1].getMin() > lSortedToUnjoin[j2].getMax(): - j2+=1 - resultListSet.append(lSortedToUnjoin[j1:j2]) - del lSortedToUnjoin[j1:j2] - i+=1 - - if resultListSet!=[] or i == 0: - resultListSet.append(lSortedToUnjoin) - return resultListSet - - getSetListUnjoined = staticmethod(getSetListUnjoined) - - ## Return new list of Set instances with no duplicate - # - # @param lSets list of Set instances - # - def getSetListWithoutDuplicates( lSets ): - if len(lSets) < 2: - return lSets - lSortedSet = SetUtils.getSetListSortedByIncreasingMinThenMax( lSets ) - lUniqSet = [ lSortedSet[0] ] - for iSet in lSortedSet[1:]: - if iSet != lUniqSet[-1]: - lUniqSet.append( iSet ) - return lUniqSet - - getSetListWithoutDuplicates = staticmethod( getSetListWithoutDuplicates ) - - ## Return a list of Set instances sorted in increasing order according to the Min, then the Max, and finally their initial order - # - # @param lSets: list of Set instances - # - def getSetListSortedByIncreasingMinThenMax( lSets ): - return sorted( lSets, key=lambda iSet: ( iSet.getMin(), iSet.getMax() ) ) - - getSetListSortedByIncreasingMinThenMax = staticmethod( getSetListSortedByIncreasingMinThenMax ) - - ## Return a list of Set instances sorted in increasing order according to the min, then the inverse of the length, and finally their initial order - # - # @param lSets: list of Set instances - # - def getSetListSortedByIncreasingMinThenInvLength( lSets ): - return sorted( lSets, key=lambda iSet: ( iSet.getMin(), 1 / float(iSet.getLength()) ) ) - - getSetListSortedByIncreasingMinThenInvLength = staticmethod( getSetListSortedByIncreasingMinThenInvLength ) - - ## Return a list of Set instances sorted in increasing order according to the SeqName, then the Name, then the Min, then the Max and finally their initial order - # - # @param lSets: list of Set instances - # - def getSetListSortedBySeqThenRegionThenMinThenMax(lSets): - return sorted(lSets, key=lambda iSet: (iSet.getSeqname(), iSet.getName(), iSet.getMin(), iSet.getMax())) - - getSetListSortedBySeqThenRegionThenMinThenMax = staticmethod(getSetListSortedBySeqThenRegionThenMinThenMax) - - ## Return a list of identifier lists of overlapping Sets from the subject list, according to the reference list - # - # @param lRef list of Set instances - # @param lSubject list of Set instances - # - def getListOfIdListOfOverlappingSets(lRef,lSubject): - lSortedRef = SetUtils.getSetListSortedByIncreasingMinThenMax( lRef ) - lSortedSubject = SetUtils.getSetListSortedByIncreasingMinThenMax( lSubject ) - - lOverlappingSet = [] - lOverlappingSetCounter = 0 - - id2LOverlappingSet_pos = {} - - i = 0 - j = 0 - while i!= len(lSortedRef): - while j!= len(lSortedSubject) and lSortedRef[i].getMin()>lSortedSubject[j].getMax()\ - and not(lSortedRef[i].isOverlapping(lSortedSubject[j])\ - and lSortedRef[i].isOnDirectStrand()==lSortedSubject[j].isOnDirectStrand()): - j+=1 - jj=j - while jj!= len(lSortedSubject) and lSortedRef[i].isOverlapping(lSortedSubject[jj])\ - and lSortedRef[i].isOnDirectStrand()==lSortedSubject[jj].isOnDirectStrand(): - id1=lSortedRef[i].id - id2=lSortedSubject[jj].id*-1 - if id2LOverlappingSet_pos.has_key(id1) \ - and not id2LOverlappingSet_pos.has_key(id2): - lOverlappingSet[id2LOverlappingSet_pos[id1]].append(id2) - id2LOverlappingSet_pos[id2]=id2LOverlappingSet_pos[id1] - if id2LOverlappingSet_pos.has_key(id2) \ - and not id2LOverlappingSet_pos.has_key(id1): - lOverlappingSet[id2LOverlappingSet_pos[id2]].append(id1) - id2LOverlappingSet_pos[id1]=id2LOverlappingSet_pos[id2] - if not id2LOverlappingSet_pos.has_key(id2) \ - and not id2LOverlappingSet_pos.has_key(id1): - lOverlappingSet.append([id1,id2]) - id2LOverlappingSet_pos[id1]=lOverlappingSetCounter - id2LOverlappingSet_pos[id2]=lOverlappingSetCounter - lOverlappingSetCounter+=1 - jj+=1 - i+=1 - - return lOverlappingSet - - getListOfIdListOfOverlappingSets = staticmethod (getListOfIdListOfOverlappingSets) - - ## Return a list of sets without overlapping between two lists of sets - # - # @param lSet1 and lSet2 - # - def getListOfSetWithoutOverlappingBetweenTwoListOfSet(lSet1, lSet2): - for i in lSet1: - for idx,j in enumerate(lSet2): - n=j.diff(i) - if not n.isEmpty() and n.getLength()>=20: - lSet2.append(n) - lSet2WithoutOverlaps=[] - for i in lSet2: - if not i.isEmpty() and i.getLength()>=20: - lSet2WithoutOverlaps.append(i) - return lSet2WithoutOverlaps - - getListOfSetWithoutOverlappingBetweenTwoListOfSet = staticmethod (getListOfSetWithoutOverlappingBetweenTwoListOfSet) - - ## Return a Set list from a Set file - # - # @param setFile string name of a Set file - # @return a list of Set instances - # - def getSetListFromFile( setFile ): - lSets = [] - setFileHandler = open( setFile, "r" ) - while True: - line = setFileHandler.readline() - if line == "": - break - iSet = Set() - iSet.setFromString( line ) - lSets.append( iSet ) - setFileHandler.close() - return lSets - - getSetListFromFile = staticmethod( getSetListFromFile ) - - - def convertSetFileIntoMapFile( setFile, mapFile ): - setFileHandler = open( setFile, "r" ) - mapFileHandler = open( mapFile, "w" ) - iSet = Set() - while True: - line = setFileHandler.readline() - if line == "": - break - iSet.setFromString( line ) - iMap = iSet.getMapInstance() - iMap.write( mapFileHandler ) - setFileHandler.close() - mapFileHandler.close() - - convertSetFileIntoMapFile = staticmethod( convertSetFileIntoMapFile ) - - - def getDictOfListsWithSeqnameAsKey( lSets ): - dSeqnamesToSetList = {} - for iSet in lSets: - if not dSeqnamesToSetList.has_key( iSet.seqname ): - dSeqnamesToSetList[ iSet.seqname ] = [] - dSeqnamesToSetList[ iSet.seqname ].append( iSet ) - return dSeqnamesToSetList - - getDictOfListsWithSeqnameAsKey = staticmethod( getDictOfListsWithSeqnameAsKey ) - - - def filterOnLength( lSets, minLength=0, maxLength=10000000000 ): - if minLength == 0 and maxLength == 0: - return lSets - lFiltered = [] - for iSet in lSets: - if minLength <= iSet.getLength() <= maxLength: - lFiltered.append( iSet ) - return lFiltered - - filterOnLength = staticmethod( filterOnLength ) - - - def getListOfNames( setFile ): - lNames = [] - setFileHandler = open( setFile, "r" ) - iSet = Set() - while True: - line = setFileHandler.readline() - if line == "": - break - iSet.setFromTuple( line[:-1].split("\t") ) - if iSet.name not in lNames: - lNames.append( iSet.name ) - setFileHandler.close() - return lNames - - getListOfNames = staticmethod( getListOfNames ) - - - def getDictOfDictsWithNamesThenIdAsKeyFromFile( setFile ): - dNames2DictsId = {} - setFileHandler = open( setFile, "r" ) - while True: - line = setFileHandler.readline() - if line == "": - break - iSet = Set() - iSet.setFromTuple( line[:-1].split("\t") ) - if not dNames2DictsId.has_key( iSet.name ): - dNames2DictsId[ iSet.name ] = { iSet.id: [ iSet ] } - else: - if not dNames2DictsId[ iSet.name ].has_key( iSet.id ): - dNames2DictsId[ iSet.name ][ iSet.id ] = [ iSet ] - else: - dNames2DictsId[ iSet.name ][ iSet.id ].append( iSet ) - setFileHandler.close() - return dNames2DictsId - - getDictOfDictsWithNamesThenIdAsKeyFromFile = staticmethod( getDictOfDictsWithNamesThenIdAsKeyFromFile )