Mercurial > repos > urgi-team > teiso
diff TEisotools-1.1.a/commons/core/coord/SetUtils.py @ 13:feef9a0db09d draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 09:04:42 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.1.a/commons/core/coord/SetUtils.py Wed Jul 20 09:04:42 2016 -0400 @@ -0,0 +1,553 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.coord.Set import Set + +## Static methods for the manipulation of Set instances +# +class SetUtils( object ): + + ## Change the identifier of each Set instance in the given list + # + # @param lSets list of Set instances + # @param newId new identifier + # + def changeIdInList(lSets, newId): + for iSet in lSets: + iSet.id = newId + + changeIdInList = staticmethod( changeIdInList ) + + ## Return the length of the overlap between two lists of Set instances + # + # @param lSets1 list of Set instances + # @param lSets2 list of Set instances + # @return length of overlap + # @warning sequence names are supposed to be identical + # + def getOverlapLengthBetweenLists(lSets1, lSets2): + lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1) + lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2) + osize = 0 + i = 0 + j = 0 + while i!= len(lSet1Sorted): + while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\ + and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])): + j+=1 + jj=j + while jj!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[jj]): + osize+=lSet1Sorted[i].getOverlapLength(lSet2Sorted[jj]) + jj+=1 + i+=1 + return osize + + getOverlapLengthBetweenLists = staticmethod( getOverlapLengthBetweenLists ) + + ## Return True if the two lists of Set instances overlap, False otherwise + # + # @param lSets1 list of Set instances + # @param lSets2 list of Set instances + # + def areSetsOverlappingBetweenLists( lSets1, lSets2 ): + lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1) + lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2) + i=0 + j=0 + while i!= len(lSet1Sorted): + while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\ + and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])): + j+=1 + if j!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[j]): + return True + i+=1 + return False + + areSetsOverlappingBetweenLists = staticmethod( areSetsOverlappingBetweenLists ) + + ## Merge all overlapping Set instances between two lists of Set and give the next identifier + # + # @param lSets1 list of Set instances + # @param lSets2 list of Set instances + # @param max_id start id value for inserting new Set + # @return a new list of the merged Set instances and the next identifier + # + def getListOfMergedSetsAndNextId(lSets1, lSets2, max_id=0): + lSets_merged = [] + list2merge = SetUtils.getListOfIdListOfOverlappingSets ( lSets1,lSets2 ) + idlist1 = SetUtils.getDictOfListsWithIdAsKey(lSets1) + idlist2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) + if max_id == 0: + max_id = max(idlist1.keys()) + 1 + for i in list2merge: + if i == []: + continue + l = [] + min_id = max(i) + for j in i: + if j>0: + if min_id>j: + min_id=j + l.extend(idlist1[j]) + del idlist1[j] + else: + l.extend(idlist2[j*-1]) + del idlist2[j*-1] + l = SetUtils.mergeSetsInList(l) + SetUtils.changeIdInList(l, min_id) + lSets_merged.extend(l) + for id, alist in idlist1.items(): + lSets_merged.extend(alist) + for id,alist in idlist2.items(): + SetUtils.changeIdInList(alist,max_id) + lSets_merged.extend(alist) + max_id+=1 + return lSets_merged, max_id + + getListOfMergedSetsAndNextId = staticmethod ( getListOfMergedSetsAndNextId ) + +# ## Concatenate two Set instance lists and give the next identifier +# # +# # @param lSets1 list of Set instances +# # @param lSets2 list of Set instances +# # @param maxId start id value for inserting new Set +# # @return a new list of Set instances and the next identifier +# # +# @staticmethod +# def getSetsListOfTwoConcatenatedSetsListAndNextId(lSets1, lSets2, maxId = 0): +# lOutSets = lSets1 +# dId2SetsList2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) +# if maxId == 0: +# dId2SetsList1 = SetUtils.getDictOfListsWithIdAsKey(lSets1) +# maxId = max(dId2SetsList1.keys()) +# for lSets in dId2SetsList2.values(): +# SetUtils.changeIdInList(lSets, maxId) +# lOutSets.extend(lSets) +# maxId += 1 +# return lOutSets, maxId + + ## Return the sum of the length of each Set instance in the given list + # + # @param lSets: list of Set instances + # + def getCumulLength(lSets): + length = 0 + for i in lSets: + length += i.getLength() + return length + + getCumulLength = staticmethod( getCumulLength ) + + ## Return a tuple with min and max coordinates of Set instances in the given list + # + # @param lSets list of Set instances + # + def getListBoundaries(lSets): + qmin = -1 + qmax = -1 + for iSet in lSets: + if qmin == -1: + qmin = iSet.start + qmin = min(qmin, iSet.getMin()) + qmax = max(qmax, iSet.getMax()) + return (qmin, qmax) + + getListBoundaries = staticmethod( getListBoundaries ) + + ## Show Set instances contained in the given list + # + # @param lSets list of Set instances + # + def showList(lSets): + for iSet in lSets: + iSet.show() + + showList = staticmethod( showList ) + + ## Write Set instances contained in the given list + # + # @param lSets list of Set instances + # @param fileName a file name + # @param mode the open mode of the file '"w"' or '"a"' + # + def writeListInFile(lSets, fileName, mode="w"): + fileHandler = open(fileName, mode) + for iSet in lSets: + iSet.write(fileHandler) + fileHandler.close() + + writeListInFile = staticmethod( writeListInFile ) + + ## Split a Set list in several Set lists according to the identifier + # + # @param lSets list of Set instances + # @return a dictionary which keys are identifiers and values Set lists + # + def getDictOfListsWithIdAsKey(lSets): + dId2SetList = {} + for iSet in lSets: + if dId2SetList.has_key(iSet.id): + dId2SetList[iSet.id].append(iSet) + else: + dId2SetList[iSet.id] = [iSet] + return dId2SetList + + getDictOfListsWithIdAsKey = staticmethod( getDictOfListsWithIdAsKey ) + + + ## Split a Set list in several Set lists according to the identifier + # + # @param lSets list of Set instances + # @return a dictionary which keys are identifiers and values Set lists + # + def getDictOfListsWithIdAsKeyFromFile( setFile ): + dId2SetList = {} + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromTuple( line[:-1].split("\t") ) + if not dId2SetList.has_key( iSet.id ): + dId2SetList[ iSet.id ] = [] + dId2SetList[ iSet.id ].append( iSet ) + setFileHandler.close() + return dId2SetList + + getDictOfListsWithIdAsKeyFromFile = staticmethod( getDictOfListsWithIdAsKeyFromFile ) + + + ## Return a Map list from the given Set List + # + # @param lSets list of Set instances + # + def getMapListFromSetList(lSets): + lMaps = [] + for iSet in lSets: + lMaps.append(iSet.set2map()) + return lMaps + + getMapListFromSetList = staticmethod( getMapListFromSetList ) + + ## Construct a Set list from a Map list + # + # @param lMaps list of Map instances + # + def getSetListFromMapList(lMaps): + lSets = [] + c = 0 + for iMap in lMaps: + c += 1 + lSets.append( Set(c, iMap.name, iMap.seqname, iMap.start, iMap.end) ) + return lSets + + getSetListFromMapList = staticmethod( getSetListFromMapList ) + + ## Merge all overlapping Set instances in a list without considering the identifiers. + # Start by sorting Set instances by their increasing Min coordinate. + # + # @return: a new list of the merged Set instances + # + def mergeSetsInList(lSets): + l=[] + if len(lSets)==0: + return l + + lSortedSets = SetUtils.getSetListSortedByIncreasingMinThenInvLength( lSets ) + + prev_count = 0 + for iSet in lSortedSets[0:]: + if prev_count != len(lSortedSets): + for i in lSortedSets[ prev_count + 1: ]: + if iSet.isOverlapping( i ): + iSet.merge( i ) + IsAlreadyInList = False + for newSet in l: + if newSet.isOverlapping( iSet ): + IsAlreadyInList = True + newSet.merge( iSet ) + l [ l.index( newSet ) ] = newSet + if not IsAlreadyInList: + l.append( iSet ) + prev_count += 1 + return l + + mergeSetsInList = staticmethod( mergeSetsInList ) + + ## Unjoin a Set list according to another + # + # @param lToKeep: a list of Set instances to keep + # @param lToUnjoin: a list of Set instances to unjoin + # @return: lToUnjoin split in several list + # + def getSetListUnjoined(lToKeep, lToUnjoin): + lSortedToKeep = SetUtils.getSetListSortedByIncreasingMinThenMax( lToKeep ) + lSortedToUnjoin = SetUtils.getSetListSortedByIncreasingMinThenMax( lToUnjoin ) + if lSortedToUnjoin == []: + return [] + if lSortedToKeep == []: + return [ lSortedToUnjoin ] + + i=0 + resultListSet=[] + while i<len(lSortedToKeep): + j1=0 + while j1<len(lSortedToUnjoin) and lSortedToKeep[i].getMin() > lSortedToUnjoin[j1].getMax(): + j1+=1 + if j1==len(lSortedToUnjoin): + break + if j1!=0: + resultListSet.append(lSortedToUnjoin[:j1]) + del lSortedToUnjoin[:j1] + j1=0 + if i+1==len(lSortedToKeep): + break + j2=j1 + if j2<len(lSortedToUnjoin) and lSortedToKeep[i+1].getMin() > lSortedToUnjoin[j2].getMax(): + while j2<len(lSortedToUnjoin) and lSortedToKeep[i+1].getMin() > lSortedToUnjoin[j2].getMax(): + j2+=1 + resultListSet.append(lSortedToUnjoin[j1:j2]) + del lSortedToUnjoin[j1:j2] + i+=1 + + if resultListSet!=[] or i == 0: + resultListSet.append(lSortedToUnjoin) + return resultListSet + + getSetListUnjoined = staticmethod(getSetListUnjoined) + + ## Return new list of Set instances with no duplicate + # + # @param lSets list of Set instances + # + def getSetListWithoutDuplicates( lSets ): + if len(lSets) < 2: + return lSets + lSortedSet = SetUtils.getSetListSortedByIncreasingMinThenMax( lSets ) + lUniqSet = [ lSortedSet[0] ] + for iSet in lSortedSet[1:]: + if iSet != lUniqSet[-1]: + lUniqSet.append( iSet ) + return lUniqSet + + getSetListWithoutDuplicates = staticmethod( getSetListWithoutDuplicates ) + + ## Return a list of Set instances sorted in increasing order according to the Min, then the Max, and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedByIncreasingMinThenMax( lSets ): + return sorted( lSets, key=lambda iSet: ( iSet.getMin(), iSet.getMax() ) ) + + getSetListSortedByIncreasingMinThenMax = staticmethod( getSetListSortedByIncreasingMinThenMax ) + + ## Return a list of Set instances sorted in increasing order according to the min, then the inverse of the length, and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedByIncreasingMinThenInvLength( lSets ): + return sorted( lSets, key=lambda iSet: ( iSet.getMin(), 1 / float(iSet.getLength()) ) ) + + getSetListSortedByIncreasingMinThenInvLength = staticmethod( getSetListSortedByIncreasingMinThenInvLength ) + + ## Return a list of Set instances sorted in increasing order according to the SeqName, then the Name, then the Min, then the Max and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedBySeqThenRegionThenMinThenMax(lSets): + return sorted(lSets, key=lambda iSet: (iSet.getSeqname(), iSet.getName(), iSet.getMin(), iSet.getMax())) + + getSetListSortedBySeqThenRegionThenMinThenMax = staticmethod(getSetListSortedBySeqThenRegionThenMinThenMax) + + ## Return a list of identifier lists of overlapping Sets from the subject list, according to the reference list + # + # @param lRef list of Set instances + # @param lSubject list of Set instances + # + def getListOfIdListOfOverlappingSets(lRef,lSubject): + lSortedRef = SetUtils.getSetListSortedByIncreasingMinThenMax( lRef ) + lSortedSubject = SetUtils.getSetListSortedByIncreasingMinThenMax( lSubject ) + + lOverlappingSet = [] + lOverlappingSetCounter = 0 + + id2LOverlappingSet_pos = {} + + i = 0 + j = 0 + while i!= len(lSortedRef): + while j!= len(lSortedSubject) and lSortedRef[i].getMin()>lSortedSubject[j].getMax()\ + and not(lSortedRef[i].isOverlapping(lSortedSubject[j])\ + and lSortedRef[i].isOnDirectStrand()==lSortedSubject[j].isOnDirectStrand()): + j+=1 + jj=j + while jj!= len(lSortedSubject) and lSortedRef[i].isOverlapping(lSortedSubject[jj])\ + and lSortedRef[i].isOnDirectStrand()==lSortedSubject[jj].isOnDirectStrand(): + id1=lSortedRef[i].id + id2=lSortedSubject[jj].id*-1 + if id2LOverlappingSet_pos.has_key(id1) \ + and not id2LOverlappingSet_pos.has_key(id2): + lOverlappingSet[id2LOverlappingSet_pos[id1]].append(id2) + id2LOverlappingSet_pos[id2]=id2LOverlappingSet_pos[id1] + if id2LOverlappingSet_pos.has_key(id2) \ + and not id2LOverlappingSet_pos.has_key(id1): + lOverlappingSet[id2LOverlappingSet_pos[id2]].append(id1) + id2LOverlappingSet_pos[id1]=id2LOverlappingSet_pos[id2] + if not id2LOverlappingSet_pos.has_key(id2) \ + and not id2LOverlappingSet_pos.has_key(id1): + lOverlappingSet.append([id1,id2]) + id2LOverlappingSet_pos[id1]=lOverlappingSetCounter + id2LOverlappingSet_pos[id2]=lOverlappingSetCounter + lOverlappingSetCounter+=1 + jj+=1 + i+=1 + + return lOverlappingSet + + getListOfIdListOfOverlappingSets = staticmethod (getListOfIdListOfOverlappingSets) + + ## Return a list of sets without overlapping between two lists of sets + # + # @param lSet1 and lSet2 + # + def getListOfSetWithoutOverlappingBetweenTwoListOfSet(lSet1, lSet2): + for i in lSet1: + for idx,j in enumerate(lSet2): + n=j.diff(i) + if not n.isEmpty() and n.getLength()>=20: + lSet2.append(n) + lSet2WithoutOverlaps=[] + for i in lSet2: + if not i.isEmpty() and i.getLength()>=20: + lSet2WithoutOverlaps.append(i) + return lSet2WithoutOverlaps + + getListOfSetWithoutOverlappingBetweenTwoListOfSet = staticmethod (getListOfSetWithoutOverlappingBetweenTwoListOfSet) + + ## Return a Set list from a Set file + # + # @param setFile string name of a Set file + # @return a list of Set instances + # + def getSetListFromFile( setFile ): + lSets = [] + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromString( line ) + lSets.append( iSet ) + setFileHandler.close() + return lSets + + getSetListFromFile = staticmethod( getSetListFromFile ) + + + def convertSetFileIntoMapFile( setFile, mapFile ): + setFileHandler = open( setFile, "r" ) + mapFileHandler = open( mapFile, "w" ) + iSet = Set() + while True: + line = setFileHandler.readline() + if line == "": + break + iSet.setFromString( line ) + iMap = iSet.getMapInstance() + iMap.write( mapFileHandler ) + setFileHandler.close() + mapFileHandler.close() + + convertSetFileIntoMapFile = staticmethod( convertSetFileIntoMapFile ) + + + def getDictOfListsWithSeqnameAsKey( lSets ): + dSeqnamesToSetList = {} + for iSet in lSets: + if not dSeqnamesToSetList.has_key( iSet.seqname ): + dSeqnamesToSetList[ iSet.seqname ] = [] + dSeqnamesToSetList[ iSet.seqname ].append( iSet ) + return dSeqnamesToSetList + + getDictOfListsWithSeqnameAsKey = staticmethod( getDictOfListsWithSeqnameAsKey ) + + + def filterOnLength( lSets, minLength=0, maxLength=10000000000 ): + if minLength == 0 and maxLength == 0: + return lSets + lFiltered = [] + for iSet in lSets: + if minLength <= iSet.getLength() <= maxLength: + lFiltered.append( iSet ) + return lFiltered + + filterOnLength = staticmethod( filterOnLength ) + + + def getListOfNames( setFile ): + lNames = [] + setFileHandler = open( setFile, "r" ) + iSet = Set() + while True: + line = setFileHandler.readline() + if line == "": + break + iSet.setFromTuple( line[:-1].split("\t") ) + if iSet.name not in lNames: + lNames.append( iSet.name ) + setFileHandler.close() + return lNames + + getListOfNames = staticmethod( getListOfNames ) + + + def getDictOfDictsWithNamesThenIdAsKeyFromFile( setFile ): + dNames2DictsId = {} + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromTuple( line[:-1].split("\t") ) + if not dNames2DictsId.has_key( iSet.name ): + dNames2DictsId[ iSet.name ] = { iSet.id: [ iSet ] } + else: + if not dNames2DictsId[ iSet.name ].has_key( iSet.id ): + dNames2DictsId[ iSet.name ][ iSet.id ] = [ iSet ] + else: + dNames2DictsId[ iSet.name ][ iSet.id ].append( iSet ) + setFileHandler.close() + return dNames2DictsId + + getDictOfDictsWithNamesThenIdAsKeyFromFile = staticmethod( getDictOfDictsWithNamesThenIdAsKeyFromFile )