teiso: TEisotools-1.1.a/commons/core/coord/PathUtils.py comparison

comparison TEisotools-1.1.a/commons/core/coord/PathUtils.py @ 13:feef9a0db09d draft

Uploaded

author	urgi-team
date	Wed, 20 Jul 2016 09:04:42 -0400
parents
children

comparison

equal deleted inserted replaced

-:22b0494ec883
+:feef9a0db09d
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and,  more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+import os
+import sys
+import copy
+from commons.core.coord.Map import Map
+from commons.core.coord.Path import Path
+from commons.core.coord.Align import Align
+from commons.core.coord.Range import Range
+from commons.core.coord.SetUtils import SetUtils
+from commons.core.coord.AlignUtils import AlignUtils
+from commons.core.checker.RepetException import RepetDataException
## Static methods for the manipulation of Path instances
#
# All methods are static: the class is only used as a namespace.
# The code is written so that it runs unchanged on Python 2 and Python 3.
#
class PathUtils(object):

    ## Change the identifier of each Path instance in the given list
    #
    # @param lPaths list of Path instances (modified in place)
    # @param newId new identifier
    #
    @staticmethod
    def changeIdInList(lPaths, newId):
        for iPath in lPaths:
            iPath.id = newId

    ## Return a list of Set instances containing the query range from a list of Path instances
    #
    # @param lPaths a list of Path instances
    # @return list of the objects returned by Path.getSubjectAsSetOfQuery(), in input order
    #
    @staticmethod
    def getSetListFromQueries(lPaths):
        return [iPath.getSubjectAsSetOfQuery() for iPath in lPaths]

    ## Return a list of Set instances containing the subject range from a list of Path instances
    #
    # @param lPaths a list of Path instances
    # @return list of the objects returned by Path.getQuerySetOfSubject(), in input order
    #
    @staticmethod
    def getSetListFromSubjects(lPaths):
        return [iPath.getQuerySetOfSubject() for iPath in lPaths]

    ## Return a sorted list of Range instances containing the subjects from a list of Path instances
    #
    # @param lPaths a list of Path instances
    # @return list of subject ranges, sorted by (min, max) when the first range is on the
    #         direct strand, by (max, min) otherwise; empty list if lPaths is empty
    # @note meaningful only if all Path instances have same identifier
    #
    @staticmethod
    def getRangeListFromSubjects(lPaths):
        lRanges = [iPath.range_subject for iPath in lPaths]
        if not lRanges:
            # nothing to sort, and no first element to read the strand from
            return []
        if lRanges[0].isOnDirectStrand():
            return sorted(lRanges, key=lambda iRange: (iRange.getMin(), iRange.getMax()))
        return sorted(lRanges, key=lambda iRange: (iRange.getMax(), iRange.getMin()))

    ## Return a tuple with min and max of query coordinates from Path instances in the given list
    #
    # @param lPaths a list of Path instances
    # @return tuple (min, max); (-1, -1) if the list is empty
    #
    @staticmethod
    def getQueryMinMaxFromPathList(lPaths):
        qmin = -1
        qmax = -1
        for iPath in lPaths:
            iRange = iPath.range_query
            if qmin == -1:
                # -1 is the "not initialised yet" marker
                qmin = iRange.start
            qmin = min(qmin, iRange.getMin())
            qmax = max(qmax, iRange.getMax())
        return (qmin, qmax)

    ## Return a tuple with min and max of subject coordinates from Path instances in the given list
    #
    # @param lPaths a list of Path instances
    # @return tuple (min, max); (-1, -1) if the list is empty
    #
    @staticmethod
    def getSubjectMinMaxFromPathList(lPaths):
        smin = -1
        smax = -1
        for iPath in lPaths:
            iRange = iPath.range_subject
            if smin == -1:
                # -1 is the "not initialised yet" marker
                smin = iRange.start
            smin = min(smin, iRange.getMin())
            smax = max(smax, iRange.getMax())
        return (smin, smax)

    ## Return the Paths of a list whose chain does not overlap, on the query, any chain of a reference list
    #
    # Paths are grouped by query name, then by identifier (chain). A chain of
    # lPathsToClean is dropped as soon as one of its Paths overlaps, on the query,
    # a Path of any reference chain having the same query name. Paths whose query
    # name is absent from the reference list are always kept.
    #
    # WARNING: input Path lists are modified (sorted in place)
    #
    # @param lRefPaths list of paths to check overlaps
    # @param lPathsToClean list of paths to remove overlapping Paths on query coordinates
    # @return path list (lPathsToClean itself, unsorted, if lRefPaths is empty)
    #
    @staticmethod
    def removeOverlappingPathsOnQueriesBetweenPathLists(lRefPaths, lPathsToClean):
        if not lRefPaths:
            print("WARNING: empty reference Paths list")
            return lPathsToClean

        sRefQueries = set(PathUtils.getListOfDistinctQueryNames(lRefPaths))
        sToCleanQueries = set(PathUtils.getListOfDistinctQueryNames(lPathsToClean))
        lCommonQueries = sorted(sRefQueries & sToCleanQueries)
        sSpecificToCleanQueries = sToCleanQueries - sRefQueries

        sortKey = lambda iPath: (iPath.getQueryName(), iPath.getIdentifier(), iPath.getQueryMin(), iPath.getQueryMax())
        lRefPaths.sort(key=sortKey)
        lPathsToClean.sort(key=sortKey)

        # queries unknown to the reference cannot overlap anything: keep them all
        lCleanedPaths = PathUtils.extractPathsFromQueryNameList(lPathsToClean, sSpecificToCleanQueries)

        dRefQueryToPathList = PathUtils.getDictOfListsWithQueryNameAsKey(lRefPaths)
        dToCleanQueryToPathList = PathUtils.getDictOfListsWithQueryNameAsKey(lPathsToClean)
        for queryName in lCommonQueries:
            lRefChains = list(PathUtils.getDictOfListsWithIdAsKey(dRefQueryToPathList[queryName]).values())
            dToCleanChains = PathUtils.getDictOfListsWithIdAsKey(dToCleanQueryToPathList[queryName])
            for lChainToClean in dToCleanChains.values():
                isOverlapping = False
                for lRefChain in lRefChains:
                    if PathUtils.areQueriesOverlappingBetweenPathLists(lRefChain, lChainToClean, areListsAlreadySort=True):
                        isOverlapping = True
                        break
                if not isOverlapping:
                    lCleanedPaths.extend(lChainToClean)
        return lCleanedPaths

    ## Return True if the query range of any Path instance from the first list overlaps with the query range of any Path instance from the second list
    #
    # @param lPaths1 list of Path instances
    # @param lPaths2 list of Path instances
    # @param areListsAlreadySort kept for backward compatibility only: every pair is
    #        tested, so the answer does not depend on the order of the lists
    # @return boolean
    #
    @staticmethod
    def areQueriesOverlappingBetweenPathLists(lPaths1, lPaths2, areListsAlreadySort=False):
        lQueryRanges2 = [iPath.range_query for iPath in lPaths2]
        for iPath1 in lPaths1:
            iQueryRange1 = iPath1.range_query
            for iQueryRange2 in lQueryRanges2:
                if iQueryRange1.isOverlapping(iQueryRange2):
                    return True
        return False

    ## Show Path instances contained in the given list
    #
    # @param lPaths a list of Path instances
    #
    @staticmethod
    def showList(lPaths):
        for iPath in lPaths:
            iPath.show()

    ## Write Path instances contained in the given list
    #
    # Delegates to AlignUtils.writeListInFile().
    #
    # @param lPaths a list of Path instances
    # @param fileName name of the file to write the Path instances
    # @param mode the open mode of the file "w" or "a"
    #
    @staticmethod
    def writeListInFile(lPaths, fileName, mode="w"):
        AlignUtils.writeListInFile(lPaths, fileName, mode)

    ## Return the (start, end, seqname) triplet of the query range of a Path (private helper)
    #
    @staticmethod
    def _getQueryCoords(iPath):
        iRange = iPath.range_query
        return (iRange.start, iRange.end, iRange.seqname)

    ## Return the (start, end, seqname) triplet of the subject range of a Path (private helper)
    #
    @staticmethod
    def _getSubjectCoords(iPath):
        iRange = iPath.range_subject
        return (iRange.start, iRange.end, iRange.seqname)

    ## Return new list of Path instances with no duplicate
    #
    # Paths are sorted by query min, query max then identifier, and each Path is
    # compared with the last one kept only (adjacent comparison).
    #
    # @param lPaths a list of Path instances
    # @param useOnlyCoord boolean if True, check only coordinates and sequence names
    # @return lUniqPaths a path instances list (lPaths itself if it has less than 2 items)
    #
    @staticmethod
    def getPathListWithoutDuplicates(lPaths, useOnlyCoord=False):
        if len(lPaths) < 2:
            return lPaths
        lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths)
        lUniqPaths = [lSortedPaths[0]]
        for iPath in lSortedPaths[1:]:
            iLastKept = lUniqPaths[-1]
            if useOnlyCoord:
                isDifferent = (PathUtils._getQueryCoords(iPath) != PathUtils._getQueryCoords(iLastKept)
                               or PathUtils._getSubjectCoords(iPath) != PathUtils._getSubjectCoords(iLastKept))
            else:
                isDifferent = iPath != iLastKept
            if isDifferent:
                lUniqPaths.append(iPath)
        return lUniqPaths

    ## Return new list of Path instances with no duplicate on the query (start, end and name)
    #
    # Same adjacent comparison as getPathListWithoutDuplicates(), restricted to the query range.
    #
    # @param lPaths a list of Path instances
    # @return lUniqPaths a path instances list (lPaths itself if it has less than 2 items)
    #
    @staticmethod
    def getPathListWithoutDuplicatesOnQueryCoord(lPaths):
        if len(lPaths) < 2:
            return lPaths
        lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths)
        lUniqPaths = [lSortedPaths[0]]
        for iPath in lSortedPaths[1:]:
            if PathUtils._getQueryCoords(iPath) != PathUtils._getQueryCoords(lUniqPaths[-1]):
                lUniqPaths.append(iPath)
        return lUniqPaths

    ## Split a Path list in several Path lists according to the identifier
    #
    # @param lPaths a list of Path instances
    # @return a dictionary which keys are identifiers and values Path lists
    #
    @staticmethod
    def getDictOfListsWithIdAsKey(lPaths):
        dId2PathList = dict((ident, []) for ident in PathUtils.getListOfDistinctIdentifiers(lPaths))
        for iPath in lPaths:
            dId2PathList[iPath.id].append(iPath)
        return dId2PathList

    ## Split a Path list in several Path lists according to the query name
    #
    # @param lPaths a list of Path instances
    # @return a dictionary which keys are query_names and values Path lists
    #
    @staticmethod
    def getDictOfListsWithQueryNameAsKey(lPaths):
        dQueryName2PathList = dict((queryName, []) for queryName in PathUtils.getListOfDistinctQueryNames(lPaths))
        for iPath in lPaths:
            dQueryName2PathList[iPath.getQueryName()].append(iPath)
        return dQueryName2PathList

    ## Split a Path file in several Path lists according to the identifier
    #
    # @param pathFile name of the input Path file
    # @return a dictionary which keys are identifiers and values Path lists
    #
    @staticmethod
    def getDictOfListsWithIdAsKeyFromFile(pathFile):
        dId2PathList = {}
        with open(pathFile, "r") as pathFileHandler:
            for line in pathFileHandler:
                iPath = Path()
                iPath.setFromString(line)
                dId2PathList.setdefault(iPath.id, []).append(iPath)
        return dId2PathList

    ## Return a list of Path list(s) obtained while splitting a list of connected Path instances according to another based on query coordinates
    #
    # Only the path instances of lToKeep located between path instances of lToUnjoin are used to split lToUnjoin
    #
    # @param lToKeep a list of Path instances to keep (reference)
    # @param lToUnjoin a list of Path instances to unjoin
    # @return list of Path list(s) (can be empty if one of the input lists is empty)
    # @warning all the path instances in a given list MUST be connected (i.e. same identifier)
    # @warning if the path instances in a given list overlap neither within each other nor with the Path instances of the other list, these path instances are not used to split the lToUnjoin
    #
    @staticmethod
    def getPathListUnjoinedBasedOnQuery(lToKeep, lToUnjoin):
        lSortedToKeep = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery(lToKeep)
        lSortedToUnjoin = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery(lToUnjoin)
        if lToUnjoin == []:
            return []
        if lToKeep == []:
            return [lToUnjoin]

        nbToKeep = len(lSortedToKeep)
        lLists = []
        k = 0
        while k < nbToKeep:
            # Paths to unjoin lying entirely before the k-th path to keep.
            # The live length is used because the list shrinks as groups are cut off.
            minQueryOfToKeepK = lSortedToKeep[k].range_query.getMin()
            j1 = 0
            while j1 < len(lSortedToUnjoin) and minQueryOfToKeepK > lSortedToUnjoin[j1].range_query.getMax():
                j1 += 1
            if j1 == len(lSortedToUnjoin):
                # everything left is before the k-th path to keep: no more split
                break
            if j1 != 0:
                lLists.append(lSortedToUnjoin[:j1])
                del lSortedToUnjoin[:j1]
            if k + 1 == nbToKeep:
                break

            # Paths to unjoin lying entirely before the (k+1)-th path to keep.
            minQueryOfToKeepKplus1 = lSortedToKeep[k + 1].range_query.getMin()
            j2 = 0
            while j2 < len(lSortedToUnjoin) and minQueryOfToKeepKplus1 > lSortedToUnjoin[j2].range_query.getMax():
                j2 += 1
            if j2 != 0:
                lLists.append(lSortedToUnjoin[:j2])
                del lSortedToUnjoin[:j2]
            k += 1

        if lLists != [] or k == 0:
            if lSortedToUnjoin:
                # remaining Paths form the last group
                lLists.append(lSortedToUnjoin)
        else:
            # NOTE(review): historical behaviour kept as is -- when no split happened
            # and several paths to keep were scanned, a flat list of Path instances
            # is returned instead of a list of lists. Confirm with callers before changing.
            lLists = lSortedToUnjoin
        return lLists

    ## Write an error message on the standard error and flush it (private helper)
    #
    @staticmethod
    def _writeError(msg):
        sys.stderr.write("%s\n" % msg)
        sys.stderr.flush()

    ## Return the identity of the Path list, the identity of each instance being weighted by the length of each query range
    #
    # All Paths should have the same query and subject.
    # The Paths are merged using query coordinates only.
    #
    # @param lPaths list of Path instances
    # @param checkSubjects boolean if True, also require a single subject name
    # @return the weighted identity
    # @throw Exception if several queries (or subjects, when checked) are present,
    #        or if the weighted identity is negative
    # @throw RepetDataException if the weighted identity is above 100
    # @warning the Path instances of lPaths are modified by the merge
    # @note NOTE(review): an empty lPaths presumably ends in a division by zero,
    #       depending on what SetUtils.getCumulLength() returns for an empty list
    #
    @staticmethod
    def getIdentityFromPathList(lPaths, checkSubjects=True):
        if len(PathUtils.getListOfDistinctQueryNames(lPaths)) > 1:
            msg = "ERROR: try to compute identity from Paths with different queries"
            PathUtils._writeError(msg)
            raise Exception(msg)
        if checkSubjects and len(PathUtils.getListOfDistinctSubjectNames(lPaths)) > 1:
            msg = "ERROR: try to compute identity from Paths with different subjects"
            PathUtils._writeError(msg)
            raise Exception(msg)

        lMergedPaths = PathUtils.mergePathsInListUsingQueryCoordsOnly(lPaths)
        lQuerySets = PathUtils.getSetListFromQueries(lMergedPaths)
        lMergedQuerySets = SetUtils.mergeSetsInList(lQuerySets)
        totalLengthOnQry = SetUtils.getCumulLength(lMergedQuerySets)

        identity = 0
        for iPath in lMergedPaths:
            identity += iPath.identity * iPath.getLengthOnQuery()
        weightedIdentity = identity / float(totalLengthOnQry)

        if weightedIdentity < 0:
            msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
            PathUtils._writeError(msg)
            raise Exception(msg)
        if weightedIdentity > 100:
            msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
            PathUtils._writeError(msg)
            raise RepetDataException(msg)
        return weightedIdentity

    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, and finally their initial order.
    #
    # @param lPaths list of Path instances
    # @return a new sorted list (the input list is not modified)
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQuery(lPaths):
        return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax()))

    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then their identifier, and finally their initial order.
    #
    # @param lPaths list of Path instances
    # @return a new sorted list (the input list is not modified)
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths):
        return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax(), iPath.getIdentifier()))

    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then the min of the subject, then the max of the subject and finally their initial order.
    #
    # @param lPaths list of Path instances
    # @return a new sorted list (the input list is not modified)
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQueryThenMinSubjectThenMaxSubject(lPaths):
        return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax(), iPath.getSubjectMin(), iPath.getSubjectMax()))

    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the inverse of the query length (i.e. longest first), and finally their initial order
    #
    # The opposite of the length is used as sort key: for positive lengths it gives
    # the same order as the inverse, without any division.
    #
    # @param lPaths list of Path instances
    # @return a new sorted list (the input list is not modified)
    #
    @staticmethod
    def getPathListSortedByIncreasingQueryMinThenInvQueryLength(lPaths):
        return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), -iPath.getLengthOnQuery()))

    ## Return a list of the distinct identifiers
    #
    # @param lPaths list of Path instances
    # @return list of identifiers, in no particular order
    #
    @staticmethod
    def getListOfDistinctIdentifiers(lPaths):
        return list(set(iPath.id for iPath in lPaths))

    ## Return a list of the distinct query names present in the collection
    #
    # @param lPaths list of Path instances
    # @return list of query names, in no particular order
    #
    @staticmethod
    def getListOfDistinctQueryNames(lPaths):
        return list(set(iPath.range_query.seqname for iPath in lPaths))

    ## Return a list of the distinct subject names present in the collection
    #
    # @param lPaths list of Path instances
    # @return list of subject names, in no particular order
    #
    @staticmethod
    def getListOfDistinctSubjectNames(lPaths):
        return list(set(iPath.range_subject.seqname for iPath in lPaths))

    ## Return a list of paths with matching query name
    #
    # @param lPaths list of Path instances
    # @param queryName query name to extract
    # @return list of Path instances, in input order
    #
    @staticmethod
    def extractPathsFromQueryName(lPaths, queryName):
        return [iPath for iPath in lPaths if iPath.getQueryName() == queryName]

    ## Return a list of paths with matching query names
    #
    # @param lPaths list of Path instances
    # @param lQueryNames query name list to extract
    # @return list of Path instances, in input order
    #
    @staticmethod
    def extractPathsFromQueryNameList(lPaths, lQueryNames):
        sQueryNames = set(lQueryNames)
        return [iPath for iPath in lPaths if iPath.getQueryName() in sQueryNames]

    ## Return a list of paths with matching subject name
    #
    # @param lPaths list of Path instances
    # @param subjectName subject name to extract
    # @return list of Path instances, in input order
    #
    @staticmethod
    def extractPathsFromSubjectName(lPaths, subjectName):
        return [iPath for iPath in lPaths if iPath.getSubjectName() == subjectName]

    ## Return a list of paths whose query coordinates overlap a given range
    #
    # @param lPaths list of Path instances
    # @param queryName query name to extract
    # @param start starting position
    # @param end ending position
    # @return list of Path instances
    #
    @staticmethod
    def extractPathsFromQueryCoord(lPaths, queryName, start, end):
        iAlign = Align(range_q=Range(queryName, start, end))
        return [iPath for iPath in PathUtils.extractPathsFromQueryName(lPaths, queryName)
                if iPath.isQueryOverlapping(iAlign)]

    ## Return a list of lists containing query coordinates of the connections sorted in increasing order.
    #
    # @param lConnectedPaths list of Path instances having the same identifier
    # @param minLength threshold below which connections are not reported (default= 0 bp)
    # @return list of [start, end] lists, one per reported connection
    # @note return only connections longer than threshold
    # @note if coordinate on query ends at 100, return 101
    # @note NOTE(review): the tested quantity is "next start - previous end + 1",
    #       i.e. two more than the number of bases strictly between the two Paths;
    #       kept unchanged, confirm whether this is the intended threshold
    # @warning Path instances MUST be sorted in increasing order according to query coordinates
    # @warning Path instances MUST be on direct query strand (and maybe on reverse subject strand)
    #
    @staticmethod
    def getListOfJoinCoordinatesOnQuery(lConnectedPaths, minLength=0):
        lJoinCoordinates = []
        for iPrevPath, iNextPath in zip(lConnectedPaths[:-1], lConnectedPaths[1:]):
            startJoin = iPrevPath.range_query.end
            endJoin = iNextPath.range_query.start
            if endJoin - startJoin + 1 > minLength:
                lJoinCoordinates.append([startJoin + 1, endJoin - 1])
        return lJoinCoordinates

    ## Return the length on the query of all Path instance in the given list
    #
    # @param lPaths list of Path instances
    # @return cumulative length of the merged query sets
    # @note overlapping ranges are not summed but truncated.
    #
    @staticmethod
    def getLengthOnQueryFromPathList(lPaths):
        lSets = PathUtils.getSetListFromQueries(lPaths)
        lMergedSets = SetUtils.mergeSetsInList(lSets)
        return SetUtils.getCumulLength(lMergedSets)

    ## Convert a Path file into an Align file
    #
    # @param pathFile name of the input Path file
    # @param alignFile name of the output Align file
    #
    @staticmethod
    def convertPathFileIntoAlignFile(pathFile, alignFile):
        iPath = Path()
        with open(pathFile, "r") as pathFileHandler:
            with open(alignFile, "w") as alignFileHandler:
                for line in pathFileHandler:
                    iPath.setFromString(line)
                    iAlign = iPath.getAlignInstance()
                    iAlign.write(alignFileHandler)

    #TODO: duplicated method => to rename with the name of the next method (which is called) ?
    ## Convert a Path File into a Map file with query coordinates only
    #
    # Each tab-separated input line is written with Path.writeSubjectAsMapOfQuery().
    #
    # @param pathFile name of the input Path file
    # @param mapFile name of the output Map file
    #
    @staticmethod
    def convertPathFileIntoMapFileWithQueryCoordsOnly(pathFile, mapFile):
        iPath = Path()
        with open(pathFile, "r") as pathFileHandler:
            with open(mapFile, "w") as mapFileHandler:
                for line in pathFileHandler:
                    iPath.reset()
                    iPath.setFromTuple(line.split("\t"))
                    iPath.writeSubjectAsMapOfQuery(mapFileHandler)

    ## For each line of a given Path file, write the coordinates of the subject on the query as one line in a Map file
    #
    # Alias of convertPathFileIntoMapFileWithQueryCoordsOnly().
    #
    # @param pathFile name of the input Path file
    # @param mapFile name of the output Map file
    #
    @staticmethod
    def convertPathFileIntoMapFileWithSubjectsOnQueries(pathFile, mapFile):
        PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly(pathFile, mapFile)

    ## Merge matches on queries
    #
    # The Path file is converted into "<inFile>.map", the external program "mapOp"
    # is run on it with options "-q" and "-m", then its output "<inFile>.map.merge"
    # is converted into the output Path file. Both temporary files are removed.
    #
    # @param inFile name of the input Path file
    # @param outFile name of the output Path file
    # @note the process exits with status 1 if "mapOp" returns a non-zero status
    # @note NOTE(review): the redirection "2>&1 > /dev/null" only silences the standard
    #       output of mapOp; kept byte for byte, confirm it is intended
    #
    @staticmethod
    def mergeMatchesOnQueries(inFile, outFile):
        mapFile = "%s.map" % inFile
        PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly(inFile, mapFile)
        cmd = "mapOp"
        cmd += " -q %s" % mapFile
        cmd += " -m"
        cmd += " 2>&1 > /dev/null"
        exitStatus = os.system(cmd)
        if exitStatus != 0:
            print("ERROR: mapOp returned %i" % exitStatus)
            sys.exit(1)
        os.remove(mapFile)

        mergeFile = "%s.merge" % mapFile
        iMap = Map()
        with open(mergeFile, "r") as mergeFileHandler:
            with open(outFile, "w") as outFileHandler:
                for line in mergeFileHandler:
                    iMap.reset()
                    iMap.setFromString(line, "\t")
                    iMap.writeAsQueryOfPath(outFileHandler)
        os.remove(mergeFile)

    ## Filter chains of Path(s) which length is below a given threshold
    #
    # @param lPaths list of Path instances
    # @param minLengthChain minimum length of a chain to be kept
    # @note a chain may contain a single Path instance
    # @return a list of Path instances
    #
    @staticmethod
    def filterPathListOnChainLength(lPaths, minLengthChain):
        lFilteredPaths = []
        for lChain in PathUtils.getDictOfListsWithIdAsKey(lPaths).values():
            if PathUtils.getLengthOnQueryFromPathList(lChain) >= minLengthChain:
                lFilteredPaths += lChain
        return lFilteredPaths

    ## Return a Path list from a Path file
    #
    # @param pathFile string name of a Path file
    # @return a list of Path instances, one per line of the file
    #
    @staticmethod
    def getPathListFromFile(pathFile):
        lPaths = []
        with open(pathFile, "r") as pathFileHandler:
            for line in pathFileHandler:
                iPath = Path()
                iPath.setFromString(line)
                lPaths.append(iPath)
        return lPaths

    ## Convert a chain into a 'pathrange'
    #
    # The result spans all the Paths of the chain: extreme coordinates, smallest
    # e-value, sum of the scores, identity weighted by the length on the query.
    #
    # @param lPaths a list of Path instances with the same identifier
    # @return None if the list is empty, its single item (not a copy) if it has one
    #         item, otherwise a new Path instance
    # @note the min and max of each Path is used
    # @note the process exits with status 1 if two Paths have different identifiers
    #       or are on different subject strands
    #
    @staticmethod
    def convertPathListToPathrange(lPaths):
        if len(lPaths) == 0:
            return None
        if len(lPaths) == 1:
            return lPaths[0]

        iPathrange = copy.deepcopy(lPaths[0])
        # identity is accumulated as a length-weighted sum, normalised at the end
        iPathrange.identity = lPaths[0].identity * lPaths[0].getLengthOnQuery()
        cumulQueryLength = iPathrange.getLengthOnQuery()
        for iPath in lPaths[1:]:
            if iPath.id != iPathrange.id:
                msg = "ERROR: two Path instances in the chain have different identifiers"
                sys.stderr.write("%s\n" % (msg))
                sys.exit(1)
            if iPathrange.range_subject.isOnDirectStrand() != iPath.range_subject.isOnDirectStrand():
                msg = "ERROR: two Path instances in the chain are on different strands"
                sys.stderr.write("%s\n" % (msg))
                sys.exit(1)
            iPathrange.range_query.start = min(iPathrange.range_query.start, iPath.range_query.start)
            iPathrange.range_query.end = max(iPathrange.range_query.end, iPath.range_query.end)
            if iPathrange.range_subject.isOnDirectStrand():
                iPathrange.range_subject.start = min(iPathrange.range_subject.start, iPath.range_subject.start)
                iPathrange.range_subject.end = max(iPathrange.range_subject.end, iPath.range_subject.end)
            else:
                # reverse strand: start is the highest coordinate, end the lowest
                iPathrange.range_subject.start = max(iPathrange.range_subject.start, iPath.range_subject.start)
                iPathrange.range_subject.end = min(iPathrange.range_subject.end, iPath.range_subject.end)
            iPathrange.e_value = min(iPathrange.e_value, iPath.e_value)
            iPathrange.score += iPath.score
            iPathrange.identity += iPath.identity * iPath.getLengthOnQuery()
            cumulQueryLength += iPath.getLengthOnQuery()
        iPathrange.identity = iPathrange.identity / float(cumulQueryLength)
        return iPathrange

    ## Convert a Path file into an Align file via 'pathrange'
    #
    # One Align line is written per chain, chains being taken by increasing identifier.
    #
    # @param pathFile name of the input Path file
    # @param alignFile name of the output Align file
    # @param verbose integer verbosity level
    # @note the min and max of each Path is used
    #
    @staticmethod
    def convertPathFileIntoAlignFileViaPathrange(pathFile, alignFile, verbose=0):
        lPaths = PathUtils.getPathListFromFile(pathFile)
        dId2PathList = PathUtils.getDictOfListsWithIdAsKey(lPaths)
        lIds = sorted(dId2PathList.keys())
        if verbose > 0:
            msg = "number of chains: %i" % (len(lIds))
            sys.stdout.write("%s\n" % (msg))
            sys.stdout.flush()
        with open(alignFile, "w") as alignFileHandler:
            for identifier in lIds:
                iPath = PathUtils.convertPathListToPathrange(dId2PathList[identifier])
                iAlign = iPath.getAlignInstance()
                iAlign.write(alignFileHandler)

    ## Split a list of Path instances according to the name of the query
    #
    # @param lInPaths list of Path instances
    # @return lOutPathLists list of Path lists, one per query name, sorted by query name
    #
    @staticmethod
    def splitPathListByQueryName(lInPaths):
        lInSortedPaths = sorted(lInPaths, key=lambda iPath: iPath.range_query.seqname)
        lOutPathLists = []
        for iPath in lInSortedPaths:
            # open a new group each time the query name changes
            if not lOutPathLists or lOutPathLists[-1][0].range_query.seqname != iPath.range_query.seqname:
                lOutPathLists.append([])
            lOutPathLists[-1].append(iPath)
        return lOutPathLists

    ## Create a Path file from each list of Path instances in the input list
    #
    # Files are named "<dirName>/<pattern>_<number>.path", the number starting at 1
    # and being zero-padded to the width of the number of files.
    #
    # @param lPathList list of lists with Path instances
    # @param pattern string prefix of the file names
    # @param dirName string output directory, created if it does not exist yet
    #        (current directory if empty)
    #
    @staticmethod
    def createPathFiles(lPathList, pattern, dirName=""):
        if dirName != "":
            if dirName[-1] != "/":
                dirName = dirName + '/'
            if not os.path.isdir(dirName):
                os.mkdir(dirName)
        nbDigits = len(str(len(lPathList)))
        for countFile, lPath in enumerate(lPathList, 1):
            fileName = dirName + pattern + "_%s.path" % (str(countFile).zfill(nbDigits))
            PathUtils.writeListInFile(lPath, fileName)

    ## Merge all Path instances of a list that overlap according to a given Path method (private helper)
    #
    # @param lPaths list of Path instances
    # @param overlapMethodName name of the Path method telling if two Paths overlap
    # @return a new list with the merged Path instances
    # @warning Path instances of lPaths are modified by Path.merge()
    #
    @staticmethod
    def _mergeOverlappingPaths(lPaths, overlapMethodName):
        lMergedPaths = []
        lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength(lPaths)
        for index, iPath in enumerate(lSortedPaths):
            # first absorb every following Path overlapping the current one...
            isOverlappingCurrent = getattr(iPath, overlapMethodName)
            for iNextPath in lSortedPaths[index + 1:]:
                if isOverlappingCurrent(iNextPath):
                    iPath.merge(iNextPath)
            # ...then fold the current Path into the already merged ones it overlaps
            isAlreadyInList = False
            for iMergedPath in lMergedPaths:
                if getattr(iMergedPath, overlapMethodName)(iPath):
                    isAlreadyInList = True
                    iMergedPath.merge(iPath)
            if not isAlreadyInList:
                lMergedPaths.append(iPath)
        return lMergedPaths

    ## Merge all overlapping Path instances in a list without considering the identifiers
    #
    # Start by sorting the Path instances by their increasing min coordinate.
    # Overlaps are tested with Path.isOverlapping().
    #
    # @param lPaths list of Path instances
    # @return a new list with the merged Path instances
    # @warning Path instances of lPaths are modified by the merge
    #
    @staticmethod
    def mergePathsInList(lPaths):
        return PathUtils._mergeOverlappingPaths(lPaths, "isOverlapping")

    ## Merge all overlapping Path instances in a list without considering if subjects are overlapping.
    #
    # Start by sorting the Path instances by their increasing min coordinate.
    # Overlaps are tested with Path.isQueryOverlapping().
    #
    # @param lPaths list of Path instances
    # @return a new list with the merged Path instances
    # @warning Path instances of lPaths are modified by the merge
    #
    @staticmethod
    def mergePathsInListUsingQueryCoordsOnly(lPaths):
        return PathUtils._mergeOverlappingPaths(lPaths, "isQueryOverlapping")

    ## Convert a Path file into a GFF file
    #
    # A chain made of a single Path gives one line. A longer chain gives one line
    # for its 'pathrange' (ID "ms<id>") followed by one "match_part" line per Path
    # (ID "mp<id>-<rank>", Parent "ms<id>").
    #
    # @param pathFile name of the input Path file
    # @param gffFile name of the output GFF file
    # @param source source to write in the GFF file (column 2)
    # @param verbose integer verbosity level
    #
    # @note the 'path' query is supposed to correspond to the 'gff' first column
    #
    @staticmethod
    def convertPathFileIntoGffFile(pathFile, gffFile, source="REPET", verbose=0):
        dId2PathList = PathUtils.getDictOfListsWithIdAsKeyFromFile(pathFile)
        if verbose > 0:
            msg = "number of chains: %i" % (len(dId2PathList.keys()))
            sys.stdout.write("%s\n" % msg)
            sys.stdout.flush()
        with open(gffFile, "w") as gffFileHandler:
            for lChain in dId2PathList.values():
                if len(lChain) == 1:
                    iPath = lChain[0]
                    string = iPath.toStringAsGff(ID="%i" % iPath.getIdentifier(),
                                                 source=source)
                    gffFileHandler.write("%s\n" % string)
                else:
                    iPathrange = PathUtils.convertPathListToPathrange(lChain)
                    string = iPathrange.toStringAsGff(ID="ms%i" % iPathrange.getIdentifier(),
                                                      source=source)
                    gffFileHandler.write("%s\n" % string)
                    for count, iPath in enumerate(lChain, 1):
                        string = iPath.toStringAsGff(type="match_part",
                                                     ID="mp%i-%i" % (iPath.getIdentifier(), count),
                                                     Parent="ms%i" % iPathrange.getIdentifier(),
                                                     source=source)
                        gffFileHandler.write("%s\n" % string)

    ## Convert a Path file into a Set file
    #
    # Replace old parser.pathrange2set
    #
    # @param pathFile name of the input Path file
    # @param setFile name of the output Set file
    #
    @staticmethod
    def convertPathFileIntoSetFile(pathFile, setFile):
        iPath = Path()
        with open(pathFile, "r") as pathFileHandler:
            with open(setFile, "w") as setFileHandler:
                for line in pathFileHandler:
                    iPath.setFromString(line)
                    iSet = iPath.getSubjectAsSetOfQuery()
                    iSet.write(setFileHandler)

    ## Write Path File without duplicated Path (same query, same subject and same coordinate)
    #
    # Consecutive lines sharing the same query name and subject name are gathered,
    # deduplicated on their query coordinates, then appended to the output file.
    #
    # @param inputFile name of the input Path file
    # @param outputFile name of the output Path file (opened in append mode)
    #
    @staticmethod
    def removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName(inputFile, outputFile):
        previousQuery = ""
        previousSubject = ""
        lPaths = []
        with open(inputFile, "r") as inputFileHandler:
            for line in inputFileHandler:
                iPath = Path()
                iPath.setFromString(line)
                query = iPath.getQueryName()
                subject = iPath.getSubjectName()
                if (query != previousQuery or subject != previousSubject) and lPaths != []:
                    # the (query, subject) pair changed: flush the finished group
                    lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)
                    PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")
                    lPaths = []
                lPaths.append(iPath)
                previousQuery = query
                previousSubject = subject
        # flush the last group
        lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)
        PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")

Mercurial > repos > urgi-team > teiso

comparison TEisotools-1.1.a/commons/core/coord/PathUtils.py @ 13:feef9a0db09d draft