| 
6
 | 
     1 #! /usr/bin/env python
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # Copyright INRA-URGI 2009-2010
 | 
| 
 | 
     4 # 
 | 
| 
 | 
     5 # This software is governed by the CeCILL license under French law and
 | 
| 
 | 
     6 # abiding by the rules of distribution of free software. You can use,
 | 
| 
 | 
     7 # modify and/ or redistribute the software under the terms of the CeCILL
 | 
| 
 | 
     8 # license as circulated by CEA, CNRS and INRIA at the following URL
 | 
| 
 | 
     9 # "http://www.cecill.info".
 | 
| 
 | 
    10 # 
 | 
| 
 | 
    11 # As a counterpart to the access to the source code and rights to copy,
 | 
| 
 | 
    12 # modify and redistribute granted by the license, users are provided only
 | 
| 
 | 
    13 # with a limited warranty and the software's author, the holder of the
 | 
| 
 | 
    14 # economic rights, and the successive licensors have only limited
 | 
| 
 | 
    15 # liability.
 | 
| 
 | 
    16 # 
 | 
| 
 | 
    17 # In this respect, the user's attention is drawn to the risks associated
 | 
| 
 | 
    18 # with loading, using, modifying and/or developing or reproducing the
 | 
| 
 | 
    19 # software by the user in light of its specific status of free software,
 | 
| 
 | 
    20 # that may mean that it is complicated to manipulate, and that also
 | 
| 
 | 
    21 # therefore means that it is reserved for developers and experienced
 | 
| 
 | 
    22 # professionals having in-depth computer knowledge. Users are therefore
 | 
| 
 | 
    23 # encouraged to load and test the software's suitability as regards their
 | 
| 
 | 
    24 # requirements in conditions enabling the security of their systems and/or
 | 
| 
 | 
    25 # data to be ensured and, more generally, to use and operate it in the
 | 
| 
 | 
    26 # same conditions as regards security.
 | 
| 
 | 
    27 # 
 | 
| 
 | 
    28 # The fact that you are presently reading this means that you have had
 | 
| 
 | 
    29 # knowledge of the CeCILL license and that you accept its terms.
 | 
| 
 | 
    30 #
 | 
| 
 | 
    31 import os, struct, time, random
 | 
| 
 | 
    32 from optparse import OptionParser
 | 
| 
 | 
    33 from commons.core.parsing.ParserChooser import ParserChooser
 | 
| 
 | 
    34 from commons.core.writer.Gff3Writer import Gff3Writer
 | 
| 
 | 
    35 from SMART.Java.Python.structure.Transcript import Transcript
 | 
| 
 | 
    36 from SMART.Java.Python.structure.Interval import Interval
 | 
| 
 | 
    37 from SMART.Java.Python.ncList.NCList import NCList
 | 
| 
 | 
    38 from SMART.Java.Python.ncList.NCListCursor import NCListCursor
 | 
| 
 | 
    39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
 | 
| 
 | 
    40 from SMART.Java.Python.ncList.NCListHandler import NCListHandler
 | 
| 
 | 
    41 from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList
 | 
| 
 | 
    42 from SMART.Java.Python.misc.Progress import Progress
 | 
| 
 | 
    43 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
 | 
| 
 | 
    44 from SMART.Java.Python.misc import Utils
 | 
| 
 | 
    45 try:
 | 
| 
 | 
    46 	import cPickle as pickle
 | 
| 
 | 
    47 except:
 | 
| 
 | 
    48 	import pickle
 | 
| 
 | 
    49 
 | 
| 
 | 
    50 REFERENCE = 0
 | 
| 
 | 
    51 QUERY = 1
 | 
| 
 | 
    52 TYPES = (REFERENCE, QUERY)
 | 
| 
 | 
    53 TYPETOSTRING = {0: "reference", 1: "query"}
 | 
| 
 | 
    54 
 | 
| 
 | 
    55 class CompareOverlapping(object):
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 	def __init__(self, verbosity = 1):
 | 
| 
 | 
    58 		self._outputFileName		   = "outputOverlaps.gff3"
 | 
| 
 | 
    59 		self._iWriter				   = None
 | 
| 
 | 
    60 		self._nbOverlappingQueries	   = 0
 | 
| 
 | 
    61 		self._nbOverlaps			   = 0
 | 
| 
 | 
    62 		self._nbLines				   = {REFERENCE: 0, QUERY: 0}
 | 
| 
 | 
    63 		self._verbosity				   = verbosity
 | 
| 
 | 
    64 		self._ncLists				   = {}
 | 
| 
 | 
    65 		self._cursors				   = {}
 | 
| 
 | 
    66 		self._splittedFileNames		   = {}
 | 
| 
 | 
    67 		self._nbElements			   = {}
 | 
| 
 | 
    68 		self._nbElementsPerChromosome  = {}
 | 
| 
 | 
    69 		self._inputFileNames		   = {REFERENCE: None,  QUERY: None}
 | 
| 
 | 
    70 		self._inputFileFormats		   = {REFERENCE: None,  QUERY: None}
 | 
| 
 | 
    71 		self._starts				   = {REFERENCE: None, QUERY: None}
 | 
| 
 | 
    72 		self._ends					   = {REFERENCE: None, QUERY: None}
 | 
| 
 | 
    73 		self._fivePrimes			   = {REFERENCE: None, QUERY: None}
 | 
| 
 | 
    74 		self._threePrimes			   = {REFERENCE: None, QUERY: None}
 | 
| 
 | 
    75 		self._ncListHandlers		   = {REFERENCE: None,  QUERY: None}
 | 
| 
 | 
    76 		self._convertedFileNames	   = {REFERENCE: False, QUERY: False}
 | 
| 
 | 
    77 		self._sorted                   = False
 | 
| 
 | 
    78 		self._index                    = False
 | 
| 
 | 
    79 		self._introns				   = False
 | 
| 
 | 
    80 		self._antisense				   = False
 | 
| 
 | 
    81 		self._colinear				   = False
 | 
| 
 | 
    82 		self._invert				   = False
 | 
| 
 | 
    83 		self._distance				   = 0
 | 
| 
 | 
    84 		self._minOverlap			   = 1
 | 
| 
 | 
    85 		self._pcOverlap				   = None
 | 
| 
 | 
    86 		self._included				   = False
 | 
| 
 | 
    87 		self._including				   = False
 | 
| 
 | 
    88 		self._outputNotOverlapping	   = False
 | 
| 
 | 
    89 		self._tmpRefFileName		   = None
 | 
| 
 | 
    90 		self._currentQueryTranscript   = None
 | 
| 
 | 
    91 		self._currentOrQueryTranscript = None
 | 
| 
 | 
    92 		self._currentExQueryTranscript = None
 | 
| 
 | 
    93 		self._randInt				   = random.randint(0, 100000)
 | 
| 
 | 
    94 		
 | 
| 
 | 
    95 	def __del__(self):
 | 
| 
 | 
    96 		for fileName in [self._tmpRefFileName] + self._convertedFileNames.values():
 | 
| 
 | 
    97 			if fileName != None and os.path.exists(fileName):
 | 
| 
 | 
    98 				os.remove(fileName)
 | 
| 
 | 
    99 
 | 
| 
 | 
   100 	def close(self):
 | 
| 
 | 
   101 		self._iWriter.close()
 | 
| 
 | 
   102 		
 | 
| 
 | 
   103 	def setInput(self, fileName, format, type):
 | 
| 
 | 
   104 		chooser = ParserChooser(self._verbosity)
 | 
| 
 | 
   105 		chooser.findFormat(format)
 | 
| 
 | 
   106 		self._inputFileNames[type]   = fileName
 | 
| 
 | 
   107 		self._inputFileFormats[type] = format
 | 
| 
 | 
   108 		
 | 
| 
 | 
   109 	def setOutput(self, outputFileName):
 | 
| 
 | 
   110 		if outputFileName != '':
 | 
| 
 | 
   111 			self._outputFileName = outputFileName
 | 
| 
 | 
   112 		self._iWriter = Gff3Writer(self._outputFileName)
 | 
| 
 | 
   113 
 | 
| 
 | 
   114 	def setSorted(self, sorted):
 | 
| 
 | 
   115 		self._sorted = sorted
 | 
| 
 | 
   116 
 | 
| 
 | 
   117 	def setIndex(self, index):
 | 
| 
 | 
   118 		self._index = index
 | 
| 
 | 
   119 
 | 
| 
 | 
   120 	def restrictToStart(self, distance, type):
 | 
| 
 | 
   121 		self._starts[type] = distance
 | 
| 
 | 
   122 		
 | 
| 
 | 
   123 	def restrictToEnd(self, distance, type):
 | 
| 
 | 
   124 		self._ends[type] = distance
 | 
| 
 | 
   125 		
 | 
| 
 | 
   126 	def extendFivePrime(self, distance, type):
 | 
| 
 | 
   127 		self._fivePrimes[type] = distance
 | 
| 
 | 
   128 		
 | 
| 
 | 
   129 	def extendThreePrime(self, distance, type):
 | 
| 
 | 
   130 		self._threePrimes[type] = distance
 | 
| 
 | 
   131 
 | 
| 
 | 
   132 	def acceptIntrons(self, boolean):
 | 
| 
 | 
   133 		self._introns = boolean
 | 
| 
 | 
   134 
 | 
| 
 | 
   135 	def getAntisenseOnly(self, boolean):
 | 
| 
 | 
   136 		self._antisense = boolean
 | 
| 
 | 
   137 		
 | 
| 
 | 
   138 	def getColinearOnly(self, boolean):
 | 
| 
 | 
   139 		self._colinear = boolean
 | 
| 
 | 
   140 
 | 
| 
 | 
   141 	def getInvert(self, boolean):
 | 
| 
 | 
   142 		self._invert = boolean
 | 
| 
 | 
   143 
 | 
| 
 | 
   144 	def setMaxDistance(self, distance):
 | 
| 
 | 
   145 		self._distance = distance
 | 
| 
 | 
   146 
 | 
| 
 | 
   147 	def setMinOverlap(self, overlap):
 | 
| 
 | 
   148 		self._minOverlap = overlap
 | 
| 
 | 
   149 
 | 
| 
 | 
   150 	def setPcOverlap(self, overlap):
 | 
| 
 | 
   151 		self._pcOverlap = overlap
 | 
| 
 | 
   152 
 | 
| 
 | 
   153 	def setIncludedOnly(self, boolean):
 | 
| 
 | 
   154 		self._included = boolean
 | 
| 
 | 
   155 		
 | 
| 
 | 
   156 	def setIncludingOnly(self, boolean):
 | 
| 
 | 
   157 		self._including = boolean
 | 
| 
 | 
   158 
 | 
| 
 | 
   159 	def includeNotOverlapping(self, boolean):
 | 
| 
 | 
   160 		self._outputNotOverlapping = boolean
 | 
| 
 | 
   161 		
 | 
| 
 | 
   162 	def transformTranscript(self, transcript, type):
 | 
| 
 | 
   163 		if self._starts[type] != None:
 | 
| 
 | 
   164 			transcript.restrictStart(self._starts[type])
 | 
| 
 | 
   165 		if self._ends[type] != None:
 | 
| 
 | 
   166 			transcript.restrictEnd(self._ends[type])
 | 
| 
 | 
   167 		if self._fivePrimes[type] != None:
 | 
| 
 | 
   168 			transcript.extendStart(self._fivePrimes[type])
 | 
| 
 | 
   169 		if self._threePrimes[type] != None:
 | 
| 
 | 
   170 			transcript.extendEnd(self._threePrimes[type])
 | 
| 
 | 
   171 		if self._introns:
 | 
| 
 | 
   172 			transcript.exons = []
 | 
| 
 | 
   173 		if type == REFERENCE and self._distance > 0:
 | 
| 
 | 
   174 			transcript.extendExons(self._distance)
 | 
| 
 | 
   175 		return transcript
 | 
| 
 | 
   176 
 | 
| 
 | 
   177 	def extendQueryTranscript(self, transcript):
 | 
| 
 | 
   178 		self._currentExQueryTranscript = Transcript()
 | 
| 
 | 
   179 		self._currentExQueryTranscript.copy(transcript)
 | 
| 
 | 
   180 		if self._fivePrimes[QUERY] != None:
 | 
| 
 | 
   181 			self._currentExQueryTranscript.extendStart(self._fivePrimes[QUERY])
 | 
| 
 | 
   182 		if self._threePrimes[QUERY] != None:
 | 
| 
 | 
   183 			self._currentExQueryTranscript.extendEnd(self._threePrimes[QUERY])
 | 
| 
 | 
   184 		transcript.exons = []
 | 
| 
 | 
   185 
 | 
| 
 | 
   186 	def createTmpRefFile(self):
 | 
| 
 | 
   187 		self._tmpRefFileName = "tmp_ref_%d.pkl" % (self._randInt)
 | 
| 
 | 
   188 		if "SMARTTMPPATH" in os.environ:
 | 
| 
 | 
   189 			self._tmpRefFileName = os.path.join(os.environ["SMARTTMPPATH"], self._tmpRefFileName)
 | 
| 
 | 
   190 		chooser = ParserChooser(self._verbosity)
 | 
| 
 | 
   191 		chooser.findFormat(self._inputFileFormats[REFERENCE])
 | 
| 
 | 
   192 		parser = chooser.getParser(self._inputFileNames[REFERENCE])
 | 
| 
 | 
   193 		writer = NCListFilePickle(self._tmpRefFileName, self._verbosity)
 | 
| 
 | 
   194 		for transcript in parser.getIterator():
 | 
| 
 | 
   195 			transcript = self.transformTranscript(transcript, REFERENCE)
 | 
| 
 | 
   196 			writer.addTranscript(transcript)
 | 
| 
 | 
   197 		writer.close()
 | 
| 
 | 
   198 		self._inputFileNames[REFERENCE]   = self._tmpRefFileName
 | 
| 
 | 
   199 		self._inputFileFormats[REFERENCE] = "pkl"
 | 
| 
 | 
   200 
 | 
| 
 | 
   201 	def createNCLists(self):
 | 
| 
 | 
   202 		self._ncLists = dict([type, {}] for type in TYPES)
 | 
| 
 | 
   203 		self._indices = dict([type, {}] for type in TYPES)
 | 
| 
 | 
   204 		self._cursors = dict([type, {}] for type in TYPES)
 | 
| 
 | 
   205 		for type in TYPES:
 | 
| 
 | 
   206 			if self._verbosity > 2:
 | 
| 
 | 
   207 				print "Creating %s NC-list..." % (TYPETOSTRING[type])
 | 
| 
 | 
   208 			self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type)
 | 
| 
 | 
   209 			ncLists = ConvertToNCList(self._verbosity)
 | 
| 
 | 
   210 			ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type])
 | 
| 
 | 
   211 			ncLists.setOutputFileName(self._convertedFileNames[type])
 | 
| 
 | 
   212 			ncLists.setSorted(self._sorted)
 | 
| 
 | 
   213 			if type == REFERENCE and self._index:
 | 
| 
 | 
   214 				ncLists.setIndex(True)
 | 
| 
 | 
   215 			ncLists.run()
 | 
| 
 | 
   216 			self._ncListHandlers[type] = NCListHandler(self._verbosity)
 | 
| 
 | 
   217 			self._ncListHandlers[type].setFileName(self._convertedFileNames[type])
 | 
| 
 | 
   218 			self._ncListHandlers[type].loadData()
 | 
| 
 | 
   219 			self._nbLines[type]					= self._ncListHandlers[type].getNbElements()
 | 
| 
 | 
   220 			self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome()
 | 
| 
 | 
   221 			self._ncLists[type]					= self._ncListHandlers[type].getNCLists()
 | 
| 
 | 
   222 			for chromosome, ncList in self._ncLists[type].iteritems():
 | 
| 
 | 
   223 				self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity)
 | 
| 
 | 
   224 				if type == REFERENCE and self._index:
 | 
| 
 | 
   225 					self._indices[REFERENCE][chromosome] = ncList.getIndex()
 | 
| 
 | 
   226 			if self._verbosity > 2:
 | 
| 
 | 
   227 				print "	...done"
 | 
| 
 | 
   228 
 | 
| 
 | 
   229 	def compare(self):
 | 
| 
 | 
   230 		nbSkips, nbMoves   = 0, 0
 | 
| 
 | 
   231 		previousChromosome = None
 | 
| 
 | 
   232 		done			   = False
 | 
| 
 | 
   233 		refNCList		   = None
 | 
| 
 | 
   234 		queryNCList		   = None
 | 
| 
 | 
   235 		startTime		   = time.time()
 | 
| 
 | 
   236 		progress		   = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity)
 | 
| 
 | 
   237 		for chromosome, queryNCList in self._ncLists[QUERY].iteritems():
 | 
| 
 | 
   238 			queryParser = self._ncListHandlers[QUERY].getParser(chromosome)
 | 
| 
 | 
   239 			queryNCList = self._ncLists[QUERY][chromosome]
 | 
| 
 | 
   240 			queryCursor = self._cursors[QUERY][chromosome]
 | 
| 
 | 
   241 			if chromosome != previousChromosome:
 | 
| 
 | 
   242 				skipChromosome		= False
 | 
| 
 | 
   243 				previousChromosome  = chromosome
 | 
| 
 | 
   244 				if chromosome not in self._ncLists[REFERENCE]:
 | 
| 
 | 
   245 					if self._outputNotOverlapping:
 | 
| 
 | 
   246 						while not queryCursor.isOut():
 | 
| 
 | 
   247 							self._currentQueryTranscript = queryCursor.getTranscript()
 | 
| 
 | 
   248 							self._writeIntervalInNewGFF3({})
 | 
| 
 | 
   249 							if queryCursor.hasChildren():
 | 
| 
 | 
   250 								queryCursor.moveDown()
 | 
| 
 | 
   251 							else:
 | 
| 
 | 
   252 								queryCursor.moveNext()
 | 
| 
 | 
   253 					progress.inc()
 | 
| 
 | 
   254 					continue
 | 
| 
 | 
   255 				refNCList = self._ncLists[REFERENCE][chromosome]
 | 
| 
 | 
   256 				refCursor = self._cursors[REFERENCE][chromosome]
 | 
| 
 | 
   257 			while True:
 | 
| 
 | 
   258 				self._currentOrQueryTranscript = queryCursor.getTranscript()
 | 
| 
 | 
   259 				self._currentQueryTranscript = Transcript()
 | 
| 
 | 
   260 				self._currentQueryTranscript.copy(self._currentOrQueryTranscript)
 | 
| 
 | 
   261 				self._currentQueryTranscript = self.transformTranscript(self._currentQueryTranscript, QUERY)
 | 
| 
 | 
   262 				self.extendQueryTranscript(self._currentOrQueryTranscript)
 | 
| 
 | 
   263 				newRefLaddr = self.checkIndex(refCursor)
 | 
| 
 | 
   264 				if newRefLaddr != None:
 | 
| 
 | 
   265 					nbMoves += 1
 | 
| 
 | 
   266 					refCursor.setLIndex(newRefLaddr)
 | 
| 
 | 
   267 					done = False
 | 
| 
 | 
   268 				refCursor, done, unmatched = self.findOverlapIter(refCursor, done)
 | 
| 
 | 
   269 				if refCursor.isOut():
 | 
| 
 | 
   270 					if not self._invert and not self._outputNotOverlapping:
 | 
| 
 | 
   271 						break
 | 
| 
 | 
   272 				if (unmatched and not self._invert and not self._outputNotOverlapping) or not queryCursor.hasChildren():
 | 
| 
 | 
   273 					queryCursor.moveNext()
 | 
| 
 | 
   274 					nbSkips += 1
 | 
| 
 | 
   275 				else:
 | 
| 
 | 
   276 					queryCursor.moveDown()
 | 
| 
 | 
   277 				if queryCursor.isOut():
 | 
| 
 | 
   278 					break
 | 
| 
 | 
   279 			progress.inc()
 | 
| 
 | 
   280 		progress.done()
 | 
| 
 | 
   281 		endTime = time.time()
 | 
| 
 | 
   282 		self._timeSpent = endTime - startTime
 | 
| 
 | 
   283 		if self._verbosity >= 10:
 | 
| 
 | 
   284 			print "# skips:   %d" % (nbSkips)
 | 
| 
 | 
   285 			print "# moves:   %d" % (nbMoves)
 | 
| 
 | 
   286 
 | 
| 
 | 
   287 	def findOverlapIter(self, cursor, done):
 | 
| 
 | 
   288 		chromosome = self._currentQueryTranscript.getChromosome()
 | 
| 
 | 
   289 		matched	= False
 | 
| 
 | 
   290 		if chromosome not in self._ncLists[REFERENCE]:
 | 
| 
 | 
   291 			return None, False, True
 | 
| 
 | 
   292 		ncList = self._ncLists[REFERENCE][chromosome]
 | 
| 
 | 
   293 		overlappingNames = {}
 | 
| 
 | 
   294 		nextDone = False
 | 
| 
 | 
   295 		firstOverlapLAddr = NCListCursor(cursor)
 | 
| 
 | 
   296 		firstOverlapLAddr.setLIndex(-1)
 | 
| 
 | 
   297 		if cursor.isOut():
 | 
| 
 | 
   298 			self._writeIntervalInNewGFF3(overlappingNames)
 | 
| 
 | 
   299 			return firstOverlapLAddr, False, True
 | 
| 
 | 
   300 		parentCursor = NCListCursor(cursor)
 | 
| 
 | 
   301 		parentCursor.moveUp()
 | 
| 
 | 
   302 		firstParentAfter = False
 | 
| 
 | 
   303 		while not parentCursor.isOut(): 
 | 
| 
 | 
   304 			if self.isOverlapping(parentCursor) == 0:
 | 
| 
 | 
   305 				matched = True
 | 
| 
 | 
   306 				if self._checkOverlap(parentCursor.getTranscript()):
 | 
| 
 | 
   307 					overlappingNames.update(self._extractID(parentCursor.getTranscript()))
 | 
| 
 | 
   308 				if firstOverlapLAddr.isOut():
 | 
| 
 | 
   309 					firstOverlapLAddr.copy(parentCursor)
 | 
| 
 | 
   310 					nextDone = True 
 | 
| 
 | 
   311 			elif self.isOverlapping(parentCursor) == 1:
 | 
| 
 | 
   312 				firstParentAfter = NCListCursor(parentCursor)
 | 
| 
 | 
   313 			parentCursor.moveUp()
 | 
| 
 | 
   314 		if firstParentAfter:
 | 
| 
 | 
   315 			written = self._writeIntervalInNewGFF3(overlappingNames)
 | 
| 
 | 
   316 			return firstParentAfter, False, not written if self._invert else not matched
 | 
| 
 | 
   317 		#This loop finds the overlaps with currentRefLAddr.#
 | 
| 
 | 
   318 		while True:
 | 
| 
 | 
   319 			parentCursor = NCListCursor(cursor)
 | 
| 
 | 
   320 			parentCursor.moveUp()
 | 
| 
 | 
   321 			#In case: Query is on the right of the RefInterval and does not overlap.
 | 
| 
 | 
   322 			overlap = self.isOverlapping(cursor)
 | 
| 
 | 
   323 			if overlap == -1:
 | 
| 
 | 
   324 				cursor.moveNext()
 | 
| 
 | 
   325 			#In case: Query overlaps with RefInterval.	
 | 
| 
 | 
   326 			elif overlap == 0:
 | 
| 
 | 
   327 				matched = True
 | 
| 
 | 
   328 				if self._checkOverlap(cursor.getTranscript()):
 | 
| 
 | 
   329 					overlappingNames.update(self._extractID(cursor.getTranscript()))
 | 
| 
 | 
   330 				if firstOverlapLAddr.compare(parentCursor):
 | 
| 
 | 
   331 					firstOverlapLAddr.copy(cursor)
 | 
| 
 | 
   332 					nextDone = True
 | 
| 
 | 
   333 				if done:
 | 
| 
 | 
   334 					cursor.moveNext()
 | 
| 
 | 
   335 				else:
 | 
| 
 | 
   336 					if not cursor.hasChildren():
 | 
| 
 | 
   337 						cursor.moveNext()
 | 
| 
 | 
   338 						if cursor.isOut():
 | 
| 
 | 
   339 							break
 | 
| 
 | 
   340 					else:
 | 
| 
 | 
   341 						cursor.moveDown()
 | 
| 
 | 
   342 			#In case: Query is on the left of the RefInterval and does not overlap.		
 | 
| 
 | 
   343 			else:
 | 
| 
 | 
   344 				if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor):
 | 
| 
 | 
   345 					firstOverlapLAddr.copy(cursor)
 | 
| 
 | 
   346 					nextDone = False # new
 | 
| 
 | 
   347 				break
 | 
| 
 | 
   348 			
 | 
| 
 | 
   349 			done = False
 | 
| 
 | 
   350 			if cursor.isOut():
 | 
| 
 | 
   351 				break
 | 
| 
 | 
   352 		written = self._writeIntervalInNewGFF3(overlappingNames)
 | 
| 
 | 
   353 		return firstOverlapLAddr, nextDone, not written if self._invert else not matched
 | 
| 
 | 
   354 	
 | 
| 
 | 
   355 	def isOverlapping(self, refTranscript):
 | 
| 
 | 
   356 		if (self._currentExQueryTranscript.getStart() <= refTranscript.getEnd() and self._currentExQueryTranscript.getEnd() >= refTranscript.getStart()):
 | 
| 
 | 
   357 			return 0   
 | 
| 
 | 
   358 		if self._currentExQueryTranscript.getEnd() < refTranscript.getStart():
 | 
| 
 | 
   359 			return 1
 | 
| 
 | 
   360 		return -1
 | 
| 
 | 
   361 
 | 
| 
 | 
   362 	def checkIndex(self, cursor):
 | 
| 
 | 
   363 		if not self._index:
 | 
| 
 | 
   364 			return None
 | 
| 
 | 
   365 		if cursor.isOut():
 | 
| 
 | 
   366 			return None
 | 
| 
 | 
   367 		chromosome = self._currentExQueryTranscript.getChromosome()
 | 
| 
 | 
   368 		nextLIndex = self._indices[REFERENCE][chromosome].getIndex(self._currentExQueryTranscript)
 | 
| 
 | 
   369 		if nextLIndex == None:
 | 
| 
 | 
   370 			return None
 | 
| 
 | 
   371 		ncList		 = self._ncLists[REFERENCE][chromosome]
 | 
| 
 | 
   372 		nextGffAddress = ncList.getRefGffAddr(nextLIndex)
 | 
| 
 | 
   373 		thisGffAddress = cursor.getGffAddress()
 | 
| 
 | 
   374 		if nextGffAddress > thisGffAddress:
 | 
| 
 | 
   375 			return nextLIndex
 | 
| 
 | 
   376 		return None
 | 
| 
 | 
   377 		
 | 
| 
 | 
   378 	def _writeIntervalInNewGFF3(self, names):
 | 
| 
 | 
   379 		nbOverlaps = 0
 | 
| 
 | 
   380 		for cpt in names.values():
 | 
| 
 | 
   381 			nbOverlaps += cpt
 | 
| 
 | 
   382 		self._nbOverlappingQueries += 1		      if Utils.xor(names, self._invert) else 0
 | 
| 
 | 
   383 		self._nbOverlaps		   += nbOverlaps  if Utils.xor(names, self._invert) else 0
 | 
| 
 | 
   384 		if names:
 | 
| 
 | 
   385 			self._currentQueryTranscript.setTagValue("overlapWith", ",".join(names))
 | 
| 
 | 
   386 			self._currentQueryTranscript.setTagValue("nbOverlaps", nbOverlaps)
 | 
| 
 | 
   387 			if self._invert:
 | 
| 
 | 
   388 				return False
 | 
| 
 | 
   389 		else:
 | 
| 
 | 
   390 			if self._outputNotOverlapping:
 | 
| 
 | 
   391 				self._currentQueryTranscript.setTagValue("nbOverlaps", 0)
 | 
| 
 | 
   392 			elif not self._invert:
 | 
| 
 | 
   393 				return False
 | 
| 
 | 
   394 		self._iWriter.addTranscript(self._currentQueryTranscript)
 | 
| 
 | 
   395 		self._iWriter.write()
 | 
| 
 | 
   396 		return True
 | 
| 
 | 
   397 		
 | 
| 
 | 
   398 	def _extractID(self, transcript):
 | 
| 
 | 
   399 		id		 = transcript.getTagValue("ID")		      if "ID"		  in transcript.getTagNames() else transcript.getUniqueName()
 | 
| 
 | 
   400 		nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1
 | 
| 
 | 
   401 		return {id: float(nbElements)}
 | 
| 
 | 
   402 
 | 
| 
 | 
   403 	def _checkOverlap(self, refTranscript):
 | 
| 
 | 
   404 		if self._currentQueryTranscript.getDistance(refTranscript) > self._distance:
 | 
| 
 | 
   405 			return False
 | 
| 
 | 
   406 		minOverlap = self._minOverlap
 | 
| 
 | 
   407 		if self._pcOverlap != None:
 | 
| 
 | 
   408 			minOverlap = max(self._minOverlap, self._currentQueryTranscript.getSize() / 100.0 * self._pcOverlap)
 | 
| 
 | 
   409 		if not self._currentQueryTranscript.overlapWith(refTranscript, minOverlap):
 | 
| 
 | 
   410 			return False
 | 
| 
 | 
   411 		if self._antisense and self._currentQueryTranscript.getDirection() == refTranscript.getDirection():
 | 
| 
 | 
   412 			return False
 | 
| 
 | 
   413 		if self._colinear and self._currentQueryTranscript.getDirection() != refTranscript.getDirection():
 | 
| 
 | 
   414 			return False
 | 
| 
 | 
   415 		if self._included and not refTranscript.include(self._currentQueryTranscript):
 | 
| 
 | 
   416 			return False
 | 
| 
 | 
   417 		if self._including and not self._currentQueryTranscript.include(refTranscript):
 | 
| 
 | 
   418 			return False
 | 
| 
 | 
   419 		if self._introns:
 | 
| 
 | 
   420 			return True
 | 
| 
 | 
   421 		return self._currentQueryTranscript.overlapWithExon(refTranscript, minOverlap)
 | 
| 
 | 
   422 		
 | 
| 
 | 
   423 	def run(self):
 | 
| 
 | 
   424 		self.createTmpRefFile()
 | 
| 
 | 
   425 		self.createNCLists()
 | 
| 
 | 
   426 		self.compare()
 | 
| 
 | 
   427 		self.close()
 | 
| 
 | 
   428 		if self._verbosity > 0:
 | 
| 
 | 
   429 			print "# queries: %d" % (self._nbLines[QUERY])
 | 
| 
 | 
   430 			print "# refs:	  %d" % (self._nbLines[REFERENCE])
 | 
| 
 | 
   431 			print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
 | 
| 
 | 
   432 			print "time: 	  %ds" % (self._timeSpent)
 | 
| 
 | 
   433 
 | 
| 
 | 
   434 
 | 
| 
 | 
   435 if __name__ == "__main__":
 | 
| 
 | 
   436 	description = "Compare Overlapping v1.0.4: Get the data which overlap with a reference set. [Category: Data Comparison]"
 | 
| 
 | 
   437 
 | 
| 
 | 
   438 	parser = OptionParser(description = description)
 | 
| 
 | 
   439 	parser.add_option("-i", "--input1",		      dest="inputFileName1", action="store",					 type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
 | 
| 
 | 
   440 	parser.add_option("-f", "--format1",		  dest="format1",		 action="store",					 type="string", help="format of file 1 [compulsory] [format: transcript file format]")
 | 
| 
 | 
   441 	parser.add_option("-j", "--input2",		      dest="inputFileName2", action="store",					 type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
 | 
| 
 | 
   442 	parser.add_option("-g", "--format2",		  dest="format2",		 action="store",					 type="string", help="format of file 2 [compulsory] [format: transcript file format]")
 | 
| 
 | 
   443 	parser.add_option("-o", "--output",		      dest="output",		 action="store",	  default=None,  type="string", help="output file [compulsory] [format: output file in GFF3 format]")
 | 
| 
 | 
   444 	parser.add_option("-D", "--index",	          dest="index",	         action="store_true", default=False,	            help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")
 | 
| 
 | 
   445 	parser.add_option("-r", "--sorted",	          dest="sorted",	     action="store_true", default=False,	            help="input files are already sorted [format: boolean] [default: False]")
 | 
| 
 | 
   446 	parser.add_option("-S", "--start1",		      dest="start1",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
 | 
| 
 | 
   447 	parser.add_option("-s", "--start2",		      dest="start2",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
 | 
| 
 | 
   448 	parser.add_option("-U", "--end1",			  dest="end1",		     action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
 | 
| 
 | 
   449 	parser.add_option("-u", "--end2",			  dest="end2",		     action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
 | 
| 
 | 
   450 	parser.add_option("-t", "--intron",		      dest="introns",		 action="store_true", default=False,				help="also report introns [format: bool] [default: false]")
 | 
| 
 | 
   451 	parser.add_option("-E", "--5primeExtension1", dest="fivePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 1 [format: int]")
 | 
| 
 | 
   452 	parser.add_option("-e", "--5primeExtension2", dest="fivePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 2 [format: int]")
 | 
| 
 | 
   453 	parser.add_option("-N", "--3primeExtension1", dest="threePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 3' in file 1 [format: int]")
 | 
| 
 | 
   454 	parser.add_option("-n", "--3primeExtension2", dest="threePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 3' in file 2 [format: int]")
 | 
| 
 | 
   455 	parser.add_option("-c", "--colinear",		  dest="colinear",		 action="store_true", default=False,				help="colinear only [format: bool] [default: false]")
 | 
| 
 | 
   456 	parser.add_option("-a", "--antisense",		  dest="antisense",		 action="store_true", default=False,				help="antisense only [format: bool] [default: false]")
 | 
| 
 | 
   457 	parser.add_option("-d", "--distance",		  dest="distance",	     action="store",	  default=0,	 type="int",	help="accept some distance between query and reference [format: int]")
 | 
| 
 | 
   458 	parser.add_option("-k", "--included",		  dest="included",	     action="store_true", default=False,				help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
 | 
| 
 | 
   459 	parser.add_option("-K", "--including",		  dest="including",	     action="store_true", default=False,				help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
 | 
| 
 | 
   460 	parser.add_option("-m", "--minOverlap",		  dest="minOverlap",	 action="store",	  default=1,	 type="int",	help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
 | 
| 
 | 
   461 	parser.add_option("-p", "--pcOverlap",		  dest="pcOverlap",	     action="store",	  default=None,  type="int",	help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
 | 
| 
 | 
   462 	parser.add_option("-O", "--notOverlapping",   dest="notOverlapping", action="store_true", default=False,				help="also output not overlapping data [format: bool] [default: false]")
 | 
| 
 | 
   463 	parser.add_option("-x", "--exclude",		  dest="exclude",		 action="store_true", default=False,				help="invert the match [format: bool] [default: false]")
 | 
| 
 | 
   464 	parser.add_option("-v", "--verbosity",		  dest="verbosity",		 action="store",	  default=1,	 type="int",	help="trace level [format: int]")
 | 
| 
 | 
   465 	(options, args) = parser.parse_args()
 | 
| 
 | 
   466 
 | 
| 
 | 
   467 	co = CompareOverlapping(options.verbosity)
 | 
| 
 | 
   468 	co.setInput(options.inputFileName1, options.format1, QUERY)
 | 
| 
 | 
   469 	co.setInput(options.inputFileName2, options.format2, REFERENCE)
 | 
| 
 | 
   470 	co.setOutput(options.output)
 | 
| 
 | 
   471 	co.setSorted(options.sorted)
 | 
| 
 | 
   472 	co.setIndex(options.index)
 | 
| 
 | 
   473 	co.restrictToStart(options.start1, QUERY)
 | 
| 
 | 
   474 	co.restrictToStart(options.start2, REFERENCE)
 | 
| 
 | 
   475 	co.restrictToEnd(options.end1, QUERY)
 | 
| 
 | 
   476 	co.restrictToEnd(options.end2, REFERENCE)
 | 
| 
 | 
   477 	co.extendFivePrime(options.fivePrime1, QUERY)
 | 
| 
 | 
   478 	co.extendFivePrime(options.fivePrime2, REFERENCE)
 | 
| 
 | 
   479 	co.extendThreePrime(options.threePrime1, QUERY)
 | 
| 
 | 
   480 	co.extendThreePrime(options.threePrime2, REFERENCE)
 | 
| 
 | 
   481 	co.acceptIntrons(options.introns)
 | 
| 
 | 
   482 	co.getAntisenseOnly(options.antisense)
 | 
| 
 | 
   483 	co.getColinearOnly(options.colinear)
 | 
| 
 | 
   484 	co.getInvert(options.exclude)
 | 
| 
 | 
   485 	co.setMaxDistance(options.distance)
 | 
| 
 | 
   486 	co.setMinOverlap(options.minOverlap)
 | 
| 
 | 
   487 	co.setPcOverlap(options.pcOverlap)
 | 
| 
 | 
   488 	co.setIncludedOnly(options.included)
 | 
| 
 | 
   489 	co.setIncludingOnly(options.including)
 | 
| 
 | 
   490 	co.includeNotOverlapping(options.notOverlapping)
 | 
| 
 | 
   491 	co.run()
 |