| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 import os, struct, time, random | 
|  | 32 from optparse import OptionParser | 
|  | 33 from commons.core.parsing.ParserChooser import ParserChooser | 
|  | 34 from commons.core.writer.Gff3Writer import Gff3Writer | 
|  | 35 from SMART.Java.Python.structure.Transcript import Transcript | 
|  | 36 from SMART.Java.Python.structure.Interval import Interval | 
|  | 37 from SMART.Java.Python.ncList.NCList import NCList | 
|  | 38 from SMART.Java.Python.ncList.NCListCursor import NCListCursor | 
|  | 39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle | 
|  | 40 from SMART.Java.Python.ncList.NCListHandler import NCListHandler | 
|  | 41 from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList | 
|  | 42 from SMART.Java.Python.misc.Progress import Progress | 
|  | 43 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress | 
|  | 44 from SMART.Java.Python.misc import Utils | 
|  | 45 try: | 
|  | 46 	import cPickle as pickle | 
|  | 47 except: | 
|  | 48 	import pickle | 
|  | 49 | 
|  | 50 REFERENCE = 0 | 
|  | 51 QUERY = 1 | 
|  | 52 TYPES = (REFERENCE, QUERY) | 
|  | 53 TYPETOSTRING = {0: "reference", 1: "query"} | 
|  | 54 | 
|  | 55 class CompareOverlapping(object): | 
|  | 56 | 
|  | 57 	def __init__(self, verbosity = 1): | 
|  | 58 		self._outputFileName		   = "outputOverlaps.gff3" | 
|  | 59 		self._iWriter				   = None | 
|  | 60 		self._nbOverlappingQueries	   = 0 | 
|  | 61 		self._nbOverlaps			   = 0 | 
|  | 62 		self._nbLines				   = {REFERENCE: 0, QUERY: 0} | 
|  | 63 		self._verbosity				   = verbosity | 
|  | 64 		self._ncLists				   = {} | 
|  | 65 		self._cursors				   = {} | 
|  | 66 		self._splittedFileNames		   = {} | 
|  | 67 		self._nbElements			   = {} | 
|  | 68 		self._nbElementsPerChromosome  = {} | 
|  | 69 		self._inputFileNames		   = {REFERENCE: None,  QUERY: None} | 
|  | 70 		self._inputFileFormats		   = {REFERENCE: None,  QUERY: None} | 
|  | 71 		self._starts				   = {REFERENCE: None, QUERY: None} | 
|  | 72 		self._ends					   = {REFERENCE: None, QUERY: None} | 
|  | 73 		self._fivePrimes			   = {REFERENCE: None, QUERY: None} | 
|  | 74 		self._threePrimes			   = {REFERENCE: None, QUERY: None} | 
|  | 75 		self._ncListHandlers		   = {REFERENCE: None,  QUERY: None} | 
|  | 76 		self._convertedFileNames	   = {REFERENCE: False, QUERY: False} | 
|  | 77 		self._sorted                   = False | 
|  | 78 		self._index                    = False | 
|  | 79 		self._introns				   = False | 
|  | 80 		self._antisense				   = False | 
|  | 81 		self._colinear				   = False | 
|  | 82 		self._invert				   = False | 
|  | 83 		self._distance				   = 0 | 
|  | 84 		self._minOverlap			   = 1 | 
|  | 85 		self._pcOverlap				   = None | 
|  | 86 		self._included				   = False | 
|  | 87 		self._including				   = False | 
|  | 88 		self._outputNotOverlapping	   = False | 
|  | 89 		self._tmpRefFileName		   = None | 
|  | 90 		self._currentQueryTranscript   = None | 
|  | 91 		self._currentOrQueryTranscript = None | 
|  | 92 		self._currentExQueryTranscript = None | 
|  | 93 		self._randInt				   = random.randint(0, 100000) | 
|  | 94 | 
|  | 95 	def __del__(self): | 
|  | 96 		for fileName in [self._tmpRefFileName] + self._convertedFileNames.values(): | 
|  | 97 			if fileName != None and os.path.exists(fileName): | 
|  | 98 				os.remove(fileName) | 
|  | 99 | 
|  | 100 	def close(self): | 
|  | 101 		self._iWriter.close() | 
|  | 102 | 
|  | 103 	def setInput(self, fileName, format, type): | 
|  | 104 		chooser = ParserChooser(self._verbosity) | 
|  | 105 		chooser.findFormat(format) | 
|  | 106 		self._inputFileNames[type]   = fileName | 
|  | 107 		self._inputFileFormats[type] = format | 
|  | 108 | 
|  | 109 	def setOutput(self, outputFileName): | 
|  | 110 		if outputFileName != '': | 
|  | 111 			self._outputFileName = outputFileName | 
|  | 112 		self._iWriter = Gff3Writer(self._outputFileName) | 
|  | 113 | 
|  | 114 	def setSorted(self, sorted): | 
|  | 115 		self._sorted = sorted | 
|  | 116 | 
|  | 117 	def setIndex(self, index): | 
|  | 118 		self._index = index | 
|  | 119 | 
|  | 120 	def restrictToStart(self, distance, type): | 
|  | 121 		self._starts[type] = distance | 
|  | 122 | 
|  | 123 	def restrictToEnd(self, distance, type): | 
|  | 124 		self._ends[type] = distance | 
|  | 125 | 
|  | 126 	def extendFivePrime(self, distance, type): | 
|  | 127 		self._fivePrimes[type] = distance | 
|  | 128 | 
|  | 129 	def extendThreePrime(self, distance, type): | 
|  | 130 		self._threePrimes[type] = distance | 
|  | 131 | 
|  | 132 	def acceptIntrons(self, boolean): | 
|  | 133 		self._introns = boolean | 
|  | 134 | 
|  | 135 	def getAntisenseOnly(self, boolean): | 
|  | 136 		self._antisense = boolean | 
|  | 137 | 
|  | 138 	def getColinearOnly(self, boolean): | 
|  | 139 		self._colinear = boolean | 
|  | 140 | 
|  | 141 	def getInvert(self, boolean): | 
|  | 142 		self._invert = boolean | 
|  | 143 | 
|  | 144 	def setMaxDistance(self, distance): | 
|  | 145 		self._distance = distance | 
|  | 146 | 
|  | 147 	def setMinOverlap(self, overlap): | 
|  | 148 		self._minOverlap = overlap | 
|  | 149 | 
|  | 150 	def setPcOverlap(self, overlap): | 
|  | 151 		self._pcOverlap = overlap | 
|  | 152 | 
|  | 153 	def setIncludedOnly(self, boolean): | 
|  | 154 		self._included = boolean | 
|  | 155 | 
|  | 156 	def setIncludingOnly(self, boolean): | 
|  | 157 		self._including = boolean | 
|  | 158 | 
|  | 159 	def includeNotOverlapping(self, boolean): | 
|  | 160 		self._outputNotOverlapping = boolean | 
|  | 161 | 
|  | 162 	def transformTranscript(self, transcript, type): | 
|  | 163 		if self._starts[type] != None: | 
|  | 164 			transcript.restrictStart(self._starts[type]) | 
|  | 165 		if self._ends[type] != None: | 
|  | 166 			transcript.restrictEnd(self._ends[type]) | 
|  | 167 		if self._fivePrimes[type] != None: | 
|  | 168 			transcript.extendStart(self._fivePrimes[type]) | 
|  | 169 		if self._threePrimes[type] != None: | 
|  | 170 			transcript.extendEnd(self._threePrimes[type]) | 
|  | 171 		if self._introns: | 
|  | 172 			transcript.exons = [] | 
|  | 173 		if type == REFERENCE and self._distance > 0: | 
|  | 174 			transcript.extendExons(self._distance) | 
|  | 175 		return transcript | 
|  | 176 | 
|  | 177 	def extendQueryTranscript(self, transcript): | 
|  | 178 		self._currentExQueryTranscript = Transcript() | 
|  | 179 		self._currentExQueryTranscript.copy(transcript) | 
|  | 180 		if self._fivePrimes[QUERY] != None: | 
|  | 181 			self._currentExQueryTranscript.extendStart(self._fivePrimes[QUERY]) | 
|  | 182 		if self._threePrimes[QUERY] != None: | 
|  | 183 			self._currentExQueryTranscript.extendEnd(self._threePrimes[QUERY]) | 
|  | 184 		transcript.exons = [] | 
|  | 185 | 
|  | 186 	def createTmpRefFile(self): | 
|  | 187 		self._tmpRefFileName = "tmp_ref_%d.pkl" % (self._randInt) | 
|  | 188 		if "SMARTTMPPATH" in os.environ: | 
|  | 189 			self._tmpRefFileName = os.path.join(os.environ["SMARTTMPPATH"], self._tmpRefFileName) | 
|  | 190 		chooser = ParserChooser(self._verbosity) | 
|  | 191 		chooser.findFormat(self._inputFileFormats[REFERENCE]) | 
|  | 192 		parser = chooser.getParser(self._inputFileNames[REFERENCE]) | 
|  | 193 		writer = NCListFilePickle(self._tmpRefFileName, self._verbosity) | 
|  | 194 		for transcript in parser.getIterator(): | 
|  | 195 			transcript = self.transformTranscript(transcript, REFERENCE) | 
|  | 196 			writer.addTranscript(transcript) | 
|  | 197 		writer.close() | 
|  | 198 		self._inputFileNames[REFERENCE]   = self._tmpRefFileName | 
|  | 199 		self._inputFileFormats[REFERENCE] = "pkl" | 
|  | 200 | 
|  | 201 	def createNCLists(self): | 
|  | 202 		self._ncLists = dict([type, {}] for type in TYPES) | 
|  | 203 		self._indices = dict([type, {}] for type in TYPES) | 
|  | 204 		self._cursors = dict([type, {}] for type in TYPES) | 
|  | 205 		for type in TYPES: | 
|  | 206 			if self._verbosity > 2: | 
|  | 207 				print "Creating %s NC-list..." % (TYPETOSTRING[type]) | 
|  | 208 			self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type) | 
| 46 | 209 			if "SMARTTMPPATH" in os.environ: | 
|  | 210 				self._convertedFileNames[type] = os.path.join(os.environ["SMARTTMPPATH"], self._convertedFileNames[type]) | 
| 6 | 211 			ncLists = ConvertToNCList(self._verbosity) | 
|  | 212 			ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type]) | 
|  | 213 			ncLists.setOutputFileName(self._convertedFileNames[type]) | 
|  | 214 			ncLists.setSorted(self._sorted) | 
|  | 215 			if type == REFERENCE and self._index: | 
|  | 216 				ncLists.setIndex(True) | 
|  | 217 			ncLists.run() | 
|  | 218 			self._ncListHandlers[type] = NCListHandler(self._verbosity) | 
|  | 219 			self._ncListHandlers[type].setFileName(self._convertedFileNames[type]) | 
|  | 220 			self._ncListHandlers[type].loadData() | 
|  | 221 			self._nbLines[type]					= self._ncListHandlers[type].getNbElements() | 
|  | 222 			self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome() | 
|  | 223 			self._ncLists[type]					= self._ncListHandlers[type].getNCLists() | 
|  | 224 			for chromosome, ncList in self._ncLists[type].iteritems(): | 
|  | 225 				self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity) | 
|  | 226 				if type == REFERENCE and self._index: | 
|  | 227 					self._indices[REFERENCE][chromosome] = ncList.getIndex() | 
|  | 228 			if self._verbosity > 2: | 
|  | 229 				print "	...done" | 
|  | 230 | 
|  | 231 	def compare(self): | 
|  | 232 		nbSkips, nbMoves   = 0, 0 | 
|  | 233 		previousChromosome = None | 
|  | 234 		done			   = False | 
|  | 235 		refNCList		   = None | 
|  | 236 		queryNCList		   = None | 
|  | 237 		startTime		   = time.time() | 
|  | 238 		progress		   = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity) | 
|  | 239 		for chromosome, queryNCList in self._ncLists[QUERY].iteritems(): | 
|  | 240 			queryParser = self._ncListHandlers[QUERY].getParser(chromosome) | 
|  | 241 			queryNCList = self._ncLists[QUERY][chromosome] | 
|  | 242 			queryCursor = self._cursors[QUERY][chromosome] | 
|  | 243 			if chromosome != previousChromosome: | 
|  | 244 				skipChromosome		= False | 
|  | 245 				previousChromosome  = chromosome | 
|  | 246 				if chromosome not in self._ncLists[REFERENCE]: | 
|  | 247 					if self._outputNotOverlapping: | 
|  | 248 						while not queryCursor.isOut(): | 
|  | 249 							self._currentQueryTranscript = queryCursor.getTranscript() | 
|  | 250 							self._writeIntervalInNewGFF3({}) | 
|  | 251 							if queryCursor.hasChildren(): | 
|  | 252 								queryCursor.moveDown() | 
|  | 253 							else: | 
|  | 254 								queryCursor.moveNext() | 
|  | 255 					progress.inc() | 
|  | 256 					continue | 
|  | 257 				refNCList = self._ncLists[REFERENCE][chromosome] | 
|  | 258 				refCursor = self._cursors[REFERENCE][chromosome] | 
|  | 259 			while True: | 
|  | 260 				self._currentOrQueryTranscript = queryCursor.getTranscript() | 
|  | 261 				self._currentQueryTranscript = Transcript() | 
|  | 262 				self._currentQueryTranscript.copy(self._currentOrQueryTranscript) | 
|  | 263 				self._currentQueryTranscript = self.transformTranscript(self._currentQueryTranscript, QUERY) | 
|  | 264 				self.extendQueryTranscript(self._currentOrQueryTranscript) | 
|  | 265 				newRefLaddr = self.checkIndex(refCursor) | 
|  | 266 				if newRefLaddr != None: | 
|  | 267 					nbMoves += 1 | 
|  | 268 					refCursor.setLIndex(newRefLaddr) | 
|  | 269 					done = False | 
|  | 270 				refCursor, done, unmatched = self.findOverlapIter(refCursor, done) | 
|  | 271 				if refCursor.isOut(): | 
|  | 272 					if not self._invert and not self._outputNotOverlapping: | 
|  | 273 						break | 
|  | 274 				if (unmatched and not self._invert and not self._outputNotOverlapping) or not queryCursor.hasChildren(): | 
|  | 275 					queryCursor.moveNext() | 
|  | 276 					nbSkips += 1 | 
|  | 277 				else: | 
|  | 278 					queryCursor.moveDown() | 
|  | 279 				if queryCursor.isOut(): | 
|  | 280 					break | 
|  | 281 			progress.inc() | 
|  | 282 		progress.done() | 
|  | 283 		endTime = time.time() | 
|  | 284 		self._timeSpent = endTime - startTime | 
|  | 285 		if self._verbosity >= 10: | 
|  | 286 			print "# skips:   %d" % (nbSkips) | 
|  | 287 			print "# moves:   %d" % (nbMoves) | 
|  | 288 | 
|  | 289 	def findOverlapIter(self, cursor, done): | 
|  | 290 		chromosome = self._currentQueryTranscript.getChromosome() | 
|  | 291 		matched	= False | 
|  | 292 		if chromosome not in self._ncLists[REFERENCE]: | 
|  | 293 			return None, False, True | 
|  | 294 		ncList = self._ncLists[REFERENCE][chromosome] | 
|  | 295 		overlappingNames = {} | 
|  | 296 		nextDone = False | 
|  | 297 		firstOverlapLAddr = NCListCursor(cursor) | 
|  | 298 		firstOverlapLAddr.setLIndex(-1) | 
|  | 299 		if cursor.isOut(): | 
|  | 300 			self._writeIntervalInNewGFF3(overlappingNames) | 
|  | 301 			return firstOverlapLAddr, False, True | 
|  | 302 		parentCursor = NCListCursor(cursor) | 
|  | 303 		parentCursor.moveUp() | 
|  | 304 		firstParentAfter = False | 
|  | 305 		while not parentCursor.isOut(): | 
|  | 306 			if self.isOverlapping(parentCursor) == 0: | 
|  | 307 				matched = True | 
|  | 308 				if self._checkOverlap(parentCursor.getTranscript()): | 
|  | 309 					overlappingNames.update(self._extractID(parentCursor.getTranscript())) | 
|  | 310 				if firstOverlapLAddr.isOut(): | 
|  | 311 					firstOverlapLAddr.copy(parentCursor) | 
|  | 312 					nextDone = True | 
|  | 313 			elif self.isOverlapping(parentCursor) == 1: | 
|  | 314 				firstParentAfter = NCListCursor(parentCursor) | 
|  | 315 			parentCursor.moveUp() | 
|  | 316 		if firstParentAfter: | 
|  | 317 			written = self._writeIntervalInNewGFF3(overlappingNames) | 
|  | 318 			return firstParentAfter, False, not written if self._invert else not matched | 
|  | 319 		#This loop finds the overlaps with currentRefLAddr.# | 
|  | 320 		while True: | 
|  | 321 			parentCursor = NCListCursor(cursor) | 
|  | 322 			parentCursor.moveUp() | 
|  | 323 			#In case: Query is on the right of the RefInterval and does not overlap. | 
|  | 324 			overlap = self.isOverlapping(cursor) | 
|  | 325 			if overlap == -1: | 
|  | 326 				cursor.moveNext() | 
|  | 327 			#In case: Query overlaps with RefInterval. | 
|  | 328 			elif overlap == 0: | 
|  | 329 				matched = True | 
|  | 330 				if self._checkOverlap(cursor.getTranscript()): | 
|  | 331 					overlappingNames.update(self._extractID(cursor.getTranscript())) | 
|  | 332 				if firstOverlapLAddr.compare(parentCursor): | 
|  | 333 					firstOverlapLAddr.copy(cursor) | 
|  | 334 					nextDone = True | 
|  | 335 				if done: | 
|  | 336 					cursor.moveNext() | 
|  | 337 				else: | 
|  | 338 					if not cursor.hasChildren(): | 
|  | 339 						cursor.moveNext() | 
|  | 340 						if cursor.isOut(): | 
|  | 341 							break | 
|  | 342 					else: | 
|  | 343 						cursor.moveDown() | 
|  | 344 			#In case: Query is on the left of the RefInterval and does not overlap. | 
|  | 345 			else: | 
|  | 346 				if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor): | 
|  | 347 					firstOverlapLAddr.copy(cursor) | 
|  | 348 					nextDone = False # new | 
|  | 349 				break | 
|  | 350 | 
|  | 351 			done = False | 
|  | 352 			if cursor.isOut(): | 
|  | 353 				break | 
|  | 354 		written = self._writeIntervalInNewGFF3(overlappingNames) | 
|  | 355 		return firstOverlapLAddr, nextDone, not written if self._invert else not matched | 
|  | 356 | 
|  | 357 	def isOverlapping(self, refTranscript): | 
|  | 358 		if (self._currentExQueryTranscript.getStart() <= refTranscript.getEnd() and self._currentExQueryTranscript.getEnd() >= refTranscript.getStart()): | 
|  | 359 			return 0 | 
|  | 360 		if self._currentExQueryTranscript.getEnd() < refTranscript.getStart(): | 
|  | 361 			return 1 | 
|  | 362 		return -1 | 
|  | 363 | 
|  | 364 	def checkIndex(self, cursor): | 
|  | 365 		if not self._index: | 
|  | 366 			return None | 
|  | 367 		if cursor.isOut(): | 
|  | 368 			return None | 
|  | 369 		chromosome = self._currentExQueryTranscript.getChromosome() | 
|  | 370 		nextLIndex = self._indices[REFERENCE][chromosome].getIndex(self._currentExQueryTranscript) | 
|  | 371 		if nextLIndex == None: | 
|  | 372 			return None | 
|  | 373 		ncList		 = self._ncLists[REFERENCE][chromosome] | 
|  | 374 		nextGffAddress = ncList.getRefGffAddr(nextLIndex) | 
|  | 375 		thisGffAddress = cursor.getGffAddress() | 
|  | 376 		if nextGffAddress > thisGffAddress: | 
|  | 377 			return nextLIndex | 
|  | 378 		return None | 
|  | 379 | 
|  | 380 	def _writeIntervalInNewGFF3(self, names): | 
|  | 381 		nbOverlaps = 0 | 
|  | 382 		for cpt in names.values(): | 
|  | 383 			nbOverlaps += cpt | 
|  | 384 		self._nbOverlappingQueries += 1		      if Utils.xor(names, self._invert) else 0 | 
|  | 385 		self._nbOverlaps		   += nbOverlaps  if Utils.xor(names, self._invert) else 0 | 
|  | 386 		if names: | 
|  | 387 			self._currentQueryTranscript.setTagValue("overlapWith", ",".join(names)) | 
|  | 388 			self._currentQueryTranscript.setTagValue("nbOverlaps", nbOverlaps) | 
|  | 389 			if self._invert: | 
|  | 390 				return False | 
|  | 391 		else: | 
|  | 392 			if self._outputNotOverlapping: | 
|  | 393 				self._currentQueryTranscript.setTagValue("nbOverlaps", 0) | 
|  | 394 			elif not self._invert: | 
|  | 395 				return False | 
|  | 396 		self._iWriter.addTranscript(self._currentQueryTranscript) | 
|  | 397 		self._iWriter.write() | 
|  | 398 		return True | 
|  | 399 | 
|  | 400 	def _extractID(self, transcript): | 
|  | 401 		id		 = transcript.getTagValue("ID")		      if "ID"		  in transcript.getTagNames() else transcript.getUniqueName() | 
|  | 402 		nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1 | 
|  | 403 		return {id: float(nbElements)} | 
|  | 404 | 
|  | 405 	def _checkOverlap(self, refTranscript): | 
|  | 406 		if self._currentQueryTranscript.getDistance(refTranscript) > self._distance: | 
|  | 407 			return False | 
|  | 408 		minOverlap = self._minOverlap | 
|  | 409 		if self._pcOverlap != None: | 
|  | 410 			minOverlap = max(self._minOverlap, self._currentQueryTranscript.getSize() / 100.0 * self._pcOverlap) | 
|  | 411 		if not self._currentQueryTranscript.overlapWith(refTranscript, minOverlap): | 
|  | 412 			return False | 
|  | 413 		if self._antisense and self._currentQueryTranscript.getDirection() == refTranscript.getDirection(): | 
|  | 414 			return False | 
|  | 415 		if self._colinear and self._currentQueryTranscript.getDirection() != refTranscript.getDirection(): | 
|  | 416 			return False | 
|  | 417 		if self._included and not refTranscript.include(self._currentQueryTranscript): | 
|  | 418 			return False | 
|  | 419 		if self._including and not self._currentQueryTranscript.include(refTranscript): | 
|  | 420 			return False | 
|  | 421 		if self._introns: | 
|  | 422 			return True | 
|  | 423 		return self._currentQueryTranscript.overlapWithExon(refTranscript, minOverlap) | 
|  | 424 | 
|  | 425 	def run(self): | 
|  | 426 		self.createTmpRefFile() | 
|  | 427 		self.createNCLists() | 
|  | 428 		self.compare() | 
|  | 429 		self.close() | 
|  | 430 		if self._verbosity > 0: | 
|  | 431 			print "# queries: %d" % (self._nbLines[QUERY]) | 
|  | 432 			print "# refs:	  %d" % (self._nbLines[REFERENCE]) | 
|  | 433 			print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps) | 
|  | 434 			print "time: 	  %ds" % (self._timeSpent) | 
|  | 435 | 
|  | 436 | 
|  | 437 if __name__ == "__main__": | 
|  | 438 	description = "Compare Overlapping v1.0.4: Get the data which overlap with a reference set. [Category: Data Comparison]" | 
|  | 439 | 
|  | 440 	parser = OptionParser(description = description) | 
|  | 441 	parser.add_option("-i", "--input1",		      dest="inputFileName1", action="store",					 type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]") | 
|  | 442 	parser.add_option("-f", "--format1",		  dest="format1",		 action="store",					 type="string", help="format of file 1 [compulsory] [format: transcript file format]") | 
|  | 443 	parser.add_option("-j", "--input2",		      dest="inputFileName2", action="store",					 type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") | 
|  | 444 	parser.add_option("-g", "--format2",		  dest="format2",		 action="store",					 type="string", help="format of file 2 [compulsory] [format: transcript file format]") | 
|  | 445 	parser.add_option("-o", "--output",		      dest="output",		 action="store",	  default=None,  type="string", help="output file [compulsory] [format: output file in GFF3 format]") | 
|  | 446 	parser.add_option("-D", "--index",	          dest="index",	         action="store_true", default=False,	            help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]") | 
|  | 447 	parser.add_option("-r", "--sorted",	          dest="sorted",	     action="store_true", default=False,	            help="input files are already sorted [format: boolean] [default: False]") | 
|  | 448 	parser.add_option("-S", "--start1",		      dest="start1",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]") | 
|  | 449 	parser.add_option("-s", "--start2",		      dest="start2",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]") | 
|  | 450 	parser.add_option("-U", "--end1",			  dest="end1",		     action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]") | 
|  | 451 	parser.add_option("-u", "--end2",			  dest="end2",		     action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]") | 
|  | 452 	parser.add_option("-t", "--intron",		      dest="introns",		 action="store_true", default=False,				help="also report introns [format: bool] [default: false]") | 
|  | 453 	parser.add_option("-E", "--5primeExtension1", dest="fivePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 1 [format: int]") | 
|  | 454 	parser.add_option("-e", "--5primeExtension2", dest="fivePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 2 [format: int]") | 
|  | 455 	parser.add_option("-N", "--3primeExtension1", dest="threePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 3' in file 1 [format: int]") | 
|  | 456 	parser.add_option("-n", "--3primeExtension2", dest="threePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 3' in file 2 [format: int]") | 
|  | 457 	parser.add_option("-c", "--colinear",		  dest="colinear",		 action="store_true", default=False,				help="colinear only [format: bool] [default: false]") | 
|  | 458 	parser.add_option("-a", "--antisense",		  dest="antisense",		 action="store_true", default=False,				help="antisense only [format: bool] [default: false]") | 
|  | 459 	parser.add_option("-d", "--distance",		  dest="distance",	     action="store",	  default=0,	 type="int",	help="accept some distance between query and reference [format: int]") | 
|  | 460 	parser.add_option("-k", "--included",		  dest="included",	     action="store_true", default=False,				help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]") | 
|  | 461 	parser.add_option("-K", "--including",		  dest="including",	     action="store_true", default=False,				help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]") | 
|  | 462 	parser.add_option("-m", "--minOverlap",		  dest="minOverlap",	 action="store",	  default=1,	 type="int",	help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]") | 
|  | 463 	parser.add_option("-p", "--pcOverlap",		  dest="pcOverlap",	     action="store",	  default=None,  type="int",	help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]") | 
|  | 464 	parser.add_option("-O", "--notOverlapping",   dest="notOverlapping", action="store_true", default=False,				help="also output not overlapping data [format: bool] [default: false]") | 
|  | 465 	parser.add_option("-x", "--exclude",		  dest="exclude",		 action="store_true", default=False,				help="invert the match [format: bool] [default: false]") | 
|  | 466 	parser.add_option("-v", "--verbosity",		  dest="verbosity",		 action="store",	  default=1,	 type="int",	help="trace level [format: int]") | 
|  | 467 	(options, args) = parser.parse_args() | 
|  | 468 | 
|  | 469 	co = CompareOverlapping(options.verbosity) | 
|  | 470 	co.setInput(options.inputFileName1, options.format1, QUERY) | 
|  | 471 	co.setInput(options.inputFileName2, options.format2, REFERENCE) | 
|  | 472 	co.setOutput(options.output) | 
|  | 473 	co.setSorted(options.sorted) | 
|  | 474 	co.setIndex(options.index) | 
|  | 475 	co.restrictToStart(options.start1, QUERY) | 
|  | 476 	co.restrictToStart(options.start2, REFERENCE) | 
|  | 477 	co.restrictToEnd(options.end1, QUERY) | 
|  | 478 	co.restrictToEnd(options.end2, REFERENCE) | 
|  | 479 	co.extendFivePrime(options.fivePrime1, QUERY) | 
|  | 480 	co.extendFivePrime(options.fivePrime2, REFERENCE) | 
|  | 481 	co.extendThreePrime(options.threePrime1, QUERY) | 
|  | 482 	co.extendThreePrime(options.threePrime2, REFERENCE) | 
|  | 483 	co.acceptIntrons(options.introns) | 
|  | 484 	co.getAntisenseOnly(options.antisense) | 
|  | 485 	co.getColinearOnly(options.colinear) | 
|  | 486 	co.getInvert(options.exclude) | 
|  | 487 	co.setMaxDistance(options.distance) | 
|  | 488 	co.setMinOverlap(options.minOverlap) | 
|  | 489 	co.setPcOverlap(options.pcOverlap) | 
|  | 490 	co.setIncludedOnly(options.included) | 
|  | 491 	co.setIncludingOnly(options.including) | 
|  | 492 	co.includeNotOverlapping(options.notOverlapping) | 
|  | 493 	co.run() |