diff smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py	Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,197 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+
+import struct
+import math
+import os
+from optparse import OptionParser
+from commons.core.writer.Gff3Writer import Gff3Writer
+from SMART.Java.Python.ncList.NCList import NCList
+from SMART.Java.Python.ncList.FileSorter import FileSorter
+from commons.core.parsing.ParserChooser import ParserChooser
+from SMART.Java.Python.ncList.NCListCursor import NCListCursor
+from SMART.Java.Python.structure.Transcript import Transcript
+
+LONGSIZE = struct.calcsize('l')
+
+class FindOverlapsWithOneInterval(object):
+	
+	def __init__(self, verbosity):
+		self._sortedFileName   = None
+		self._verbosity		= verbosity
+		self._overlappingNames = []
+		self._nbOverlaps	   = 0
+		self._nbWritten		= 0
+		
+	def __del__(self):
+		if self._sortedFileName and os.path.exists(self._sortedFileName):
+			os.remove(self._sortedFileName)
+	
+	def close(self):
+		self._iWriter.close()
+		
+	def setOutputFileName(self, fileName):
+		self._iWriter = Gff3Writer(fileName)
+		
+	def setFileName(self, fileName, format):
+		chooser = ParserChooser(self._verbosity)
+		chooser.findFormat(format)
+		self._parser		 = chooser.getParser(fileName)
+		self._sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
+		
+	def setInterval(self, chromosome, start, end):
+		self._chromosome = chromosome
+		self._start	  = start
+		self._end		= end
+		self._transcript = Transcript()
+		self._transcript.setChromosome(chromosome)
+		self._transcript.setStart(start)
+		self._transcript.setEnd(end)
+		self._transcript.setDirection("+")
+		
+	def setTranscript(self, transcript):
+		if transcript.__class__.__name__ == "Mapping":
+			transcript = transcript.getTranscript()
+		self._chromosome = transcript.getChromosome()
+		self._start	  = transcript.getStart()
+		self._end		= transcript.getEnd()
+		self._transcript = transcript
+
+	def prepareIntermediateFiles(self):
+		fs = FileSorter(self._parser, self._verbosity-4)
+		fs.selectChromosome(self._chromosome)
+		fs.perChromosome(False)
+		fs.setOutputFileName(self._sortedFileName)
+		fs.sort()
+		self._nbTotalLines = fs.getNbElements()
+		self._nbLines	  = fs.getNbElementsPerChromosome()[self._chromosome]
+			
+	def createNCList(self):
+		if self._verbosity > 2:
+			print "Creating NC-list..."
+		ncList = NCList(self._verbosity)
+		ncList.createIndex(True)
+		ncList.setChromosome(self._chromosome)
+		ncList.setFileName(self._sortedFileName)
+		ncList.setNbElements(self._nbTotalLines)
+		ncList.buildLists()
+		self.setNCList(ncList, ncList.getIndex())
+		if self._verbosity > 2:
+			print "	...done (%ds)" % (endTime - startTime)
+
+	def setNCList(self, ncList, index):
+		self._ncList = ncList
+		self._indix  = index
+
+	def binarySearch(self, cursor, startL, endL):
+		if startL > endL:
+			return None
+		middleL = (startL + endL) / 2
+		cursor.moveSibling(middleL)
+		overlap = self.isOverlapping(cursor)
+		if overlap == 0:
+			if middleL == startL:
+				return cursor
+			else:
+				return self.binarySearch(cursor, startL, middleL)
+		if overlap == -1:
+			return self.binarySearch(cursor, middleL + 1, endL)
+		return self.binarySearch(cursor, startL, middleL - 1)
+
+	def compare(self, cursor = None):
+		self._ncList.openFiles()
+		if cursor == None:
+			dump   = True
+			cursor = NCListCursor(None, self._ncList, 0, self._verbosity)
+		cursor._getSiblingData()
+		cursor = self.binarySearch(cursor, cursor._firstSiblingLIndex, cursor._lastSiblingLIndex)
+		if cursor == None:
+			return
+		while not cursor.isOut() and self.isOverlapping(cursor) == 0:
+			self.write(cursor)
+			newCursor = NCListCursor(cursor)
+			if newCursor.hasChildren():
+				newCursor.moveDown()
+				self.compare(newCursor)
+			if cursor.isLast():
+				return
+			cursor.moveRight()
+				
+	def isOverlapping(self, cursor):
+		if self._end < cursor.getStart():
+			return 1
+		if self._start > cursor.getEnd():
+			return -1
+		return 0
+
+	def write(self, cursor):
+		self._nbOverlaps += 1
+		refTranscript = cursor.getTranscript()
+		self._overlappingNames.append(refTranscript.getName())
+
+	def dumpWriter(self):
+		if (not self._overlappingNames) or self._transcript == None:
+			return
+		self._transcript.setTagValue("nbOverlaps", len(self._overlappingNames))
+		self._transcript.setTagValue("overlapsWith", "--".join(self._overlappingNames))
+		self._iWriter.addTranscript(self._transcript)
+		self._nbWritten	   += 1
+		self._overlappingNames = []
+
+	def run(self):
+		self.prepareIntermediateFiles()
+		self.createNCList()
+		self.compare()
+		self.dumpWriter()
+		self.close()
+		if self._verbosity > 0:
+			print "# refs:	%d" % (self._nbLines)
+			print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
+
+	
+if __name__ == "__main__":
+	description = "FindOverlapsWithOneInterval: Finds overlaps with one query interval."
+
+	parser = OptionParser(description = description)
+	parser.add_option("-i", "--input",	   dest="inputFileName",	  action="store",			type="string",  help="Input file [compulsory] [format: file in transcript format given by -f]")
+	parser.add_option("-f", "--format",	  dest="format",			 action="store",			type="string",  help="Format of previous file [compulsory] [format: transcript file format]")
+	parser.add_option("-s", "--start",	   dest="start",			  action="store",			type="int",	 help="The start of the query interval [compulsory] [format: int]")
+	parser.add_option("-e", "--end",		 dest="end",				action="store",			type="int",	 help="The end of the query interval [compulsory] [format: int]")
+	parser.add_option("-c", "--chromosome",  dest="chromosome",		 action="store",			type="string",  help="Chromosome of the query interval [compulsory] [format: string]")
+	parser.add_option("-o", "--output",	  dest="outputFileName",	 action="store",			type="string",  help="Output file [compulsory] [format: output file in GFF3 format]")
+	parser.add_option("-v", "--verbosity",   dest="verbosity",		  action="store", default=1, type="int",	 help="Trace level [format: int] [default: 1]")
+	(options, args) = parser.parse_args()
+	
+	iFOWOI = FindOverlapsWithOneInterval(options.verbosity)
+	iFOWOI.setFileName(options.inputFileName, options.format)
+	iFOWOI.setInterval(options.chromosome, options.start, options.end)
+	iFOWOI.setOutputFileName(options.outputFileName)
+	iFOWOI.run()