s_mart: SMART/Java/Python/clusterize.py comparison

comparison SMART/Java/Python/clusterize.py @ 18:94ab73e8a190

Uploaded

author	m-zytnicki
date	Mon, 29 Apr 2013 03:20:15 -0400
parents	769e306b7933
children

comparison

equal deleted inserted replaced

-:b0e8584489e6
+:94ab73e8a190
 # knowledge of the CeCILL license and that you accept its terms.
 #
 from commons.core.writer.WriterChooser import WriterChooser
 """Clusterize a set of transcripts"""
-import os
+import os, os.path, random
 from optparse import OptionParser
 from commons.core.parsing.ParserChooser import ParserChooser
 from commons.core.writer.Gff3Writer import Gff3Writer
 from SMART.Java.Python.structure.Transcript import Transcript
 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
 from SMART.Java.Python.ncList.FileSorter import FileSorter
 from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
 class Clusterize(object):
-def __init__(self, verbosity):
-self.normalize         = False
-self.presorted         = False
-self.distance          = 1
-self.colinear          = False
-self.nbWritten         = 0
-self.nbMerges          = 0
-self.verbosity         = verbosity
-self.splittedFileNames = {}
-def __del__(self):
+	def __init__(self, verbosity):
-for fileName in self.splittedFileNames.values():
+		self.normalize		 = False
-os.remove(fileName)
+		self.presorted		 = False
+		self.distance		  = 1
+		self.colinear		  = False
+		self.nbWritten		 = 0
+		self.nbMerges		  = 0
+		self.verbosity		 = verbosity
+		self.splittedFileNames = {}
-def setInputFile(self, fileName, format):
+	def __del__(self):
-parserChooser = ParserChooser(self.verbosity)
+		for fileName in self.splittedFileNames.values():
-parserChooser.findFormat(format)
+			os.remove(fileName)
-self.parser = parserChooser.getParser(fileName)
-self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
-def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
+	def setInputFile(self, fileName, format):
-writerChooser = WriterChooser()
+		parserChooser = ParserChooser(self.verbosity)
-writerChooser.findFormat(format)
+		parserChooser.findFormat(format)
-self.writer = writerChooser.getWriter(fileName)
+		self.parser = parserChooser.getParser(fileName)
-self.writer.setTitle(title)
+		self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
-self.writer.setFeature(feature)
+		if "SMARTTMPPATH" in os.environ:
-self.writer.setFeaturePart(featurePart)
+			self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName))
-def setDistance(self, distance):
+	def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
-self.distance = distance
+		writerChooser = WriterChooser()
+		writerChooser.findFormat(format)
+		self.writer = writerChooser.getWriter(fileName)
+		self.writer.setTitle(title)
+		self.writer.setFeature(feature)
+		self.writer.setFeaturePart(featurePart)
-def setColinear(self, colinear):
+	def setDistance(self, distance):
-self.colinear = colinear
+		self.distance = distance
-def setNormalize(self, normalize):
+	def setColinear(self, colinear):
-self.normalize = normalize
+		self.colinear = colinear
-def setPresorted(self, presorted):
-self.presorted = presorted
-def _sortFile(self):
+	def setNormalize(self, normalize):
-fs = FileSorter(self.parser, self.verbosity-4)
+		self.normalize = normalize
-fs.perChromosome(True)
-fs.setPresorted(self.presorted)
+	def setPresorted(self, presorted):
-fs.setOutputFileName(self.sortedFileName)
+		self.presorted = presorted
-fs.sort()
-self.splittedFileNames       = fs.getOutputFileNames()
-self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
-self.nbElements              = fs.getNbElements()
-def _iterate(self, chromosome):
-progress    = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
-transcripts = []
-parser      = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
-for newTranscript in parser.getIterator():
-newTranscripts = []
-for oldTranscript in transcripts:
-if self._checkOverlap(newTranscript, oldTranscript):
-self._merge(newTranscript, oldTranscript)
-elif self._checkPassed(newTranscript, oldTranscript):
-self._write(oldTranscript)
-else:
-newTranscripts.append(oldTranscript)
-newTranscripts.append(newTranscript)
-transcripts = newTranscripts
-progress.inc()
-for transcript in transcripts:
-self._write(transcript)
-progress.done()
-def _merge(self, transcript1, transcript2):
+	def _sortFile(self):
-self.nbMerges += 1
+		if self.presorted:
-transcript2.setDirection(transcript1.getDirection())
+			return
-transcript1.merge(transcript2)
+		fs = FileSorter(self.parser, self.verbosity-4)
+		fs.perChromosome(True)
+		fs.setPresorted(self.presorted)
+		fs.setOutputFileName(self.sortedFileName)
+		fs.sort()
+		self.splittedFileNames       = fs.getOutputFileNames()
+		self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
+		self.nbElements              = fs.getNbElements()
+	def _iterate(self, chromosome):
+		if chromosome == None:
+			progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
+			parser   = self.parser
+		else:
+			progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
+			parser   = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
+		transcripts     = []
+		self.nbElements = 0
+		for newTranscript in parser.getIterator():
+			newTranscripts = []
+			if newTranscript.__class__.__name__ == "Mapping":
+				newTranscript = newTranscript.getTranscript()
+			for oldTranscript in transcripts:
+				if self._checkOverlap(newTranscript, oldTranscript):
+					self._merge(newTranscript, oldTranscript)
+				elif self._checkPassed(newTranscript, oldTranscript):
+					self._write(oldTranscript)
+				else:
+					newTranscripts.append(oldTranscript)
+			newTranscripts.append(newTranscript)
+			transcripts = newTranscripts
+			self.nbElements += 1
+			progress.inc()
+		for transcript in transcripts:
+			self._write(transcript)
+		progress.done()
-def _write(self, transcript):
+	def _merge(self, transcript1, transcript2):
-self.nbWritten += 1
+		self.nbMerges += 1
-self.writer.addTranscript(transcript)
+		transcript2.setDirection(transcript1.getDirection())
+		transcript1.merge(transcript2)
-def _checkOverlap(self, transcript1, transcript2):
+	def _write(self, transcript):
-if self.colinear and transcript1.getDirection() != transcript2.getDirection():
+		self.nbWritten += 1
-return False
+		self.writer.addTranscript(transcript)
-if transcript1.getDistance(transcript2) > self.distance:
-return False
-return True
-def _checkPassed(self, transcript1, transcript2):
+	def _checkOverlap(self, transcript1, transcript2):
-return (transcript1.getDistance(transcript2) > self.distance)
+		if transcript1.getChromosome() != transcript2.getChromosome():
+			return False
+		if self.colinear and transcript1.getDirection() != transcript2.getDirection():
+			return False
+		if transcript1.getDistance(transcript2) > self.distance:
+			return False
+		return True
-def run(self):
+	def _checkPassed(self, transcript1, transcript2):
-self._sortFile()
+		return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance))
-for chromosome in sorted(self.splittedFileNames.keys()):
-self._iterate(chromosome)
+	def run(self):
-self.writer.close()
+		self._sortFile()
-if self.verbosity > 0:
+		if self.presorted:
-print "# input:   %d" % (self.nbElements)
+			self._iterate(None)
-print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100))
+		else:
-print "# merges:  %d" % (self.nbMerges)
+			for chromosome in sorted(self.splittedFileNames.keys()):
+				self._iterate(chromosome)
+		self.writer.close()
+		if self.verbosity > 0:
+			print "# input:   %d" % (self.nbElements)
+			print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100))
+			print "# merges:  %d" % (self.nbMerges)
 if __name__ == "__main__":
-description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"
+	description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"
-parser = OptionParser(description = description)
+	parser = OptionParser(description = description)
-parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+	parser.add_option("-i", "--input",     dest="inputFileName",  action="store",				     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
-parser.add_option("-f", "--format",    dest="format",         action="store",                     type="string", help="format of file [format: transcript file format]")
+	parser.add_option("-f", "--format",    dest="format",		 action="store",				     type="string", help="format of file [format: transcript file format]")
-parser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
+	parser.add_option("-o", "--output",    dest="outputFileName", action="store",				     type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
-parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store",     default="gff",             type="string", help="output file format [format: transcript file format]")
+	parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store",     default="gff",		     type="string", help="output file format [format: transcript file format]")
-parser.add_option("-c", "--colinear",  dest="colinear",       action="store_true", default=False,                help="merge colinear transcripts only [format: bool] [default: false]")
+	parser.add_option("-c", "--colinear",  dest="colinear",       action="store_true", default=False,				help="merge colinear transcripts only [format: bool] [default: false]")
-parser.add_option("-d", "--distance",  dest="distance",       action="store",      default=0,     type="int",    help="max. distance between two transcripts to be merged [format: int] [default: 0]")
+	parser.add_option("-d", "--distance",  dest="distance",       action="store",      default=0,     type="int",    help="max. distance between two transcripts to be merged [format: int] [default: 0]")
-parser.add_option("-n", "--normalize", dest="normalize",      action="store_true", default=False,                help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
+	parser.add_option("-n", "--normalize", dest="normalize",      action="store_true", default=False,				help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
-parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
+	parser.add_option("-s", "--sorted",    dest="sorted",		 action="store_true", default=False,				help="input is already sorted [format: bool] [default: false]")
-(options, args) = parser.parse_args()
+	parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
+	(options, args) = parser.parse_args()
-c = Clusterize(options.verbosity)
-c.setInputFile(options.inputFileName, options.format)
+	c = Clusterize(options.verbosity)
-c.setOutputFileName(options.outputFileName, options.outputFormat)
+	c.setInputFile(options.inputFileName, options.format)
-c.setColinear(options.colinear)
+	c.setOutputFileName(options.outputFileName, options.outputFormat)
-c.setDistance(options.distance)
+	c.setColinear(options.colinear)
-c.setNormalize(options.normalize)
+	c.setDistance(options.distance)
-c.run()
+	c.setNormalize(options.normalize)
+	c.setPresorted(options.sorted)
+	c.run()

Mercurial > repos > yufei-luo > s_mart

comparison SMART/Java/Python/clusterize.py @ 18:94ab73e8a190