diff SMART/Java/Python/clusterize.py @ 60:90f4b29d884f

Uploaded
author m-zytnicki
date Fri, 21 Feb 2014 08:32:36 -0500
parents 169d364ddd91
children f4de72c80eac
line wrap: on
line diff
--- a/SMART/Java/Python/clusterize.py	Mon Feb 10 03:39:09 2014 -0500
+++ b/SMART/Java/Python/clusterize.py	Fri Feb 21 08:32:36 2014 -0500
@@ -33,6 +33,7 @@
 
 import os, os.path, random
 from optparse import OptionParser
+from heapq import heappush, heappop
 from commons.core.parsing.ParserChooser import ParserChooser
 from commons.core.writer.Gff3Writer import Gff3Writer
 from SMART.Java.Python.structure.Transcript import Transcript
@@ -44,26 +45,31 @@
 class Clusterize(object):
 
 	def __init__(self, verbosity):
-		self.normalize		 = False
-		self.presorted		 = False
-		self.distance		  = 1
-		self.colinear		  = False
-		self.nbWritten		 = 0
-		self.nbMerges		  = 0
-		self.verbosity		 = verbosity
+		self.parsers           = {}
+		self.sortedFileNames   = {}
+		self.normalize         = False
+		self.presorted         = False
+		self.distance          = 1
+		self.collinear         = False
+		self.nbWritten         = 0
+		self.nbMerges          = 0
+		self.verbosity         = verbosity
 		self.splittedFileNames = {}
+		self.chromosomes       = set()
 
 	def __del__(self):
-		for fileName in self.splittedFileNames.values():
-			os.remove(fileName)
+		for fileName1 in self.splittedFileNames:
+			for fileName2 in self.splittedFileNames[fileName1].values():
+				os.remove(fileName2)
 
-	def setInputFile(self, fileName, format):
+	def setInputFiles(self, fileNames, format):
 		parserChooser = ParserChooser(self.verbosity)
 		parserChooser.findFormat(format)
-		self.parser = parserChooser.getParser(fileName)
-		self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
-		if "SMARTTMPPATH" in os.environ:
-			self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName))
+		for fileName in fileNames:
+			self.parsers[fileName] = parserChooser.getParser(fileName)
+			self.sortedFileNames[fileName] = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
+			if "SMARTTMPPATH" in os.environ:
+				self.sortedFileNames[fileName] = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileNames[fileName]))
 
 	def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
 		writerChooser = WriterChooser()
@@ -76,8 +82,8 @@
 	def setDistance(self, distance):
 		self.distance = distance
 
-	def setColinear(self, colinear):
-		self.colinear = colinear
+	def setColinear(self, collinear):
+		self.collinear = collinear
 
 	def setNormalize(self, normalize):
 		self.normalize = normalize
@@ -85,42 +91,60 @@
 	def setPresorted(self, presorted):
 		self.presorted = presorted
 
-	def _sortFile(self):
+	def _sortFiles(self):
 		if self.presorted:
 			return
-		fs = FileSorter(self.parser, self.verbosity-4)
-		fs.perChromosome(True)
-		fs.setPresorted(self.presorted)
-		fs.setOutputFileName(self.sortedFileName)
-		fs.sort()
-		self.splittedFileNames       = fs.getOutputFileNames()
-		self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
-		self.nbElements              = fs.getNbElements()
+		for fileName, parser in self.parsers.iteritems():
+			fs = FileSorter(parser, self.verbosity-4)
+			fs.perChromosome(True)
+			fs.setPresorted(self.presorted)
+			fs.setOutputFileName(self.sortedFileNames[fileName])
+			fs.sort()
+			self.splittedFileNames[fileName] = fs.getOutputFileNames()
+			self.nbElementsPerChromosome     = fs.getNbElementsPerChromosome()
+			self.nbElements                  = fs.getNbElements()
+			self.chromosomes.update(self.splittedFileNames[fileName].keys())
 		
-	def _iterate(self, chromosome):
-		if chromosome == None:
-			progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
-			parser   = self.parser
-		else:
-			progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
-			parser   = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
+	def _iterate(self):
+		progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
 		transcripts = []
-		for newTranscript in parser.getIterator():
-			newTranscripts = []
-			if newTranscript.__class__.__name__ == "Mapping":
-				newTranscript = newTranscript.getTranscript()
-			for oldTranscript in transcripts:
-				if self._checkOverlap(newTranscript, oldTranscript):
-					self._merge(newTranscript, oldTranscript)
-				elif self._checkPassed(newTranscript, oldTranscript):
-					self._write(oldTranscript)
-				else:
-					newTranscripts.append(oldTranscript)
-			newTranscripts.append(newTranscript)
-			transcripts = newTranscripts
-			progress.inc()
-		for transcript in transcripts:
-			self._write(transcript)
+		heap        = []
+		parsersSets = []
+		if self.chromosomes:
+			parsersSets.append(self.parsers.values())
+		else:
+			for chromosome in self.chromosomes:
+				parsersSets.append([self.splittedFileNames[fileName][chromosome] for fileName in self.splittedFileNames if chromosome in self.splittedFileNames[fileName]])
+		for parsers in parsersSets:
+			for parser in parsers:
+				iterator = parser.getIterator()
+				for transcript in iterator:
+					if transcript.__class__.__name__ == "Mapping":
+						transcript = transcript.getTranscript()
+					heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator))
+					break
+			while heap:
+				chromosome, start, end, newTranscript, iterator = heappop(heap)
+				for transcript in iterator:
+					if transcript.__class__.__name__ == "Mapping":
+						transcript = transcript.getTranscript()
+					heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator))
+					break
+				newTranscripts = []
+				if newTranscript.__class__.__name__ == "Mapping":
+					newTranscript = newTranscript.getTranscript()
+				for oldTranscript in transcripts:
+					if self._checkOverlap(newTranscript, oldTranscript):
+						self._merge(newTranscript, oldTranscript)
+					elif self._checkPassed(newTranscript, oldTranscript):
+						self._write(oldTranscript)
+					else:
+						newTranscripts.append(oldTranscript)
+				newTranscripts.append(newTranscript)
+				transcripts = newTranscripts
+				progress.inc()
+			for transcript in transcripts:
+				self._write(transcript)
 		progress.done()
 
 	def _merge(self, transcript1, transcript2):
@@ -135,7 +159,7 @@
 	def _checkOverlap(self, transcript1, transcript2):
 		if transcript1.getChromosome() != transcript2.getChromosome():
 			return False
-		if self.colinear and transcript1.getDirection() != transcript2.getDirection():
+		if self.collinear and transcript1.getDirection() != transcript2.getDirection():
 			return False
 		if transcript1.getDistance(transcript2) > self.distance:
 			return False
@@ -145,12 +169,8 @@
 		return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance))
 
 	def run(self):
-		self._sortFile()
-		if self.presorted:
-			self._iterate(None)
-		else:
-			for chromosome in sorted(self.splittedFileNames.keys()):
-				self._iterate(chromosome)
+		self._sortFiles()
+		self._iterate()
 		self.writer.close()
 		if self.verbosity > 0:
 			print "# input:   %d" % (self.nbElements)
@@ -162,21 +182,21 @@
 	description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"
 
 	parser = OptionParser(description = description)
-	parser.add_option("-i", "--input",     dest="inputFileName",  action="store",				     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
-	parser.add_option("-f", "--format",    dest="format",		 action="store",				     type="string", help="format of file [format: transcript file format]")
-	parser.add_option("-o", "--output",    dest="outputFileName", action="store",				     type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
-	parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store",     default="gff",		     type="string", help="output file format [format: transcript file format]")
-	parser.add_option("-c", "--colinear",  dest="colinear",       action="store_true", default=False,				help="merge colinear transcripts only [format: bool] [default: false]")
-	parser.add_option("-d", "--distance",  dest="distance",       action="store",      default=0,     type="int",    help="max. distance between two transcripts to be merged [format: int] [default: 0]")
-	parser.add_option("-n", "--normalize", dest="normalize",      action="store_true", default=False,				help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
-	parser.add_option("-s", "--sorted",    dest="sorted",		 action="store_true", default=False,				help="input is already sorted [format: bool] [default: false]")
-	parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
+	parser.add_option("-i", "--inputs",       dest="inputFileNames", action="store",				     type="string", help="input files (separated by commas) [compulsory] [format: string]")
+	parser.add_option("-f", "--format",       dest="format",		 action="store",				     type="string", help="format of file [format: transcript file format]")
+	parser.add_option("-o", "--output",       dest="outputFileName", action="store",				     type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
+	parser.add_option("-u", "--outputFormat", dest="outputFormat",   action="store",      default="gff",		        type="string", help="output file format [format: transcript file format]")
+	parser.add_option("-c", "--collinear",    dest="collinear",      action="store_true", default=False,				help="merge collinear transcripts only [format: bool] [default: false]")
+	parser.add_option("-d", "--distance",     dest="distance",       action="store",      default=0,     type="int",    help="max. distance between two transcripts to be merged [format: int] [default: 0]")
+	parser.add_option("-n", "--normalize",    dest="normalize",      action="store_true", default=False,				help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
+	parser.add_option("-s", "--sorted",       dest="sorted",		 action="store_true", default=False,				help="input is already sorted [format: bool] [default: false]")
+	parser.add_option("-v", "--verbosity",    dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
 	(options, args) = parser.parse_args()
 
 	c = Clusterize(options.verbosity)
-	c.setInputFile(options.inputFileName, options.format)
+	c.setInputFiles(options.inputFileNames.split(","), options.format)
 	c.setOutputFileName(options.outputFileName, options.outputFormat)
-	c.setColinear(options.colinear)
+	c.setColinear(options.collinear)
 	c.setDistance(options.distance)
 	c.setNormalize(options.normalize)
 	c.setPresorted(options.sorted)