s_mart: SMART/Java/Python/ncList/FileSorter.py comparison

comparison SMART/Java/Python/ncList/FileSorter.py @ 68:85e80c21b1f7 draft

Uploaded

author	m-zytnicki
date	Mon, 16 Nov 2015 12:00:32 -0500
parents	769e306b7933
children

comparison

equal deleted inserted replaced

-:f4de72c80eac
+:85e80c21b1f7
 from SMART.Java.Python.structure.Transcript import Transcript
 from SMART.Java.Python.misc.Progress import Progress
 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
 BUFFER_SIZE = 100 * 1024
+NB_FILES    = 10000000000
 class FileSorter(object):
 	def __init__(self, parser, verbosity = 1):
 		self._parser				  = parser
 		self._chunks[chromosome].append(outputChunk)
 		for transcript in chunk:
 			outputChunk.write(pickle.dumps(transcript, -1))
 		outputChunk.close()
-	def _merge(self, chunks):
+	def _merge(self, chunks, chromosome, outputHandle):
+		currentOutputChunkId = len(chunks)
+		while len(chunks) > NB_FILES:
+			outputChunk           = open("%s_%s_%06i.tmp" % (self._prefix, chromosome, currentOutputChunkId), "wb", 32000)
+			currentOutputChunkId += 1
+			currentChunks         = chunks[:NB_FILES]
+			chunks                = chunks[NB_FILES:] + [outputChunk]
+			self._mergeParts(currentChunks, outputChunk)
+			outputChunk.close()
+		self._mergeParts(chunks, outputHandle)
+	def _mergeParts(self, chunks, outputHandle):
 		values = []
 		for chunk in chunks:
 			chunk = open(chunk.name, "rb")
 			try:
 				transcript = pickle.load(chunk)
 				except:
 					pass
 			else:
 				heappush(values, (start, end, transcript, chunk))
 		while values:
-			start, end, transcript, chunk = heappop(values)
+			start, end, transcript, chunk = heappop(values)
-			yield transcript
+			pickle.dump(transcript, outputHandle, -1)
 			try:
 				transcript = pickle.load(chunk)
 				start	   = transcript.getStart()
 				end		   = -transcript.getEnd()
 			except EOFError:
 			progress = Progress(len(self._chunks), "Writing sorted file %s" % (self._parser.fileName), self._verbosity)
 			for chromosome in self._chunks:
 				if self._perChromosome:
 					self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome)
 					outputHandle = open(self._outputFileNames[chromosome], "wb")
-				for sequence in self._merge(self._chunks[chromosome]):
+				self._merge(self._chunks[chromosome], chromosome, outputHandle)
-					pickle.dump(sequence, outputHandle, -1)
 				if self._perChromosome:
 					outputHandle.close()
 				progress.inc()
 			if not self._perChromosome:
 				outputHandle.close()

Mercurial > repos > yufei-luo > s_mart

comparison SMART/Java/Python/ncList/FileSorter.py @ 68:85e80c21b1f7 draft