changeset 68:85e80c21b1f7 draft

Uploaded
author m-zytnicki
date Mon, 16 Nov 2015 12:00:32 -0500
parents f4de72c80eac
children 1473ab954708
files SMART/Java/Python/ncList/FileSorter.py
diffstat 1 files changed, 16 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/SMART/Java/Python/ncList/FileSorter.py	Mon Nov 16 11:59:35 2015 -0500
+++ b/SMART/Java/Python/ncList/FileSorter.py	Mon Nov 16 12:00:32 2015 -0500
@@ -41,6 +41,7 @@
 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
 
 BUFFER_SIZE = 100 * 1024
+NB_FILES    = 10000000000  # max number of chunk files merged in a single pass; NOTE(review): so large that multi-pass merging is effectively disabled -- confirm intended
 
 class FileSorter(object):
 
@@ -120,7 +121,18 @@
 			outputChunk.write(pickle.dumps(transcript, -1))
 		outputChunk.close()
 		
-	def _merge(self, chunks):
+	def _merge(self, chunks, chromosome, outputHandle):  # merge the sorted chunk files into outputHandle, at most NB_FILES inputs per pass
+		currentOutputChunkId = len(chunks)  # NOTE(review): assumes existing chunk files use ids below len(chunks) -- confirm
+		while len(chunks) > NB_FILES:
+			outputChunk           = open("%s_%s_%06i.tmp" % (self._prefix, chromosome, currentOutputChunkId), "wb", 32000)  # was the undefined name currentChunkId
+			currentOutputChunkId += 1
+			currentChunks         = chunks[:NB_FILES]
+			chunks                = chunks[NB_FILES:] + [outputChunk]  # wrap in a list: list + file object raises TypeError
+			self._mergeParts(currentChunks, outputChunk)
+			outputChunk.close()  # _mergeParts reopens chunks by name, so the intermediate file must be flushed first
+		self._mergeParts(chunks, outputHandle)  # final pass writes straight to the caller's handle
+
+	def _mergeParts(self, chunks, outputHandle):
 		values = []
 		for chunk in chunks:
 			chunk = open(chunk.name, "rb")
@@ -138,8 +150,8 @@
 			else:
 				heappush(values, (start, end, transcript, chunk))
 		while values:
-			start, end, transcript, chunk = heappop(values)
-			yield transcript
+			start, end, transcript, chunk = heappop(values)
+			pickle.dump(transcript, outputHandle, -1)  # emit the smallest pending record; the next one is then loaded from the same chunk it came from
 			try:
 				transcript = pickle.load(chunk)
 				start	   = transcript.getStart()
@@ -189,8 +201,7 @@
 				if self._perChromosome:
 					self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome)
 					outputHandle = open(self._outputFileNames[chromosome], "wb")
-				for sequence in self._merge(self._chunks[chromosome]):
-					pickle.dump(sequence, outputHandle, -1)
+				self._merge(self._chunks[chromosome], chromosome, outputHandle)
 				if self._perChromosome:
 					outputHandle.close()
 				progress.inc()