# HG changeset patch
# User m-zytnicki
# Date 1447693232 18000
# Node ID 85e80c21b1f731694164649fe49fc30b0e24671e
# Parent  f4de72c80eacf86fc5051ef36149a62e8686fa02
Uploaded

FileSorter: merge chunk files in batches of at most NB_FILES.
_merge() now writes directly to the output handle instead of yielding
transcripts; the heap-based merge itself moves to _mergeParts().

NOTE(review): this patch was reconstructed from a whitespace-collapsed
copy. Indentation (4 spaces assumed) and blank context lines are inferred
from the hunk line counts - verify against FileSorter.py before applying.

diff -r f4de72c80eac -r 85e80c21b1f7 SMART/Java/Python/ncList/FileSorter.py
--- a/SMART/Java/Python/ncList/FileSorter.py	Mon Nov 16 11:59:35 2015 -0500
+++ b/SMART/Java/Python/ncList/FileSorter.py	Mon Nov 16 12:00:32 2015 -0500
@@ -41,6 +41,7 @@
 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
 
 BUFFER_SIZE = 100 * 1024
+NB_FILES = 10000000000
 
 class FileSorter(object):
 
@@ -120,7 +121,21 @@
             outputChunk.write(pickle.dumps(transcript, -1))
         outputChunk.close()
 
-    def _merge(self, chunks):
+    def _merge(self, chunks, chromosome, outputHandle):
+        """Merge the chunk files into outputHandle, at most NB_FILES at a time."""
+        currentOutputChunkId = len(chunks)
+        while len(chunks) > NB_FILES:
+            outputChunk = open("%s_%s_%06i.tmp" % (self._prefix, chromosome, currentOutputChunkId), "wb", 32000)
+            currentOutputChunkId += 1
+            currentChunks = chunks[:NB_FILES]
+            # Re-queue the intermediate file; it must be wrapped in a list to be concatenated.
+            chunks = chunks[NB_FILES:] + [outputChunk]
+            self._mergeParts(currentChunks, outputChunk)
+            outputChunk.close()
+        self._mergeParts(chunks, outputHandle)
+
+    def _mergeParts(self, chunks, outputHandle):
+        """Heap-merge the pickled transcripts of chunks and dump them to outputHandle."""
         values = []
         for chunk in chunks:
             chunk = open(chunk.name, "rb")
@@ -138,8 +153,9 @@
             else:
                 heappush(values, (start, end, transcript, chunk))
         while values:
-            start, end, transcript, chunk = heappop(values)
-            yield transcript
+            # The refill below must read from the chunk this transcript came from.
+            start, end, transcript, chunk = heappop(values)
+            pickle.dump(transcript, outputHandle, -1)
             try:
                 transcript = pickle.load(chunk)
                 start = transcript.getStart()
@@ -189,8 +205,7 @@
             if self._perChromosome:
                 self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome)
                 outputHandle = open(self._outputFileNames[chromosome], "wb")
-            for sequence in self._merge(self._chunks[chromosome]):
-                pickle.dump(sequence, outputHandle, -1)
+            self._merge(self._chunks[chromosome], chromosome, outputHandle)
             if self._perChromosome:
                 outputHandle.close()
             progress.inc()