Repository 's_mart'
hg clone https://toolshed.g2.bx.psu.edu/repos/yufei-luo/s_mart

Changeset 68:85e80c21b1f7 (2015-11-16)
Previous changeset 67:f4de72c80eac (2015-11-16) Next changeset 69:1473ab954708 (2015-11-18)
Commit message:
Uploaded
modified:
SMART/Java/Python/ncList/FileSorter.py
b
diff -r f4de72c80eac -r 85e80c21b1f7 SMART/Java/Python/ncList/FileSorter.py
--- a/SMART/Java/Python/ncList/FileSorter.py Mon Nov 16 11:59:35 2015 -0500
+++ b/SMART/Java/Python/ncList/FileSorter.py Mon Nov 16 12:00:32 2015 -0500
[
@@ -41,6 +41,7 @@
 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
 
 BUFFER_SIZE = 100 * 1024
+NB_FILES    = 10000000000
 
 class FileSorter(object):
 
@@ -120,7 +121,18 @@
  outputChunk.write(pickle.dumps(transcript, -1))
  outputChunk.close()
 
- def _merge(self, chunks):
+ def _merge(self, chunks, chromosome, outputHandle):  # merge `chunks` into outputHandle, handing at most NB_FILES chunks to any single _mergeParts call
+     currentOutputChunkId = len(chunks)  # intermediate files are numbered after the existing chunks (presumably ids 0..len-1 are taken; confirm against the chunk writer)
+     while len(chunks) > NB_FILES:
+         outputChunk           = open("%s_%s_%06i.tmp" % (self._prefix, chromosome, currentOutputChunkId), "wb", 32000)  # was the undefined name currentChunkId
+         currentOutputChunkId += 1
+         currentChunks         = chunks[:NB_FILES]
+         chunks                = chunks[NB_FILES:] + [outputChunk]  # wrap in a list: list + file object raises TypeError
+         self._mergeParts(currentChunks, outputChunk)
+         outputChunk.close()  # _mergeParts reopens chunks by name, so the closed handle can stay queued in `chunks`
+     self._mergeParts(chunks, outputHandle)  # final pass: no more than NB_FILES chunks remain
+
+ def _mergeParts(self, chunks, outputHandle):
  values = []
  for chunk in chunks:
  chunk = open(chunk.name, "rb")
@@ -138,8 +150,8 @@
  else:
  heappush(values, (start, end, transcript, chunk))
  while values:
- start, end, transcript, chunk = heappop(values)
- yield transcript
+ start, end, transcript, chunk = heappop(values)
+ pickle.dump(transcript, outputHandle, -1)
  try:
  transcript = pickle.load(chunk)
  start    = transcript.getStart()
@@ -189,8 +201,7 @@
  if self._perChromosome:
  self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome)
  outputHandle = open(self._outputFileNames[chromosome], "wb")
- for sequence in self._merge(self._chunks[chromosome]):
- pickle.dump(sequence, outputHandle, -1)
+ self._merge(self._chunks[chromosome], chromosome, outputHandle)
  if self._perChromosome:
  outputHandle.close()
  progress.inc()