Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/ncList/NCList.py @ 6:769e306b7933
Change the repository level.
| author | yufei-luo |
|---|---|
| date | Fri, 18 Jan 2013 04:54:14 -0500 |
| parents | |
| children | 169d364ddd91 |
comparison
equal
deleted
inserted
replaced
| 5:ea3082881bf8 | 6:769e306b7933 |
|---|---|
| 1 #! /usr/bin/env python | |
| 2 # | |
| 3 # Copyright INRA-URGI 2009-2010 | |
| 4 # | |
| 5 # This software is governed by the CeCILL license under French law and | |
| 6 # abiding by the rules of distribution of free software. You can use, | |
| 7 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 9 # "http://www.cecill.info". | |
| 10 # | |
| 11 # As a counterpart to the access to the source code and rights to copy, | |
| 12 # modify and redistribute granted by the license, users are provided only | |
| 13 # with a limited warranty and the software's author, the holder of the | |
| 14 # economic rights, and the successive licensors have only limited | |
| 15 # liability. | |
| 16 # | |
| 17 # In this respect, the user's attention is drawn to the risks associated | |
| 18 # with loading, using, modifying and/or developing or reproducing the | |
| 19 # software by the user in light of its specific status of free software, | |
| 20 # that may mean that it is complicated to manipulate, and that also | |
| 21 # therefore means that it is reserved for developers and experienced | |
| 22 # professionals having in-depth computer knowledge. Users are therefore | |
| 23 # encouraged to load and test the software's suitability as regards their | |
| 24 # requirements in conditions enabling the security of their systems and/or | |
| 25 # data to be ensured and, more generally, to use and operate it in the | |
| 26 # same conditions as regards security. | |
| 27 # | |
| 28 # The fact that you are presently reading this means that you have had | |
| 29 # knowledge of the CeCILL license and that you accept its terms. | |
| 30 # | |
| 31 import os, os.path | |
| 32 import struct | |
| 33 import shelve | |
| 34 import sys | |
| 35 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle | |
| 36 from SMART.Java.Python.ncList.NCIndex import NCIndex | |
| 37 from SMART.Java.Python.misc.Progress import Progress | |
| 38 | |
# Number of bytes of one native C long; every table cell is stored as one.
LONG_SIZE = struct.calcsize("l")

# Identifiers of the tables / files (used as dictionary keys by NCList).
H, L, T, G = 0, 1, 2, 3

# Number of long-sized columns in one row of each binary table.
H_CELL_SIZE, L_CELL_SIZE, T_CELL_SIZE = 2, 5, 6

# Column indices inside a row of the T and L tables (NEW exists in T only,
# since an L row has 5 columns).
START, END, ADDRESS, LIST, PARENT, NEW = 0, 1, 2, 3, 4, 5

# Second column of an H row (the first one is START); shares the value of END
# because H rows use their own two-column layout.
LENGTH = 1
| 57 | |
def pack(input):
    """Return *input* encoded as one native C long (``struct`` format "l").

    The value is first converted to an integer.  ``int`` is used instead of
    the Python-2-only ``long`` builtin: on Python 2 ``int()`` promotes to
    ``long`` automatically when needed, and on Python 3 ``long`` does not
    exist, so this form works on both.
    """
    return struct.pack("l", int(input))
def unpack(input):
    """Decode one native C long (``struct`` format "l") from the bytes *input*."""
    (value,) = struct.unpack("l", input)
    return value
| 62 | |
| 63 | |
class NCList(object):
    """Build and read the binary tables describing nested lists of intervals.

    Three files of native longs are derived from the transcript file name
    (``<core>_H.bin``, ``<core>_L.bin``, ``<core>_T.bin``):

    * H -- one row per sub-list: START (first row of the sub-list in L) and
      LENGTH (number of rows).
    * L -- one row per transcript: START, END, ADDRESS (position in the
      transcript file), LIST (row in H of its children, -1 if none) and
      PARENT (row in L of its parent, -1 for the first list).
    * T -- temporary working table with the L columns plus NEW (row assigned
      in L); it is deleted at the end of ``buildLists``.

    Row -1 is virtual: reads and writes on it go to ``self._missingValues``
    instead of a file, which lets the first list be handled like any other
    sub-list (its "parent" is row -1).
    """

    def __init__(self, verbosity):
        """Create an empty list; *verbosity* is forwarded to the helpers."""
        self._verbosity = verbosity
        self._subPos = 0
        self._parentPos = 0
        self._nbLines = 0
        self._nbLists = 0
        self._chromosome = None
        self._transcriptFileName = None
        self._lHandle = None
        self._hHandle = None
        self._tHandle = None
        self._parser = None
        # Stays None unless createIndex(True) was requested before buildLists().
        self._index = None
        self._sizeDict = {H: H_CELL_SIZE, L: L_CELL_SIZE, T: T_CELL_SIZE}
        self._offsets = {H: 0, L: 0, G: 0}
        self._fileNameDict = {}
        self._handleDict = {}
        self._createIndex = False
        # Values of the virtual row -1 of each table.
        self._missingValues = dict((table, {}) for table in self._sizeDict)
        self._missingValues[T][LIST] = -1
        self._missingValues[L][LIST] = 0
        self._missingValues[T][NEW] = -1

    def __del__(self):
        """Close the read handles that are still open."""
        # getattr: the instance may be only partially initialised.
        for handle in (getattr(self, "_lHandle", None), getattr(self, "_hHandle", None)):
            if handle is not None:
                handle.close()

    def createIndex(self, boolean):
        """Choose whether buildLists() also fills an NCIndex."""
        self._createIndex = boolean

    def setChromosome(self, chromosome):
        """Set the chromosome; must be set before setFileName() for the table
        file names to be computed."""
        self._chromosome = chromosome

    def setFileName(self, fileName):
        """Set the transcript file, open a parser on it and derive the table
        file names."""
        self._transcriptFileName = fileName
        self._parser = NCListFileUnpickle(fileName, self._verbosity)
        self._setFileNames(fileName)

    def setNbElements(self, nbElements):
        """Set the number of transcripts (rows of tables T and L)."""
        self._nbLines = nbElements

    def setOffset(self, fileType, offset):
        """Set the offset added when seeking in file *fileType* (H, L or G)."""
        self._offsets[fileType] = offset

    def _setFileNames(self, fileName):
        """Compute the H/L/T file names from *fileName*.

        Nothing is set when the chromosome or the file name is None.  When
        the SMARTTMPPATH environment variable exists, the names are joined
        to that directory.
        """
        if self._chromosome is not None and fileName is not None:
            coreName = os.path.splitext(fileName)[0]
            if "SMARTTMPPATH" in os.environ:
                coreName = os.path.join(os.environ["SMARTTMPPATH"], coreName)
            self._hFileName = "%s_H.bin" % (coreName)
            self._lFileName = "%s_L.bin" % (coreName)
            self._tFileName = "%s_T.bin" % (coreName)
            self._fileNameDict = {H: self._hFileName, L: self._lFileName, T: self._tFileName}

    def getSizeFirstList(self):
        """Return the number of rows whose parent is -1.

        The value is computed by buildLists(); calling this method before
        raises AttributeError.
        """
        return self._sizeFirstList

    def _writeSubListIntoH(self, SubListAddr, SubListLength):
        """Append one (address, length) row at the current position of H."""
        self._hHandle.write(pack(SubListAddr))
        self._hHandle.write(pack(SubListLength))
        self._subPos += H_CELL_SIZE

    def _writeParentIntoL(self, readAddr, subListAddr, parentAddr, start, end):
        """Append one row at the current position of L (stored order: start,
        end, readAddr, subListAddr, parentAddr)."""
        self._lHandle.write(pack(start))
        self._lHandle.write(pack(end))
        self._lHandle.write(pack(readAddr))
        self._lHandle.write(pack(subListAddr))
        self._lHandle.write(pack(parentAddr))
        self._parentPos += L_CELL_SIZE

    def getLLineElements(self, subListLAddr):
        """Read row *subListLAddr* of L.

        Returns (gff3Addr, subListHAddr, parentLAddr, start, end) -- note
        that this is not the storage order.  Five -1 are returned when the
        address is -1 or None, or when the row lies beyond the end of file.
        """
        if subListLAddr == -1 or subListLAddr is None:
            return -1, -1, -1, -1, -1
        self._lHandle.seek(subListLAddr * L_CELL_SIZE * LONG_SIZE + self._offsets[L])
        start = self._lHandle.read(LONG_SIZE)
        if len(start) < LONG_SIZE:
            return -1, -1, -1, -1, -1
        start = unpack(start)
        end = unpack(self._lHandle.read(LONG_SIZE))
        gff3Addr = unpack(self._lHandle.read(LONG_SIZE))
        subListHAddr = unpack(self._lHandle.read(LONG_SIZE))
        parentLAddr = unpack(self._lHandle.read(LONG_SIZE))
        return gff3Addr, subListHAddr, parentLAddr, start, end

    def getHLineElements(self, subListHAddr):
        """Read row *subListHAddr* of H.

        Returns (subListStart, subListElementsNb), or (-1, -1) when the row
        lies beyond the end of file.
        """
        self._hHandle.seek(subListHAddr * H_CELL_SIZE * LONG_SIZE + self._offsets[H])
        subListStartBin = self._hHandle.read(LONG_SIZE)
        # Compare with LONG_SIZE, not a literal 8: a native long is not
        # 8 bytes on every platform.
        if len(subListStartBin) < LONG_SIZE:
            return -1, -1
        subListStart = unpack(subListStartBin)
        subListElementsNb = unpack(self._hHandle.read(LONG_SIZE))
        return subListStart, subListElementsNb

    def getRefGffAddr(self, currentRefLAddr):
        """Return the transcript-file address stored in row *currentRefLAddr*
        of L (-1 if the row cannot be read)."""
        return self.getLLineElements(currentRefLAddr)[0]

    def getIntervalFromAdress(self, address):
        """Return the transcript the parser reads at *address* (the G offset
        is added)."""
        self._parser.gotoAddress(int(address) + self._offsets[G])
        return self._parser.getNextTranscript()

    def removeFiles(self):
        """Do nothing (kept for callers that expect the method)."""
        return

    def buildLists(self):
        """Build the H and L files from the transcript file.

        The file names and the number of elements must have been set.  On
        return every handle is closed and the temporary T file is removed;
        call openFiles() to read the result.
        """
        if self._createIndex:
            self._index = NCIndex(self._verbosity)
        self._createTables()
        self._labelLists()
        self._computeSubStart()
        self._computeAbsPosition()
        self._cleanFiles()

    def _createTables(self):
        """Count the sub-lists, allocate the three tables and fill T and H."""
        self._initLists()
        self._createTable(H, self._nbLists)
        self._createTable(T, self._nbLines)
        self._createTable(L, self._nbLines)
        self._fillTables()

    def _initLists(self):
        """Set self._nbLists to 1 plus the number of transcripts that are
        included in the transcript read just before them (used to size H)."""
        previousTranscript = None
        self._nbLists = 1
        progress = Progress(self._nbLines, "Initializing lists", self._verbosity-5)
        for transcript in self._parser.getIterator():
            if self._isIncluded(transcript, previousTranscript):
                self._nbLists += 1
            previousTranscript = transcript
            progress.inc()
        progress.done()

    def _isIncluded(self, transcript1, transcript2):
        """Tell whether *transcript1* lies within the bounds of *transcript2*.

        False when either argument is None.
        """
        if transcript1 is None or transcript2 is None:
            return False
        return transcript1.getStart() >= transcript2.getStart() and transcript1.getEnd() <= transcript2.getEnd()

    def _createTable(self, name, size):
        """Create the file of table *name* with *size* rows, every cell set
        to -1, and keep its read/write handle in self._handleDict."""
        nbCells = self._sizeDict[name] * size
        handle = open(self._fileNameDict[name], "w+b")
        progress = Progress(nbCells, "Initializing table %d" % (name), self._verbosity-5)
        for i in xrange(nbCells):
            handle.write(pack(-1))
            progress.inc()
        progress.done()
        self._handleDict[name] = handle

    def _fillTables(self):
        """Copy start, end and file address of every transcript into T, and
        reset the LENGTH of every H row to 0."""
        progress = Progress(self._nbLines, "Filling table T", self._verbosity-5)
        for i, transcript in enumerate(self._parser.getIterator()):
            self._writeValue(T, i, START, transcript.getStart())
            self._writeValue(T, i, END, transcript.getEnd())
            self._writeValue(T, i, ADDRESS, self._parser.getCurrentTranscriptAddress())
            self._writeValue(T, i, PARENT, -1)
            self._writeValue(T, i, LIST, -1)
            progress.inc()
        progress.done()
        progress = Progress(self._nbLists, "Filling table H", self._verbosity-5)
        for i in xrange(self._nbLists):
            self._writeValue(H, i, LENGTH, 0)
            progress.inc()
        progress.done()

    def _labelLists(self):
        """Find the parent of every row of T and count the sub-list sizes.

        For row i, the search starts at row i-1 and follows the PARENT
        column until it reaches a row whose bounds contain row i, or the
        virtual row -1.  The sub-list number kept in the LIST cell of that
        parent is created on first use, and the LENGTH of the matching H row
        is incremented.
        """
        progress = Progress(self._nbLines, "Getting table structure", self._verbosity-5)
        nextL = 0
        for i in xrange(self._nbLines):
            p = i - 1
            start = self._readValue(T, i, START)
            end = self._readValue(T, i, END)
            while p != -1 and (start < self._readValue(T, p, START) or end > self._readValue(T, p, END)):
                p = self._readValue(T, p, PARENT)
            thisL = self._readValue(T, p, LIST)
            if thisL == -1:
                # First child seen for this parent: allocate a new sub-list.
                thisL = nextL
                nextL += 1
                length = 0
                self._writeValue(T, p, LIST, thisL)
            else:
                length = self._readValue(H, thisL, LENGTH)
            self._writeValue(T, i, PARENT, p)
            self._writeValue(H, thisL, LENGTH, length + 1)
            progress.inc()
        progress.done()

    def _computeSubStart(self):
        """Turn the sub-list sizes of H into start rows (running sum) and
        reset LENGTH to 0, so that _computeAbsPosition can reuse it as a
        fill counter."""
        # The loop runs over the sub-lists, so the progress total is _nbLists.
        progress = Progress(self._nbLists, "Getting table sub-lists", self._verbosity-5)
        total = 0
        for i in xrange(self._nbLists):
            self._writeValue(H, i, START, total)
            total += self._readValue(H, i, LENGTH)
            self._writeValue(H, i, LENGTH, 0)
            progress.inc()
        progress.done()

    def _computeAbsPosition(self):
        """Write every row of T at its final place in L.

        The destination row is the start of the sub-list of the parent plus
        the number of rows already written in that sub-list; it is recorded
        in the NEW column of T so that children can point to it.  Also
        counts the rows of the first list (parent -1).
        """
        progress = Progress(self._nbLines, "Writing table", self._verbosity-5)
        self._sizeFirstList = 0
        for i in xrange(self._nbLines):
            s = self._readValue(T, i, START)
            e = self._readValue(T, i, END)
            a = self._readValue(T, i, ADDRESS)
            pt = self._readValue(T, i, PARENT)
            h = self._readValue(T, pt, LIST)
            pl = self._readValue(T, pt, NEW)
            nb = self._readValue(H, h, LENGTH)
            l = self._readValue(H, h, START) + nb
            self._writeValue(T, i, NEW, l)
            self._writeValue(L, l, START, s)
            self._writeValue(L, l, END, e)
            self._writeValue(L, l, ADDRESS, a)
            self._writeValue(L, l, LIST, -1)
            self._writeValue(L, l, PARENT, pl)
            self._writeValue(H, h, LENGTH, nb+1)
            if nb == 0:
                # First row of this sub-list: link the parent row to it.
                self._writeValue(L, pl, LIST, h)
            if pl == -1:
                self._sizeFirstList += 1
                # NOTE(review): the listing this code was recovered from had
                # lost its indentation; indexing only first-list rows is the
                # assumed nesting -- confirm against the repository.
                if self._createIndex:
                    self._index.addTranscript(e, l)
            progress.inc()
        progress.done()

    def closeFiles(self):
        """Close every open table handle and drop the parser.

        May be called several times.
        """
        for handle in self._handleDict.values():
            handle.close()
        # Reset rather than delete, so that a later call (or a table access)
        # does not fail on a missing attribute.
        self._handleDict = {}
        self._lHandle = None
        self._hHandle = None
        self._tHandle = None
        self._parser = None

    def openFiles(self):
        """Open the L and H files for reading and a parser on the transcript
        file."""
        self._lHandle = open(self._fileNameDict[L], "rb")
        self._hHandle = open(self._fileNameDict[H], "rb")
        self._handleDict = {H: self._hHandle, L: self._lHandle}
        self._parser = NCListFileUnpickle(self._transcriptFileName, self._verbosity)

    def _cleanFiles(self):
        """Close everything and delete the temporary T file."""
        self.closeFiles()
        os.remove(self._fileNameDict[T])

    def _getPosition(self, table, line, key):
        """Return the handle of *table*, positioned on column *key* of row
        *line* (the offsets set with setOffset() are not applied here)."""
        handle = self._handleDict[table]
        handle.seek(self._sizeDict[table] * line * LONG_SIZE + key * LONG_SIZE)
        return handle

    def _writeValue(self, table, line, key, value):
        """Store *value* in column *key* of row *line* of *table*; row -1 is
        kept in memory instead of the file."""
        if line == -1:
            self._missingValues[table][key] = value
            return
        self._getPosition(table, line, key).write(pack(value))

    def _readValue(self, table, line, key):
        """Return column *key* of row *line* of *table*; row -1 is read from
        memory instead of the file."""
        if line == -1:
            return self._missingValues[table][key]
        return unpack(self._getPosition(table, line, key).read(LONG_SIZE))

    def getIndex(self):
        """Return the NCIndex filled by buildLists(), or None if no index was
        built."""
        return self._index
