comparison SMART/Java/Python/ncList/NCList.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 169d364ddd91
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os, os.path
32 import struct
33 import shelve
34 import sys
35 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
36 from SMART.Java.Python.ncList.NCIndex import NCIndex
37 from SMART.Java.Python.misc.Progress import Progress
38
# Size in bytes of one table cell: every cell is a native C long.
LONG_SIZE = struct.calcsize('l')

# Table identifiers, used as keys of the per-table dictionaries
# (cell sizes, offsets, file names, handles).
H, L, T, G = 0, 1, 2, 3

# Number of cells (longs) in one row of the H, L and T tables.
H_CELL_SIZE, L_CELL_SIZE, T_CELL_SIZE = 2, 5, 6

# Column indexes inside a row.  L rows use START..PARENT and T rows use
# START..NEW.
START, END, ADDRESS, LIST, PARENT, NEW = 0, 1, 2, 3, 4, 5

# H rows only have two columns: START (0) and LENGTH (1).  LENGTH
# deliberately shares index 1 with END.
LENGTH = 1
57
def pack(input):
    """Serialize *input* as one native C long.

    The value is converted to an integer first, so anything accepted by
    ``int()`` can be given.  The result is a byte string of
    ``struct.calcsize("l")`` bytes in native byte order.
    """
    # int() rather than long(): on Python 2 int() promotes to long by itself
    # when needed and struct.pack accepts both, while long() does not exist
    # on Python 3.
    return struct.pack("l", int(input))
def unpack(input):
    """Decode one native C long from the byte string *input*.

    *input* must be exactly ``struct.calcsize("l")`` bytes long, otherwise
    ``struct.error`` is raised.
    """
    (value,) = struct.unpack("l", input)
    return value
62
63
class NCList(object):
    """Build and read the binary tables of an NCList.

    (NCList presumably stands for "nested containment list" -- confirm
    against the project documentation.)

    All tables are flat files of native C longs (see ``pack``/``unpack``):

    * T (temporary, one row per transcript, in input order):
      START, END, ADDRESS (address given by the parser), LIST (id of the
      sub-list holding the children of this transcript), PARENT (T row of
      the containing transcript, -1 if none) and NEW (row of this
      transcript in L).  T is deleted once the build is over.
    * L (one row per transcript, grouped by sub-list):
      START, END, ADDRESS, LIST (H row of the children sub-list, -1 if
      none) and PARENT (L row of the containing transcript, -1 if none).
    * H (one row per sub-list): START (first L row of the sub-list) and
      LENGTH (number of elements of the sub-list).

    Row -1 never exists on disk: it stands for the virtual root that
    contains every top-level transcript, and its cells are kept in
    ``self._missingValues``.
    """

    def __init__(self, verbosity):
        """Create an empty list; *verbosity* is forwarded to the helpers."""
        self._verbosity = verbosity
        self._subPos = 0
        self._parentPos = 0
        self._nbLines = 0
        self._nbLists = 0
        # Initialised here so that getSizeFirstList() and getIndex() do not
        # raise AttributeError when called before buildLists().
        self._sizeFirstList = 0
        self._index = None
        self._chromosome = None
        self._transcriptFileName = None
        self._hFileName = None
        self._lFileName = None
        self._tFileName = None
        self._lHandle = None
        self._hHandle = None
        self._tHandle = None
        self._parser = None
        self._sizeDict = {H: H_CELL_SIZE, L: L_CELL_SIZE, T: T_CELL_SIZE}
        self._offsets = {H: 0, L: 0, G: 0}
        self._fileNameDict = {}
        self._handleDict = {}
        self._createIndex = False
        # Cells of the virtual row -1 of each table (see _readValue and
        # _writeValue).
        self._missingValues = dict((table, {}) for table in self._sizeDict)
        self._missingValues[T][LIST] = -1
        self._missingValues[L][LIST] = 0
        self._missingValues[T][NEW] = -1

    def __del__(self):
        """Close the read handles on L and H, if any is still set."""
        # getattr: __del__ also runs when __init__ did not complete.
        for attribute in ("_lHandle", "_hHandle"):
            handle = getattr(self, attribute, None)
            if handle is not None:
                handle.close()

    def createIndex(self, boolean):
        """Tell whether buildLists() must also fill an NCIndex."""
        self._createIndex = boolean

    def setChromosome(self, chromosome):
        """Set the chromosome; must be set before setFileName() for the
        table file names to be computed."""
        self._chromosome = chromosome

    def setFileName(self, fileName):
        """Set the transcript file, open a parser on it and derive the
        names of the table files."""
        self._transcriptFileName = fileName
        self._parser = NCListFileUnpickle(fileName, self._verbosity)
        self._setFileNames(fileName)

    def setNbElements(self, nbElements):
        """Set the number of transcripts (rows of T and L)."""
        self._nbLines = nbElements

    def setOffset(self, fileType, offset):
        """Set the offset added to every address used when reading table
        *fileType* (H, L or G)."""
        self._offsets[fileType] = offset

    def _setFileNames(self, fileName):
        """Compute the H, L and T file names from *fileName*.

        Nothing is done when the chromosome or the file name is not set.
        The extension of *fileName* is dropped; when the SMARTTMPPATH
        environment variable exists, the result is joined to it.
        """
        if self._chromosome is None or fileName is None:
            return
        coreName = os.path.splitext(fileName)[0]
        if "SMARTTMPPATH" in os.environ:
            coreName = os.path.join(os.environ["SMARTTMPPATH"], coreName)
        self._hFileName = "%s_H.bin" % (coreName)
        self._lFileName = "%s_L.bin" % (coreName)
        self._tFileName = "%s_T.bin" % (coreName)
        self._fileNameDict = {H: self._hFileName, L: self._lFileName, T: self._tFileName}

    def getSizeFirstList(self):
        """Return the number of top-level transcripts counted by
        buildLists() (0 before the build)."""
        return self._sizeFirstList

    def _writeSubListIntoH(self, SubListAddr, SubListLength):
        """Append one (address, length) row at the current position of H."""
        self._hHandle.write(pack(SubListAddr))
        self._hHandle.write(pack(SubListLength))
        self._subPos += H_CELL_SIZE

    def _writeParentIntoL(self, readAddr, subListAddr, parentAddr, start, end):
        """Append one row at the current position of L, in the on-disk
        order start, end, address, sub-list, parent."""
        for value in (start, end, readAddr, subListAddr, parentAddr):
            self._lHandle.write(pack(value))
        self._parentPos += L_CELL_SIZE

    def getLLineElements(self, subListLAddr):
        """Read row *subListLAddr* of L.

        Returns the tuple (address, H row of the sub-list, L row of the
        parent, start, end).  Five -1 are returned when the address is -1
        or None, or when the row lies past the end of the file.
        """
        missing = (-1, -1, -1, -1, -1)
        if subListLAddr is None or subListLAddr == -1:
            return missing
        self._lHandle.seek(subListLAddr * L_CELL_SIZE * LONG_SIZE + self._offsets[L])
        rawStart = self._lHandle.read(LONG_SIZE)
        if len(rawStart) < LONG_SIZE:
            return missing
        start = unpack(rawStart)
        end = unpack(self._lHandle.read(LONG_SIZE))
        gff3Addr = unpack(self._lHandle.read(LONG_SIZE))
        subListHAddr = unpack(self._lHandle.read(LONG_SIZE))
        parentLAddr = unpack(self._lHandle.read(LONG_SIZE))
        return gff3Addr, subListHAddr, parentLAddr, start, end

    def getHLineElements(self, subListHAddr):
        """Read row *subListHAddr* of H.

        Returns (first L row of the sub-list, number of elements), or
        (-1, -1) when the row lies past the end of the file.
        """
        self._hHandle.seek(subListHAddr * H_CELL_SIZE * LONG_SIZE + self._offsets[H])
        subListStartBin = self._hHandle.read(LONG_SIZE)
        # Compare with LONG_SIZE, not a literal 8: a native long is 4 bytes
        # on some platforms, where every read would look truncated.
        if len(subListStartBin) < LONG_SIZE:
            return -1, -1
        subListStart = unpack(subListStartBin)
        subListElementsNb = unpack(self._hHandle.read(LONG_SIZE))
        return subListStart, subListElementsNb

    def getRefGffAddr(self, currentRefLAddr):
        """Return the transcript address stored in row *currentRefLAddr*
        of L (-1 if the row is missing)."""
        return self.getLLineElements(currentRefLAddr)[0]

    def getIntervalFromAdress(self, address):
        """Return the transcript the parser reads at *address* (shifted by
        the G offset)."""
        self._parser.gotoAddress(int(address) + self._offsets[G])
        return self._parser.getNextTranscript()

    def removeFiles(self):
        """Do nothing; kept as a no-op."""
        return

    def buildLists(self):
        """Build the H and L tables from the transcript file.

        The T table is created, used and deleted along the way, and all
        the handles are closed at the end: call openFiles() before reading.
        """
        if self._createIndex:
            self._index = NCIndex(self._verbosity)
        self._createTables()
        self._labelLists()
        self._computeSubStart()
        self._computeAbsPosition()
        self._cleanFiles()

    def _createTables(self):
        """Count the lists, then create and pre-fill the three tables."""
        self._initLists()
        self._createTable(H, self._nbLists)
        self._createTable(T, self._nbLines)
        self._createTable(L, self._nbLines)
        self._fillTables()

    def _initLists(self):
        """Compute the number of rows reserved in H.

        One list for the top level, plus one for each transcript that is
        included in the transcript read just before it.
        """
        previousTranscript = None
        self._nbLists = 1
        progress = Progress(self._nbLines, "Initializing lists", self._verbosity-5)
        for transcript in self._parser.getIterator():
            if self._isIncluded(transcript, previousTranscript):
                self._nbLists += 1
            previousTranscript = transcript
            progress.inc()
        progress.done()

    def _isIncluded(self, transcript1, transcript2):
        """Tell whether *transcript1* lies within *transcript2* (bounds
        included); False when either is None."""
        if transcript1 is None or transcript2 is None:
            return False
        return transcript1.getStart() >= transcript2.getStart() and transcript1.getEnd() <= transcript2.getEnd()

    def _createTable(self, name, size):
        """Create the file of table *name* with *size* rows, every cell
        set to -1, and register its handle."""
        nbCells = self._sizeDict[name] * size
        handle = open(self._fileNameDict[name], "w+b")
        progress = Progress(nbCells, "Initializing table %d" % (name), self._verbosity-5)
        emptyCell = pack(-1)
        for _ in xrange(nbCells):
            handle.write(emptyCell)
            progress.inc()
        progress.done()
        self._handleDict[name] = handle

    def _fillTables(self):
        """Copy the transcripts into T and reset the lengths in H."""
        progress = Progress(self._nbLines, "Filling table T", self._verbosity-5)
        for i, transcript in enumerate(self._parser.getIterator()):
            self._writeValue(T, i, START, transcript.getStart())
            self._writeValue(T, i, END, transcript.getEnd())
            self._writeValue(T, i, ADDRESS, self._parser.getCurrentTranscriptAddress())
            self._writeValue(T, i, PARENT, -1)
            self._writeValue(T, i, LIST, -1)
            progress.inc()
        progress.done()
        progress = Progress(self._nbLists, "Filling table H", self._verbosity-5)
        for i in xrange(self._nbLists):
            self._writeValue(H, i, LENGTH, 0)
            progress.inc()
        progress.done()

    def _labelLists(self):
        """Find the parent of every transcript and size every sub-list.

        For each T row, the candidate parent is the previous row; the
        PARENT links are followed upwards until a row containing the
        transcript is found, or the virtual root (-1) is reached.  The
        sub-list of that parent gets an id the first time it is needed.
        """
        progress = Progress(self._nbLines, "Getting table structure", self._verbosity-5)
        nextL = 0
        for i in xrange(self._nbLines):
            p = i - 1
            start = self._readValue(T, i, START)
            end = self._readValue(T, i, END)
            # Climb while row p does not contain [start, end].
            while p != -1 and (start < self._readValue(T, p, START) or end > self._readValue(T, p, END)):
                p = self._readValue(T, p, PARENT)
            thisL = self._readValue(T, p, LIST)
            if thisL == -1:
                # First child of p: allocate a new sub-list id.
                thisL = nextL
                nextL += 1
                length = 0
                self._writeValue(T, p, LIST, thisL)
            else:
                length = self._readValue(H, thisL, LENGTH)
            self._writeValue(T, i, PARENT, p)
            self._writeValue(H, thisL, LENGTH, length + 1)
            progress.inc()
        progress.done()

    def _computeSubStart(self):
        """Turn the lengths of H into start rows.

        START of each list is the sum of the lengths of the previous
        lists; LENGTH is reset to 0 because _computeAbsPosition() uses it
        as a fill counter.
        """
        # The loop runs over the lists, so the progress bar is sized with
        # the number of lists.
        progress = Progress(self._nbLists, "Getting table sub-lists", self._verbosity-5)
        total = 0
        for i in xrange(self._nbLists):
            self._writeValue(H, i, START, total)
            total += self._readValue(H, i, LENGTH)
            self._writeValue(H, i, LENGTH, 0)
            progress.inc()
        progress.done()

    def _computeAbsPosition(self):
        """Write every transcript at its final row of L.

        Also links each parent to its sub-list, counts the top-level
        transcripts and, when requested, feeds the index.
        """
        progress = Progress(self._nbLines, "Writing table", self._verbosity-5)
        self._sizeFirstList = 0
        for i in xrange(self._nbLines):
            start = self._readValue(T, i, START)
            end = self._readValue(T, i, END)
            address = self._readValue(T, i, ADDRESS)
            parentT = self._readValue(T, i, PARENT)
            # Sub-list this transcript belongs to, and L row of its parent
            # (both come from the virtual row when parentT is -1).
            listId = self._readValue(T, parentT, LIST)
            parentL = self._readValue(T, parentT, NEW)
            nbPlaced = self._readValue(H, listId, LENGTH)
            row = self._readValue(H, listId, START) + nbPlaced
            self._writeValue(T, i, NEW, row)
            self._writeValue(L, row, START, start)
            self._writeValue(L, row, END, end)
            self._writeValue(L, row, ADDRESS, address)
            self._writeValue(L, row, LIST, -1)
            self._writeValue(L, row, PARENT, parentL)
            self._writeValue(H, listId, LENGTH, nbPlaced + 1)
            if nbPlaced == 0:
                # First element of the sub-list: link the parent to it.
                self._writeValue(L, parentL, LIST, listId)
            if parentL == -1:
                self._sizeFirstList += 1
            if self._createIndex:
                self._index.addTranscript(end, row)
            progress.inc()
        progress.done()

    def closeFiles(self):
        """Close every registered handle and forget the parser.

        May be called several times in a row.
        """
        for handle in self._handleDict.values():
            handle.close()
        # Reset rather than delete the dictionary, so that a second call
        # (or a later _createTable) does not raise AttributeError.
        self._handleDict = {}
        self._lHandle = None
        self._hHandle = None
        self._tHandle = None
        self._parser = None

    def openFiles(self):
        """Open L and H for reading and create a new parser on the
        transcript file."""
        self._lHandle = open(self._fileNameDict[L], "rb")
        self._hHandle = open(self._fileNameDict[H], "rb")
        self._handleDict = {H: self._hHandle, L: self._lHandle}
        self._parser = NCListFileUnpickle(self._transcriptFileName, self._verbosity)

    def _cleanFiles(self):
        """Close all the files and delete the temporary T table."""
        self.closeFiles()
        os.remove(self._fileNameDict[T])

    def _getPosition(self, table, line, key):
        """Move the handle of *table* to cell *key* of row *line* and
        return it.  self._offsets is not applied here."""
        handle = self._handleDict[table]
        handle.seek(self._sizeDict[table] * line * LONG_SIZE + key * LONG_SIZE)
        return handle

    def _writeValue(self, table, line, key, value):
        """Write *value* in cell *key* of row *line* of *table*; row -1 is
        kept in memory instead of on disk."""
        if line == -1:
            self._missingValues[table][key] = value
            return
        handle = self._getPosition(table, line, key)
        handle.write(pack(value))

    def _readValue(self, table, line, key):
        """Return cell *key* of row *line* of *table*; row -1 is read from
        the in-memory values."""
        if line == -1:
            return self._missingValues[table][key]
        handle = self._getPosition(table, line, key)
        return unpack(handle.read(LONG_SIZE))

    def getIndex(self):
        """Return the NCIndex filled by buildLists(), or None when no
        index was built."""
        return self._index