Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/ncList/NCList.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children | 169d364ddd91 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/NCList.py Fri Jan 18 04:54:14 2013 -0500 @@ -0,0 +1,337 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import os, os.path +import struct +import shelve +import sys +from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle +from SMART.Java.Python.ncList.NCIndex import NCIndex +from SMART.Java.Python.misc.Progress import Progress + +LONG_SIZE = struct.calcsize('l') + +H = 0 +L = 1 +T = 2 +G = 3 + +H_CELL_SIZE = 2 +L_CELL_SIZE = 5 +T_CELL_SIZE = 6 + +START = 0 +END = 1 +ADDRESS = 2 +LIST = 3 +PARENT = 4 +NEW = 5 +LENGTH = 1 + +def pack(input): + return struct.pack("l", long(input)) +def unpack(input): + return struct.unpack("l", input)[0] + + +class NCList(object): + + def __init__(self, verbosity): + self._verbosity = verbosity + self._subPos = 0 + self._parentPos = 0 + self._nbLines = 0 + self._nbLists = 0 + self._chromosome = None + self._transcriptFileName = None + self._lHandle = None + self._hHandle = None + self._tHandle = None + self._parser = None + self._sizeDict = {H: H_CELL_SIZE, L: L_CELL_SIZE, T: T_CELL_SIZE} + self._offsets = {H: 0, L: 0, G: 0} + self._fileNameDict = {} + self._handleDict = {} + self._createIndex = False + self._missingValues = dict([table, {}] for table in self._sizeDict) + self._missingValues[T][LIST] = -1 + self._missingValues[L][LIST] = 0 + self._missingValues[T][NEW] = -1 + + def __del__(self): + for handle in (self._lHandle, self._hHandle): + if handle != None: + handle.close() + + def createIndex(self, boolean): + self._createIndex = boolean + + def setChromosome(self, chromosome): + self._chromosome = chromosome + + def setFileName(self, fileName): + self._transcriptFileName = fileName + self._parser = NCListFileUnpickle(fileName, self._verbosity) + self._setFileNames(fileName) + + def setNbElements(self, nbElements): + self._nbLines = nbElements + + def setOffset(self, fileType, offset): + self._offsets[fileType] = offset + + def _setFileNames(self, fileName): + if self._chromosome != None and fileName != None: + coreName = os.path.splitext(fileName)[0] + if "SMARTTMPPATH" in os.environ: + coreName = os.path.join(os.environ["SMARTTMPPATH"], coreName) + self._hFileName = "%s_H.bin" % (coreName) + self._lFileName = "%s_L.bin" % (coreName) + self._tFileName = "%s_T.bin" % (coreName) + self._fileNameDict = {H: self._hFileName, L: self._lFileName, T: self._tFileName} + + def getSizeFirstList(self): + return self._sizeFirstList + + def _writeSubListIntoH(self, SubListAddr, SubListLength): + self._hHandle.write(pack(SubListAddr)) + self._hHandle.write(pack(SubListLength)) + self._subPos += H_CELL_SIZE + + def _writeParentIntoL(self, readAddr, subListAddr, parentAddr, start, end): + self._lHandle.write(pack(start)) + self._lHandle.write(pack(end)) + self._lHandle.write(pack(readAddr)) + self._lHandle.write(pack(subListAddr)) + self._lHandle.write(pack(parentAddr)) + self._parentPos += L_CELL_SIZE + + def getLLineElements(self, subListLAddr): + if subListLAddr == -1 or subListLAddr == None: + #print "reading bad from L", subListLAddr + return -1, -1, -1, -1, -1 + else: + self._lHandle.seek(subListLAddr * L_CELL_SIZE * LONG_SIZE + self._offsets[L]) + start = self._lHandle.read(LONG_SIZE) + if len(start) < LONG_SIZE: + #print "reading very bad from L", subListLAddr + return -1, -1, -1, -1, -1 + start = unpack(start) + end = unpack(self._lHandle.read(LONG_SIZE)) + gff3Addr = unpack(self._lHandle.read(LONG_SIZE)) + subListHAddr = unpack(self._lHandle.read(LONG_SIZE)) + parentLAddr = unpack(self._lHandle.read(LONG_SIZE)) + #print "reading from L", subListLAddr, "-->", gff3Addr, subListHAddr, parentLAddr, start, end + return gff3Addr, subListHAddr, parentLAddr, start, end + + def getHLineElements(self, subListHAddr): + self._hHandle.seek(subListHAddr * H_CELL_SIZE * LONG_SIZE + self._offsets[H]) + subListStartBin = self._hHandle.read(LONG_SIZE) + if len(subListStartBin) < 8 : + #print "reading bad from H" + return -1, -1 + subListStart = unpack(subListStartBin) + subListElementsNb = unpack(self._hHandle.read(LONG_SIZE)) + #print "reading from H", subListHAddr, "-->", subListStart, subListElementsNb + return subListStart, subListElementsNb + + def getRefGffAddr(self, currentRefLAddr): + RefGff3Addr, subListHAddr, parentLAddr, start, end = self.getLLineElements(currentRefLAddr) + return RefGff3Addr + + def getIntervalFromAdress(self, address): + self._parser.gotoAddress(int(address) + self._offsets[G]) + iTranscrit = self._parser.getNextTranscript() + return iTranscrit + + def removeFiles(self): + return + + def buildLists(self): + if self._createIndex: + self._index = NCIndex(self._verbosity) + self._createTables() + self._labelLists() + self._computeSubStart() + self._computeAbsPosition() + self._cleanFiles() + + def _createTables(self): + self._initLists() + self._createTable(H, self._nbLists) + self._createTable(T, self._nbLines) + self._createTable(L, self._nbLines) + self._fillTables() + + def _initLists(self): + previousTranscript = None + self._nbLists = 1 + progress = Progress(self._nbLines, "Initializing lists", self._verbosity-5) + for transcript in self._parser.getIterator(): + if self._isIncluded(transcript, previousTranscript): + self._nbLists += 1 + previousTranscript = transcript + progress.inc() + progress.done() + + def _isIncluded(self, transcript1, transcript2): + return transcript1 != None and transcript2 != None and transcript1.getStart() >= transcript2.getStart() and transcript1.getEnd() <= transcript2.getEnd() + + def _createTable(self, name, size): + handle = open(self._fileNameDict[name], "w+b") + progress = Progress(self._sizeDict[name] * size, "Initializing table %d" % (name), self._verbosity-5) + for i in xrange(self._sizeDict[name] * size): + handle.write(pack(-1)) + progress.inc() + progress.done() + self._handleDict[name] = handle + + def _fillTables(self): + progress = Progress(self._nbLines, "Filling table T", self._verbosity-5) + for i, transcript in enumerate(self._parser.getIterator()): + self._writeValue(T, i, START, transcript.getStart()) + self._writeValue(T, i, END, transcript.getEnd()) + self._writeValue(T, i, ADDRESS, self._parser.getCurrentTranscriptAddress()) + self._writeValue(T, i, PARENT, -1) + self._writeValue(T, i, LIST, -1) + progress.inc() + progress.done() + progress = Progress(self._nbLists, "Filling table H", self._verbosity-5) + for i in xrange(self._nbLists): + self._writeValue(H, i, LENGTH, 0) + progress.inc() + progress.done() + + def _labelLists(self): + progress = Progress(self._nbLines, "Getting table structure", self._verbosity-5) + nextL = 0 + for i in xrange(self._nbLines): + p = i - 1 + start = self._readValue(T, i, START) + end = self._readValue(T, i, END) + while p != -1 and (start < self._readValue(T, p, START) or end > self._readValue(T, p, END)): + p = self._readValue(T, p, PARENT) + thisL = self._readValue(T, p, LIST) + if thisL == -1: + #print "entering" + thisL = nextL + nextL += 1 + length = 0 + self._writeValue(T, p, LIST, thisL) + else: + length = self._readValue(H, thisL, LENGTH) + self._writeValue(T, i, PARENT, p) + self._writeValue(H, thisL, LENGTH, length + 1) + progress.inc() + progress.done() + + def _computeSubStart(self): + progress = Progress(self._nbLines, "Getting table sub-lists", self._verbosity-5) + total = 0 + for i in xrange(self._nbLists): + self._writeValue(H, i, START, total) + total += self._readValue(H, i, LENGTH) + self._writeValue(H, i, LENGTH, 0) + progress.inc() + progress.done() + + def _computeAbsPosition(self): + progress = Progress(self._nbLines, "Writing table", self._verbosity-5) + self._sizeFirstList = 0 + for i in xrange(self._nbLines): + s = self._readValue(T, i, START) + e = self._readValue(T, i, END) + a = self._readValue(T, i, ADDRESS) + pt = self._readValue(T, i, PARENT) + h = self._readValue(T, pt, LIST) + pl = self._readValue(T, pt, NEW) + nb = self._readValue(H, h, LENGTH) + l = self._readValue(H, h, START) + nb + self._writeValue(T, i, NEW, l) + self._writeValue(L, l, START, s) + self._writeValue(L, l, END, e) + self._writeValue(L, l, ADDRESS, a) + self._writeValue(L, l, LIST, -1) + self._writeValue(L, l, PARENT, pl) + self._writeValue(H, h, LENGTH, nb+1) + if nb == 0: + #print "adding it" + self._writeValue(L, pl, LIST, h) + if pl == -1: + self._sizeFirstList += 1 + if self._createIndex: + self._index.addTranscript(e, l) + progress.inc() + progress.done() + + def closeFiles(self): + for handle in self._handleDict.values(): + handle.close() + del self._handleDict + self._lHandle = None + self._hHandle = None + self._tHandle = None + self._parser = None + + def openFiles(self): + self._lHandle = open(self._fileNameDict[L], "rb") + self._hHandle = open(self._fileNameDict[H], "rb") + self._handleDict = {H: self._hHandle, L: self._lHandle} + self._parser = NCListFileUnpickle(self._transcriptFileName, self._verbosity) + + def _cleanFiles(self): + self.closeFiles() + os.remove(self._fileNameDict[T]) + + def _getPosition(self, table, line, key): + handle = self._handleDict[table] + handle.seek(self._sizeDict[table] * line * LONG_SIZE + key * LONG_SIZE) + return handle + + def _writeValue(self, table, line, key, value): + #print "writing", table, line, key, "<-", value + if line == -1: + self._missingValues[table][key] = value + return + handle = self._getPosition(table, line, key) + handle.write(pack(value)) + + def _readValue(self, table, line, key): + #print "reading", table, line, key, "->", + if line == -1: + #print self._missingValues[table][key] + return self._missingValues[table][key] + handle = self._getPosition(table, line, key) + r = unpack(handle.read(LONG_SIZE)) + #print r + return r + + def getIndex(self): + return self._index