Mercurial > repos > iuc > ngsutils_bam_filter
diff ngsutils/support/bgzip.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsutils/support/bgzip.py Wed Nov 11 13:04:07 2015 -0500 @@ -0,0 +1,137 @@ +#!/usr/bin/env python +''' +Extract the blocks from a BGZip file. + +BAM files are stored as blocks in a bgzip archive. This class +will load the bgzip archive and output the block information. +''' + +import sys +import os +import struct + + +class BGZip(object): + def __init__(self, fname): + self.fname = fname + self.pos = 0 + self.fileobj = open(self.fname) + self.fsize = os.stat(self.fname).st_size + self.cur_chunk = 0 + + self.cpos = 0 + self.cdata = 0 + + def close(self): + self.fileobj.close() + + def seek(self, offset): + bgz_offset = offset >> 16 + chunk_offset = offset & 0xFFFF + + self.fileobj.seek(bgz_offset) + self.read_chunk() + self.chunk_pos = chunk_offset + + def read(self, amount, whence=0): + if whence not in [0, 1]: + print "Bad Whence value!: %s" % whence + return + + if whence == 0: + self.seek(0, 0) + + ### read into chunk, if not enough data in chunk, read next chunk + ret = '' + while amount and self.pos < self.fsize: + if len(self.cdata) - self.cpos < amount: + ret += self.cdata[self.cpos:self.cpos + amount] + self.cpos += amount + return ret + else: + ret += self.cdata[self.cpos:] + amount = amount - len(ret) + self.read_chunk() + return ret + + def read_chunk(self): + self.fileobj.seek(10) + id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH') + subpos = 0 + bsize = 0 + + while subpos < xlen: + si1, si2, slen = self._read_fields('<BBH') + if si1 == 66 and si2 == 67: + bsize, = self._read_fields('<H') + else: + self.fileobj.seek(slen, 1) + self.pos += slen + + subpos += 6 + slen + + cdata_size = bsize - xlen - 19 + self.cdata = self.fileobj.read(cdata_size) # inflate value + self.fileobj.seek(8) + + self.cur_chunk += 1 + self.cpos = 0 + + def dump(self): + self.fileobj.seek(0) + block_num = 0 + + while self.pos < self.fsize: + print "[BLOCK %s]" % block_num + # read header + id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH') + print 'id1: %s' % id1 + print 'id2: %s' % id2 + print 'cm: %s' % cm + print 'flg: %s' % flg + print 'mtime: %s' % mtime + print 'xfl: %s' % xfl + print 'os: %s' % os + print 'xlen: %s' % xlen + + # read subfields + subpos = 0 + bsize = 0 + + while subpos < xlen: + si1, si2, slen = self._read_fields('<BBH') + print ' si1: %s' % si1 + print ' si1: %s' % si2 + print ' slen: %s' % slen + print ' data: [%s]' % slen + + if si1 == 66 and si2 == 67: + bsize, = self._read_fields('<H') + else: + self.fileobj.seek(slen, 1) + self.pos += slen + + subpos += 6 + slen + + cdata_size = bsize - xlen - 19 + + print 'bsize: %s' % bsize + print 'cdata: [%s]' % (cdata_size) + + self.fileobj.seek(cdata_size, 1) + self.pos += cdata_size + crc, isize = self._read_fields('<II') + + print "crc: %s" % crc + print "isize: %s" % isize + # print "POS: %s" % self.pos + + block_num += 1 + + def _read_fields(self, field_types): + size = struct.calcsize(field_types) + self.pos += size + return struct.unpack(field_types, self.fileobj.read(size)) + +if __name__ == '__main__': + print BGZip(sys.argv[1]).dump()