Mercurial > repos > thondeboer > neat_genreads
view py/biopython_modified_bgzf.py @ 7:fc1c7b6fb7b6 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author | thondeboer |
---|---|
date | Tue, 15 May 2018 18:12:29 -0400 |
parents | 6e75a84e9338 |
children |
line wrap: on
line source
#!/usr/bin/env python
# Copyright 2010-2013 by Peter Cock.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Minimal BGZF (blocked gzip) writer, trimmed down from Biopython's bgzf module.

############################################################################
#######                                                              #######
#######    06/02/2015:                                               #######
#######       - I picked out the bits and pieces of code needed      #######
#######         to write BAM files, removed python 3.0 compatibility #######
#######                                                              #######
############################################################################
"""

import zlib
import struct

# Fixed 16-byte prefix of every BGZF block: gzip magic/flags with FEXTRA set,
# followed by the 'BC' extra sub-field header.  The 2-byte BSIZE value is
# appended right after it by BgzfWriter._write_block.
_bgzf_header = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00"

# A complete 28-byte empty BGZF block, written at end of file as the EOF marker.
_bgzf_eof = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"

# Largest amount of uncompressed data placed in a single block.
_BLOCK_SIZE = 65536

# BSIZE is stored as (total block size - 1) in an unsigned 16-bit field, and a
# block is 16 (header) + 2 (BSIZE) + payload + 8 (CRC32, ISIZE) bytes long, so
# the deflate payload may be at most 65535 - 25 bytes.
_MAX_COMPRESSED_SIZE = 65535 - 25


class BgzfWriter(object):
    """Write bytes to a file as a series of BGZF blocks.

    Data passed to write() is buffered and emitted in blocks of up to
    65536 uncompressed bytes; close() appends the BGZF EOF marker.  The
    object can be used as a context manager.
    """

    def __init__(self, filename=None, mode="w", fileobj=None, compresslevel=6):
        """Open ``filename`` for writing/appending, or wrap ``fileobj``.

        filename      -- path to open (always opened in binary mode).
        mode          -- must contain "w" or "a" when ``filename`` is used.
        fileobj       -- already-open binary handle to write to instead of
                         ``filename``; it is closed by close().
        compresslevel -- zlib compression level used for every block.

        Raises ValueError if both ``filename`` and ``fileobj`` are given, or
        if ``mode`` is neither a write nor an append mode.
        """
        mode_lower = mode.lower()
        if fileobj is not None:
            # Explicit check rather than an assert so that it survives -O.
            if filename is not None:
                raise ValueError("Give either filename or fileobj, not both")
            handle = fileobj
        else:
            if "w" not in mode_lower and "a" not in mode_lower:
                raise ValueError("Must use write or append mode, not %r" % mode)
            if "a" in mode_lower:
                handle = open(filename, "ab")
            else:
                handle = open(filename, "wb")
        self._text = "b" not in mode_lower
        self._handle = handle
        self._buffer = b""
        self.compresslevel = compresslevel

    def _write_block(self, block):
        """Compress ``block`` and write it to the handle as BGZF block(s).

        Normally exactly one block is written.  If ``block`` is longer than
        65536 bytes, or its compressed form is too large for the 16-bit BSIZE
        field (incompressible data), it is split in half and each half is
        written as its own block.
        """
        if len(block) <= _BLOCK_SIZE:
            # Giving a negative window bits means no gzip/zlib headers,
            # -15 used in samtools
            compressor = zlib.compressobj(self.compresslevel,
                                          zlib.DEFLATED,
                                          -15,
                                          zlib.DEF_MEM_LEVEL,
                                          0)
            compressed = compressor.compress(block) + compressor.flush()
            del compressor
            if len(compressed) <= _MAX_COMPRESSED_SIZE:
                bsize = struct.pack("<H", len(compressed) + 25)  # includes -1
                # Mask so the CRC is unsigned regardless of Python version.
                crc = struct.pack("<I", zlib.crc32(block) & 0xffffffff)
                uncompressed_length = struct.pack("<I", len(block))
                # No tell()/seek is used here, so non-seekable handles work.
                self._handle.write(_bgzf_header + bsize + compressed
                                   + crc + uncompressed_length)
                return
        # Too much data for one block.  Halving always terminates: deflate
        # expands incompressible input by only a few bytes, so any block of
        # 32768 bytes or fewer fits comfortably.
        half = len(block) // 2
        self._write_block(block[:half])
        self._write_block(block[half:])

    def write(self, data):
        """Buffer ``data`` and write out every complete 65536-byte block."""
        self._buffer += data
        if len(self._buffer) < _BLOCK_SIZE:
            return
        pending = self._buffer
        offset = 0
        try:
            # Walk an offset instead of re-slicing the remainder after every
            # block, which would copy the whole tail each time.
            while len(pending) - offset >= _BLOCK_SIZE:
                self._write_block(pending[offset:offset + _BLOCK_SIZE])
                offset += _BLOCK_SIZE
        finally:
            # Keep only what has not been written, even if a write failed.
            self._buffer = pending[offset:]

    def flush(self):
        """Write all buffered data as BGZF block(s) and flush the handle.

        The remaining buffer is always written as a block, so calling this
        with nothing buffered emits an empty block.
        """
        while len(self._buffer) >= _BLOCK_SIZE:
            self._write_block(self._buffer[:_BLOCK_SIZE])
            self._buffer = self._buffer[_BLOCK_SIZE:]
        self._write_block(self._buffer)
        self._buffer = b""
        self._handle.flush()

    def close(self):
        """Flush data, write 28 bytes empty BGZF EOF marker, and close the BGZF file."""
        try:
            if self._buffer:
                self.flush()
            # samtools will look for a magic EOF marker, just a 28 byte empty
            # BGZF block, and if it is missing warns the BAM file may be
            # truncated.  In addition to samtools writing this block, so too
            # does bgzip - so we should too.
            self._handle.write(_bgzf_eof)
            self._handle.flush()
        finally:
            # Release the handle even if flushing or writing the marker failed.
            self._handle.close()

    def __enter__(self):
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, type, value, traceback):
        """Close the writer on leaving the ``with`` block."""
        self.close()


if __name__ == "__main__":
    pass