diff ezBAMQC/src/htslib/cram/cram_io.h @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ezBAMQC/src/htslib/cram/cram_io.h	Thu Mar 24 17:12:52 2016 -0400
@@ -0,0 +1,596 @@
+/*
+Copyright (c) 2012-2014 Genome Research Ltd.
+Author: James Bonfield <jkb@sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, 
+this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions and the following disclaimer in the documentation 
+and/or other materials provided with the distribution.
+
+   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*! \file
+ * Include cram.h instead.
+ *
+ * This is an internal part of the CRAM system and is automatically included
+ * when you #include cram.h.
+ *
+ * Implements the low level CRAM I/O primitives.
+ * This includes basic data types such as byte, int, ITF-8,
+ * maps, bitwise I/O, etc.
+ */
+
+#ifndef _CRAM_IO_H_
+#define _CRAM_IO_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ITF8_MACROS
+
+#include <stdint.h>
+#include <cram/misc.h>
+
+/**@{ ----------------------------------------------------------------------
+ * ITF8 encoding and decoding.
+ *
+ * Also see the itf8_get and itf8_put macros.
+ */
+
+/*! INTERNAL: Converts two characters into an integer for use in switch{} */
+#define CRAM_KEY(a,b) (((a)<<8)|((b)))
+
+/*! Reads an integer in ITF-8 encoding from 'fd' and stores it in
+ * *val.
+ *
+ * @return
+ * Returns the number of bytes read on success;
+ *        -1 on failure
+ */
+int itf8_decode(cram_fd *fd, int32_t *val);
+
+#ifndef ITF8_MACROS
+/*! Reads an integer in ITF-8 encoding from 'cp' and stores it in
+ * *val.
+ *
+ * @return
+ * Returns the number of bytes read on success;
+ *        -1 on failure
+ */
+int itf8_get(char *cp, int32_t *val_p);
+
+/*! Stores a value to memory in ITF-8 format.
+ *
+ * @return
+ * Returns the number of bytes required to store the number.
+ * This is a maximum of 5 bytes.
+ */
+int itf8_put(char *cp, int32_t val);
+
+#else
+
+/*
+ * Macro implementations of the above
+ */
+#define itf8_get(c,v) (((uc)(c)[0]<0x80)?(*(v)=(uc)(c)[0],1):(((uc)(c)[0]<0xc0)?(*(v)=(((uc)(c)[0]<<8)|(uc)(c)[1])&0x3fff,2):(((uc)(c)[0]<0xe0)?(*(v)=(((uc)(c)[0]<<16)|((uc)(c)[1]<<8)|(uc)(c)[2])&0x1fffff,3):(((uc)(c)[0]<0xf0)?(*(v)=(((uc)(c)[0]<<24)|((uc)(c)[1]<<16)|((uc)(c)[2]<<8)|(uc)(c)[3])&0x0fffffff,4):(*(v)=(((uc)(c)[0]&0x0f)<<28)|((uc)(c)[1]<<20)|((uc)(c)[2]<<12)|((uc)(c)[3]<<4)|((uc)(c)[4]&0x0f),5)))))
+
+#define itf8_put(c,v) ((!((v)&~0x7f))?((c)[0]=(v),1):(!((v)&~0x3fff))?((c)[0]=((v)>>8)|0x80,(c)[1]=(v)&0xff,2):(!((v)&~0x1fffff))?((c)[0]=((v)>>16)|0xc0,(c)[1]=((v)>>8)&0xff,(c)[2]=(v)&0xff,3):(!((v)&~0xfffffff))?((c)[0]=((v)>>24)|0xe0,(c)[1]=((v)>>16)&0xff,(c)[2]=((v)>>8)&0xff,(c)[3]=(v)&0xff,4):((c)[0]=0xf0|(((v)>>28)&0xff),(c)[1]=((v)>>20)&0xff,(c)[2]=((v)>>12)&0xff,(c)[3]=((v)>>4)&0xff,(c)[4]=(v)&0xf,5))
+
+#define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5)
+
+#endif
+
+int ltf8_get(char *cp, int64_t *val_p);
+int ltf8_put(char *cp, int64_t val);
+
+/*! Pushes a value in ITF8 format onto the end of a block.
+ *
+ * This shouldn't be used for high-volume data as it is not the fastest
+ * method.
+ *
+ * @return
+ * Returns the number of bytes written
+ */
+int itf8_put_blk(cram_block *blk, int val);
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * CRAM blocks - the dynamically growable data block. We have code to
+ * create, update, (un)compress and read/write.
+ *
+ * These are derived from the deflate_interlaced.c blocks, but with the
+ * CRAM extension of content types and IDs.
+ */
+
+/*! Allocates a new cram_block structure with a specified content_type and
+ * id.
+ *
+ * @return
+ * Returns block pointer on success;
+ *         NULL on failure
+ */
+cram_block *cram_new_block(enum cram_content_type content_type,
+			   int content_id);
+
+/*! Reads a block from a cram file.
+ *
+ * @return
+ * Returns cram_block pointer on success;
+ *         NULL on failure
+ */
+cram_block *cram_read_block(cram_fd *fd);
+
+/*! Writes a CRAM block.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_write_block(cram_fd *fd, cram_block *b);
+
+/*! Frees a CRAM block, deallocating internal data too.
+ */
+void cram_free_block(cram_block *b);
+
+/*! Uncompress a memory block using Zlib.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size);
+
+/*! Uncompresses a CRAM block, if compressed.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_uncompress_block(cram_block *b);
+
+/*! Compresses a block.
+ *
+ * Compresses a block using one of two different zlib strategies. If we only
+ * want one choice set strat2 to be -1.
+ *
+ * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
+ * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
+ * significantly faster.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
+			int method, int level);
+
+cram_metrics *cram_new_metrics(void);
+char *cram_block_method2str(enum cram_block_method m);
+char *cram_content_type2str(enum cram_content_type t);
+
+/* --- Accessor macros for manipulating blocks on a byte by byte basis --- */
+
+/* Block size and data pointer. */
+#define BLOCK_SIZE(b) ((b)->byte)
+#define BLOCK_DATA(b) ((b)->data)
+
+/* Returns the address one past the end of the block */
+#define BLOCK_END(b) (&(b)->data[(b)->byte])
+
+/* Request block to be at least 'l' bytes long */
+#define BLOCK_RESIZE(b,l)					\
+    do {							\
+	while((b)->alloc <= (l)) {				\
+	    (b)->alloc = (b)->alloc ? (b)->alloc*1.5 : 1024;	\
+	    (b)->data = realloc((b)->data, (b)->alloc);		\
+	}							\
+     } while(0)
+
+/* Ensure the block can hold at least another 'l' bytes */
+#define BLOCK_GROW(b,l) BLOCK_RESIZE((b), BLOCK_SIZE((b)) + (l))
+
+/* Append string 's' of length 'l' */
+#define BLOCK_APPEND(b,s,l)		  \
+    do {				  \
+        BLOCK_GROW((b),(l));		  \
+        memcpy(BLOCK_END((b)), (s), (l)); \
+	BLOCK_SIZE((b)) += (l);		  \
+    } while (0)
+
+/* Append as single character 'c' */
+#define BLOCK_APPEND_CHAR(b,c)		  \
+    do {				  \
+        BLOCK_GROW((b),1);		  \
+	(b)->data[(b)->byte++] = (c);	  \
+    } while (0)
+
+/* Append a single unsigned integer */
+#define BLOCK_APPEND_UINT(b,i)		             \
+    do {					     \
+        unsigned char *cp;			     \
+        BLOCK_GROW((b),11);			     \
+	cp = &(b)->data[(b)->byte];		     \
+        (b)->byte += append_uint32(cp, (i)) - cp;	\
+    } while (0)
+
+static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) {
+    uint32_t j;
+
+    if (i == 0) {
+	*cp++ = '0';
+	return cp;
+    }
+
+    if (i < 100)        goto b1;
+    if (i < 10000)      goto b3;
+    if (i < 1000000)    goto b5;
+    if (i < 100000000)  goto b7;
+
+    if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
+    if ((j = i / 100000000))  {*cp++ = j + '0'; i -= j*100000000;  goto x7;}
+ b7:if ((j = i / 10000000))   {*cp++ = j + '0'; i -= j*10000000;   goto x6;}
+    if ((j = i / 1000000))    {*cp++ = j + '0', i -= j*1000000;    goto x5;}
+ b5:if ((j = i / 100000))     {*cp++ = j + '0', i -= j*100000;     goto x4;}
+    if ((j = i / 10000))      {*cp++ = j + '0', i -= j*10000;      goto x3;}
+ b3:if ((j = i / 1000))       {*cp++ = j + '0', i -= j*1000;       goto x2;}
+    if ((j = i / 100))        {*cp++ = j + '0', i -= j*100;        goto x1;}
+ b1:if ((j = i / 10))         {*cp++ = j + '0', i -= j*10;         goto x0;}
+    if (i)                     *cp++ = i + '0';
+    return cp;
+
+ x8: *cp++ = i / 100000000 + '0', i %= 100000000;
+ x7: *cp++ = i / 10000000  + '0', i %= 10000000;
+ x6: *cp++ = i / 1000000   + '0', i %= 1000000;
+ x5: *cp++ = i / 100000    + '0', i %= 100000;
+ x4: *cp++ = i / 10000     + '0', i %= 10000;
+ x3: *cp++ = i / 1000      + '0', i %= 1000;
+ x2: *cp++ = i / 100       + '0', i %= 100;
+ x1: *cp++ = i / 10        + '0', i %= 10;
+ x0: *cp++ = i             + '0';
+
+    return cp;
+}
+
+static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) {
+    *cp++ = i / 100000000 + '0', i %= 100000000;
+    *cp++ = i / 10000000  + '0', i %= 10000000;
+    *cp++ = i / 1000000   + '0', i %= 1000000;
+    *cp++ = i / 100000    + '0', i %= 100000;
+    *cp++ = i / 10000     + '0', i %= 10000;
+    *cp++ = i / 1000      + '0', i %= 1000;
+    *cp++ = i / 100       + '0', i %= 100;
+    *cp++ = i / 10        + '0', i %= 10;
+    *cp++ = i             + '0';
+
+    return cp;
+}
+
+static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) {
+    uint64_t j;
+
+    if (i <= 0xffffffff)
+	return append_uint32(cp, i);
+
+    if ((j = i/1000000000) > 1000000000) {
+	cp = append_uint32(cp, j/1000000000);
+	j %= 1000000000;
+	cp = append_sub32(cp, j);
+    } else {
+	cp = append_uint32(cp, i / 1000000000);
+    }
+    cp = append_sub32(cp, i % 1000000000);
+
+    return cp;
+}
+
+#define BLOCK_UPLEN(b) \
+    (b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Reference sequence handling
+ */
+
+/*! Loads a reference set from fn and stores in the cram_fd.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_load_reference(cram_fd *fd, char *fn);
+
+/*! Generates a lookup table in refs based on the SQ headers in SAM_hdr.
+ *
+ * Indexes references by the order they appear in a BAM file. This may not
+ * necessarily be the same order they appear in the fasta reference file.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int refs2id(refs_t *r, SAM_hdr *bfd);
+
+void refs_free(refs_t *r);
+
+/*! Returns a portion of a reference sequence from start to end inclusive.
+ *
+ * The returned pointer is owned by the cram_file fd and should not be freed
+ * by the caller. It is valid only until the next cram_get_ref is called
+ * with the same fd parameter (so is thread-safe if given multiple files).
+ *
+ * To return the entire reference sequence, specify start as 1 and end
+ * as 0.
+ *
+ * @return
+ * Returns reference on success;
+ *         NULL on failure
+ */
+char *cram_get_ref(cram_fd *fd, int id, int start, int end);
+void cram_ref_incr(refs_t *r, int id);
+void cram_ref_decr(refs_t *r, int id);
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Containers
+ */
+
+/*! Creates a new container, specifying the maximum number of slices
+ * and records permitted.
+ *
+ * @return
+ * Returns cram_container ptr on success;
+ *         NULL on failure
+ */
+cram_container *cram_new_container(int nrec, int nslice);
+void cram_free_container(cram_container *c);
+
+/*! Reads a container header.
+ *
+ * @return
+ * Returns cram_container on success;
+ *         NULL on failure or no container left (fd->err == 0).
+ */
+cram_container *cram_read_container(cram_fd *fd);
+
+/*! Writes a container structure.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_write_container(cram_fd *fd, cram_container *h);
+
+/*! Flushes a container to disk.
+ *
+ * Flushes a completely or partially full container to disk, writing
+ * container structure, header and blocks. This also calls the encoder
+ * functions.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_flush_container(cram_fd *fd, cram_container *c);
+int cram_flush_container_mt(cram_fd *fd, cram_container *c);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Compression headers; the first part of the container
+ */
+
+/*! Creates a new blank container compression header
+ *
+ * @return
+ * Returns header ptr on success;
+ *         NULL on failure
+ */
+cram_block_compression_hdr *cram_new_compression_header(void);
+
+/*! Frees a cram_block_compression_hdr */
+void cram_free_compression_header(cram_block_compression_hdr *hdr);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Slices and slice headers
+ */
+
+/*! Frees a slice header */
+void cram_free_slice_header(cram_block_slice_hdr *hdr);
+
+/*! Frees a slice */
+void cram_free_slice(cram_slice *s);
+
+/*! Creates a new empty slice in memory, for subsequent writing to
+ * disk.
+ *
+ * @return
+ * Returns cram_slice ptr on success;
+ *         NULL on failure
+ */
+cram_slice *cram_new_slice(enum cram_content_type type, int nrecs);
+
+/*! Loads an entire slice.
+ *
+ * FIXME: In 1.0 the native unit of slices within CRAM is broken
+ * as slices contain references to objects in other slices.
+ * To work around this while keeping the slice oriented outer loop
+ * we read all slices and stitch them together into a fake large
+ * slice instead.
+ *
+ * @return
+ * Returns cram_slice ptr on success;
+ *         NULL on failure
+ */
+cram_slice *cram_read_slice(cram_fd *fd);
+
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * CRAM file definition (header)
+ */
+
+/*! Reads a CRAM file definition structure.
+ *
+ * @return
+ * Returns file_def ptr on success;
+ *         NULL on failure
+ */
+cram_file_def *cram_read_file_def(cram_fd *fd);
+
+/*! Writes a cram_file_def structure to cram_fd.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_write_file_def(cram_fd *fd, cram_file_def *def);
+
+/*! Frees a cram_file_def structure. */
+void cram_free_file_def(cram_file_def *def);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * SAM header I/O
+ */
+
+/*! Reads the SAM header from the first CRAM data block.
+ *
+ * Also performs minimal parsing to extract read-group
+ * and sample information.
+ *
+ * @return
+ * Returns SAM hdr ptr on success;
+ *         NULL on failure
+ */
+SAM_hdr *cram_read_SAM_hdr(cram_fd *fd);
+
+/*! Writes a CRAM SAM header.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr);
+
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * The top-level cram opening, closing and option handling
+ */
+
+/*! Opens a CRAM file for read (mode "rb") or write ("wb").
+ *
+ * The filename may be "-" to indicate stdin or stdout.
+ *
+ * @return
+ * Returns file handle on success;
+ *         NULL on failure.
+ */
+cram_fd *cram_open(const char *filename, const char *mode);
+
+/*! Opens an existing stream for reading or writing.
+ *
+ * @return
+ * Returns file handle on success;
+ *         NULL on failure.
+ */
+cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
+
+/*! Closes a CRAM file.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_close(cram_fd *fd);
+
+/*
+ * Seek within a CRAM file.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_seek(cram_fd *fd, off_t offset, int whence);
+
+/*
+ * Flushes a CRAM file.
+ * Useful for when writing to stdout without wishing to close the stream.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_flush(cram_fd *fd);
+
+/*! Checks for end of file on a cram_fd stream.
+ *
+ * @return
+ * Returns 0 if not at end of file
+ *         1 if we hit an expected EOF (end of range or EOF block)
+ *         2 for other EOF (end of stream without EOF block)
+ */
+int cram_eof(cram_fd *fd);
+
+/*! Sets options on the cram_fd.
+ *
+ * See CRAM_OPT_* definitions in cram_structs.h.
+ * Use this immediately after opening.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_set_option(cram_fd *fd, enum cram_option opt, ...);
+
+/*! Sets options on the cram_fd.
+ *
+ * See CRAM_OPT_* definitions in cram_structs.h.
+ * Use this immediately after opening.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args);
+
+/*!
+ * Attaches a header to a cram_fd.
+ *
+ * This should be used when creating a new cram_fd for writing where
+ * we have an SAM_hdr already constructed (eg from a file we've read
+ * in).
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_set_header(cram_fd *fd, SAM_hdr *hdr);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CRAM_IO_H_ */