Mercurial > repos > youngkim > ezbamqc
diff ezBAMQC/src/htslib/cram/cram_structs.h @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ezBAMQC/src/htslib/cram/cram_structs.h Thu Mar 24 17:12:52 2016 -0400 @@ -0,0 +1,809 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. +Author: James Bonfield <jkb@sanger.ac.uk> + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _CRAM_STRUCTS_H_ +#define _CRAM_STRUCTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Defines in-memory structs for the basic file-format objects in the + * CRAM format. + * + * The basic file format is: + * File-def SAM-hdr Container Container ... + * + * Container: + * Service-block data-block data-block ... + * + * Multiple blocks in a container are grouped together as slices, + * also sometimes referred to as landmarks in the spec. + */ + + +#include <stdint.h> + +#include "cram/thread_pool.h" +#include "cram/string_alloc.h" +#include "htslib/khash.h" + +// Generic hash-map integer -> integer +KHASH_MAP_INIT_INT(m_i2i, int) + +// Generic hash-set integer -> (existance) +KHASH_SET_INIT_INT(s_i2i) + +// For brevity +typedef unsigned char uc; + +/* + * A union for the preservation map. Required for khash. + */ +typedef union { + int i; + char *p; +} pmap_t; + +// Generates static functions here which isn't ideal, but we have no way +// currently to declare the kh_map_t structure here without also declaring a +// duplicate in the .c files due to the nature of the KHASH macros. +KHASH_MAP_INIT_STR(map, pmap_t) + +struct hFILE; + +#define SEQS_PER_SLICE 10000 +#define SLICE_PER_CNT 1 + +#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" + +#define MAX_STAT_VAL 1024 +//#define MAX_STAT_VAL 16 +typedef struct { + int freqs[MAX_STAT_VAL]; + khash_t(m_i2i) *h; + int nsamp; // total number of values added + int nvals; // total number of unique values added +} cram_stats; + +/* NB: matches java impl, not the spec */ +enum cram_encoding { + E_NULL = 0, + E_EXTERNAL = 1, + E_GOLOMB = 2, + E_HUFFMAN = 3, + E_BYTE_ARRAY_LEN = 4, + E_BYTE_ARRAY_STOP = 5, + E_BETA = 6, + E_SUBEXP = 7, + E_GOLOMB_RICE = 8, + E_GAMMA = 9 +}; + +enum cram_external_type { + E_INT = 1, + E_LONG = 2, + E_BYTE = 3, + E_BYTE_ARRAY = 4, + E_BYTE_ARRAY_BLOCK = 5, +}; + +/* External IDs used by this implementation (only assumed during writing) */ +enum cram_DS_ID { + DS_CORE = 0, + DS_aux = 1, // aux_blk + DS_aux_OQ = 2, + DS_aux_BQ = 3, + DS_aux_BD = 4, + DS_aux_BI = 5, + DS_aux_FZ = 6, // also ZM:B + DS_aux_oq = 7, // other qualities + DS_aux_os = 8, // other sequences + DS_aux_oz = 9, // other strings + DS_ref, + DS_RN, // name_blk + DS_QS, // qual_blk + DS_IN, // base_blk + DS_SC, // soft_blk + + DS_BF, // start loop + DS_CF, + DS_AP, + DS_RG, + DS_MQ, + DS_NS, + DS_MF, + DS_TS, + DS_NP, + DS_NF, + DS_RL, + DS_FN, + DS_FC, + DS_FP, + DS_DL, + DS_BA, + DS_BS, + DS_TL, + DS_RI, + DS_RS, + DS_PD, + DS_HC, + DS_BB, + DS_QQ, + + DS_TN, // end loop + + DS_RN_len, + DS_SC_len, + DS_BB_len, + DS_QQ_len, + + DS_TC, // CRAM v1.0 tags + DS_TM, // test + DS_TV, // test + + DS_END, +}; + +/* "File Definition Structure" */ +typedef struct { + char magic[4]; + uint8_t major_version; + uint8_t minor_version; + char file_id[20]; // Filename or SHA1 checksum +} cram_file_def; + +#define CRAM_MAJOR_VERS(v) ((v) >> 8) +#define CRAM_MINOR_VERS(v) ((v) & 0xff) + +struct cram_slice; + +enum cram_block_method { + ERROR = -1, + RAW = 0, + GZIP = 1, + BZIP2 = 2, + LZMA = 3, + RANS = 4, // Generic; either order + RANS0 = 4, + RANS1 = 10, // Not externalised; stored as RANS (generic) + GZIP_RLE = 11, // NB: not externalised in CRAM +}; + +enum cram_content_type { + CT_ERROR = -1, + FILE_HEADER = 0, + COMPRESSION_HEADER = 1, + MAPPED_SLICE = 2, + UNMAPPED_SLICE = 3, // CRAM V1.0 only + EXTERNAL = 4, + CORE = 5, +}; + +/* Compression metrics */ +typedef struct { + // number of trials and time to next trial + int trial; + int next_trial; + + // aggregate sizes during trials + int sz_gz_rle; + int sz_gz_def; + int sz_rans0; + int sz_rans1; + int sz_bzip2; + int sz_lzma; + + // resultant method from trials + int method; + int strat; + + // Revisions of method, to allow culling of continually failing ones. + int gz_rle_cnt; + int gz_def_cnt; + int rans0_cnt; + int rans1_cnt; + int bzip2_cnt; + int lzma_cnt; + int revised_method; + + double gz_rle_extra; + double gz_def_extra; + double rans0_extra; + double rans1_extra; + double bzip2_extra; + double lzma_extra; +} cram_metrics; + +/* Block */ +typedef struct { + enum cram_block_method method, orig_method; + enum cram_content_type content_type; + int32_t content_id; + int32_t comp_size; + int32_t uncomp_size; + uint32_t crc32; + int32_t idx; /* offset into data */ + unsigned char *data; + + // For bit I/O + size_t alloc; + size_t byte; + int bit; +} cram_block; + +struct cram_codec; /* defined in cram_codecs.h */ +struct cram_map; + +#define CRAM_MAP_HASH 32 +#define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) + +/* Compression header block */ +typedef struct { + int32_t ref_seq_id; + int32_t ref_seq_start; + int32_t ref_seq_span; + int32_t num_records; + int32_t num_landmarks; + int32_t *landmark; + + /* Flags from preservation map */ + int mapped_qs_included; + int unmapped_qs_included; + int unmapped_placed; + int qs_included; + int read_names_included; + int AP_delta; + // indexed by ref-base and subst. code + char substitution_matrix[5][4]; + + // TD Dictionary as a concatenated block + cram_block *TD_blk; // Tag Dictionary + int nTL; // number of TL entries in TD + unsigned char **TL; // array of size nTL, pointer into TD_blk. + khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices + string_alloc_t *TD_keys; // Pooled keys for TD hash. + + khash_t(map) *preservation_map; + struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; + struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; + + struct cram_codec *codecs[DS_END]; + + char *uncomp; // A single block of uncompressed data + size_t uncomp_size, uncomp_alloc; + + unsigned int data_series; // See cram_fields enum below +} cram_block_compression_hdr; + +typedef struct cram_map { + int key; /* 0xe0 + 3 bytes */ + enum cram_encoding encoding; + int offset; /* Offset into a single block of memory */ + int size; /* Size */ + struct cram_codec *codec; + struct cram_map *next; // for noddy internal hash +} cram_map; + +/* Mapped or unmapped slice header block */ +typedef struct { + enum cram_content_type content_type; + int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ + int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ + int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ + int32_t num_records; + int64_t record_counter; + int32_t num_blocks; + int32_t num_content_ids; + int32_t *block_content_ids; + int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ + unsigned char md5[16]; +} cram_block_slice_hdr; + +struct ref_entry; + +/* + * Container. + * + * Conceptually a container is split into slices, and slices into blocks. + * However on disk it's just a list of blocks and we need to query the + * block types to identify the start/end points of the slices. + * + * OR... are landmarks the start/end points of slices? + */ +typedef struct { + int32_t length; + int32_t ref_seq_id; + int32_t ref_seq_start; + int32_t ref_seq_span; + int64_t record_counter; + int64_t num_bases; + int32_t num_records; + int32_t num_blocks; + int32_t num_landmarks; + int32_t *landmark; + + /* Size of container header above */ + size_t offset; + + /* Compression header is always the first block? */ + cram_block_compression_hdr *comp_hdr; + cram_block *comp_hdr_block; + + /* For construction purposes */ + int max_slice, curr_slice; // maximum number of slices + int max_rec, curr_rec; // current and max recs per slice + int max_c_rec, curr_c_rec; // current and max recs per container + int slice_rec; // rec no. for start of this slice + int curr_ref; // current ref ID. -2 for no previous + int last_pos; // last record position + struct cram_slice **slices, *slice; + int pos_sorted; // boolean, 1=>position sorted data + int max_apos; // maximum position, used if pos_sorted==0 + int last_slice; // number of reads in last slice (0 for 1st) + int multi_seq; // true if packing multi seqs per cont/slice + int unsorted; // true is AP_delta is 0. + + /* Copied from fd before encoding, to allow multi-threading */ + int ref_start, first_base, last_base, ref_id, ref_end; + char *ref; + //struct ref_entry *ref; + + /* For multi-threading */ + bam_seq_t **bams; + + /* Statistics for encoding */ + cram_stats *stats[DS_END]; + + khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map + int *refs_used; // array of frequency of ref seq IDs + + uint32_t crc32; // CRC32 +} cram_container; + +/* + * A single cram record + */ +typedef struct { + struct cram_slice *s; // Filled out by cram_decode only + + int32_t ref_id; // fixed for all recs in slice? + int32_t flags; // BF + int32_t cram_flags; // CF + int32_t len; // RL + int32_t apos; // AP + int32_t rg; // RG + int32_t name; // RN; idx to s->names_blk + int32_t name_len; + int32_t mate_line; // index to another cram_record + int32_t mate_ref_id; + int32_t mate_pos; // NP + int32_t tlen; // TS + + // Auxiliary data + int32_t ntags; // TC + int32_t aux; // idx to s->aux_blk + int32_t aux_size; // total size of packed ntags in aux_blk +#ifndef TN_external + int32_t TN_idx; // TN; idx to s->TN; +#else + int32_t tn; // idx to s->tn_blk +#endif + int TL; + + int32_t seq; // idx to s->seqs_blk + int32_t qual; // idx to s->qual_blk + int32_t cigar; // idx to s->cigar + int32_t ncigar; + int32_t aend; // alignment end + int32_t mqual; // MQ + + int32_t feature; // idx to s->feature + int32_t nfeature; // number of features + int32_t mate_flags; // MF +} cram_record; + +// Accessor macros as an analogue of the bam ones +#define cram_qname(c) (&(c)->s->name_blk->data[(c)->name]) +#define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq]) +#define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual]) +#define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux]) +#define cram_seqi(c,i) (cram_seq((c))[(i)]) +#define cram_name_len(c) ((c)->name_len) +#define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0) +#define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0) +#define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar]) + +/* + * A feature is a base difference, used for the sequence reference encoding. + * (We generate these internally when writing CRAM.) + */ +typedef struct { + union { + struct { + int pos; + int code; + int base; // substitution code + } X; + struct { + int pos; + int code; + int base; // actual base & qual + int qual; + } B; + struct { + int pos; + int code; + int seq_idx; // index to s->seqs_blk + int len; + } b; + struct { + int pos; + int code; + int qual; + } Q; + struct { + int pos; + int code; + int len; + int seq_idx; // soft-clip multiple bases + } S; + struct { + int pos; + int code; + int len; + int seq_idx; // insertion multiple bases + } I; + struct { + int pos; + int code; + int base; // insertion single base + } i; + struct { + int pos; + int code; + int len; + } D; + struct { + int pos; + int code; + int len; + } N; + struct { + int pos; + int code; + int len; + } P; + struct { + int pos; + int code; + int len; + } H; + }; +} cram_feature; + +/* + * A slice is really just a set of blocks, but it + * is the logical unit for decoding a number of + * sequences. + */ +typedef struct cram_slice { + cram_block_slice_hdr *hdr; + cram_block *hdr_block; + cram_block **block; + cram_block **block_by_id; + + /* State used during encoding/decoding */ + int last_apos, max_apos; + + /* Array of decoded cram records */ + cram_record *crecs; + + /* An dynamically growing buffers for data pointed + * to by crecs[] array. + */ + uint32_t *cigar; + uint32_t cigar_alloc; + uint32_t ncigar; + + cram_feature *features; + int nfeatures; + int afeatures; // allocated size of features + +#ifndef TN_external + // TN field (Tag Name) + uint32_t *TN; + int nTN, aTN; // used and allocated size for TN[] +#else + cram_block *tn_blk; + int tn_id; +#endif + + // For variable sized elements which are always external blocks. + cram_block *name_blk; + cram_block *seqs_blk; + cram_block *qual_blk; + cram_block *base_blk; + cram_block *soft_blk; + cram_block *aux_blk; + cram_block *aux_OQ_blk; + cram_block *aux_BQ_blk; + cram_block *aux_BD_blk; + cram_block *aux_BI_blk; + cram_block *aux_FZ_blk; + cram_block *aux_oq_blk; + cram_block *aux_os_blk; + cram_block *aux_oz_blk; + + string_alloc_t *pair_keys; // Pooled keys for pair hash. + khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice. + + char *ref; // slice of current reference + int ref_start; // start position of current reference; + int ref_end; // end position of current reference; + int ref_id; +} cram_slice; + +/*----------------------------------------------------------------------------- + * Consider moving reference handling to cram_refs.[ch] + */ +// from fa.fai / samtools faidx files +typedef struct ref_entry { + char *name; + char *fn; + int64_t length; + int64_t offset; + int bases_per_line; + int line_length; + int64_t count; // for shared references so we know to dealloc seq + char *seq; +} ref_entry; + +KHASH_MAP_INIT_STR(refs, ref_entry*) + +// References structure. +typedef struct { + string_alloc_t *pool; // String pool for holding filenames and SN vals + + khash_t(refs) *h_meta; // ref_entry*, index by name + ref_entry **ref_id; // ref_entry*, index by ID + int nref; // number of ref_entry + + char *fn; // current file opened + BGZF *fp; // and the hFILE* to go with it. + + int count; // how many cram_fd sharing this refs struct + + pthread_mutex_t lock; // Mutex for multi-threaded updating + ref_entry *last; // Last queried sequence + int last_id; // Used in cram_ref_decr_locked to delay free +} refs_t; + +/*----------------------------------------------------------------------------- + * CRAM index + * + * Detect format by number of entries per line. + * 5 => 1.0 (refid, start, nseq, C offset, slice) + * 6 => 1.1 (refid, start, span, C offset, S offset, S size) + * + * Indices are stored in a nested containment list, which is trivial to set + * up as the indices are on sorted data so we're appending to the nclist + * in sorted order. Basically if a slice entirely fits within a previous + * slice then we append to that slices list. This is done recursively. + * + * Lists are sorted on two dimensions: ref id + slice coords. + */ +typedef struct cram_index { + int nslice, nalloc; // total number of slices + struct cram_index *e; // array of size nslice + + int refid; // 1.0 1.1 + int start; // 1.0 1.1 + int end; // 1.1 + int nseq; // 1.0 - undocumented + int slice; // 1.0 landmark index, 1.1 landmark value + int len; // 1.1 - size of slice in bytes + int64_t offset; // 1.0 1.1 +} cram_index; + +typedef struct { + int refid; + int start; + int end; +} cram_range; + +/*----------------------------------------------------------------------------- + */ +/* CRAM File handle */ + +typedef struct spare_bams { + bam_seq_t **bams; + struct spare_bams *next; +} spare_bams; + +typedef struct cram_fd { + struct hFILE *fp; + int mode; // 'r' or 'w' + int version; + cram_file_def *file_def; + SAM_hdr *header; + + char *prefix; + int64_t record_counter; + int err; + + // Most recent compression header decoded + //cram_block_compression_hdr *comp_hdr; + //cram_block_slice_hdr *slice_hdr; + + // Current container being processed. + cram_container *ctr; + + // positions for encoding or decoding + int first_base, last_base; + + // cached reference portion + refs_t *refs; // ref meta-data structure + char *ref, *ref_free; // current portion held in memory + int ref_id; + int ref_start; + int ref_end; + char *ref_fn; // reference fasta filename + + // compression level and metrics + int level; + cram_metrics *m[DS_END]; + + // options + int decode_md; // Whether to export MD and NM tags + int verbose; + int seqs_per_slice; + int slices_per_container; + int embed_ref; + int no_ref; + int ignore_md5; + int use_bz2; + int use_rans; + int use_lzma; + int shared_ref; + unsigned int required_fields; + cram_range range; + + // lookup tables, stored here so we can be trivially multi-threaded + unsigned int bam_flag_swap[0x1000]; // cram -> bam flags + unsigned int cram_flag_swap[0x1000];// bam -> cram flags + unsigned char L1[256]; // ACGT{*} ->0123{4} + unsigned char L2[256]; // ACGTN{*}->01234{5} + char cram_sub_matrix[32][32]; // base substituion codes + + int index_sz; + cram_index *index; // array, sizeof index_sz + off_t first_container; + int eof; + int last_slice; // number of recs encoded in last slice + int multi_seq; + int unsorted; + int empty_container; // Marker for EOF block + + // thread pool + int own_pool; + t_pool *pool; + t_results_queue *rqueue; + pthread_mutex_t metrics_lock; + pthread_mutex_t ref_lock; + spare_bams *bl; + pthread_mutex_t bam_list_lock; + void *job_pending; + int ooc; // out of containers. +} cram_fd; + +// Translation of required fields to cram data series +enum cram_fields { + CRAM_BF = 0x00000001, + CRAM_AP = 0x00000002, + CRAM_FP = 0x00000004, + CRAM_RL = 0x00000008, + CRAM_DL = 0x00000010, + CRAM_NF = 0x00000020, + CRAM_BA = 0x00000040, + CRAM_QS = 0x00000080, + CRAM_FC = 0x00000100, + CRAM_FN = 0x00000200, + CRAM_BS = 0x00000400, + CRAM_IN = 0x00000800, + CRAM_RG = 0x00001000, + CRAM_MQ = 0x00002000, + CRAM_TL = 0x00004000, + CRAM_RN = 0x00008000, + CRAM_NS = 0x00010000, + CRAM_NP = 0x00020000, + CRAM_TS = 0x00040000, + CRAM_MF = 0x00080000, + CRAM_CF = 0x00100000, + CRAM_RI = 0x00200000, + CRAM_RS = 0x00400000, + CRAM_PD = 0x00800000, + CRAM_HC = 0x01000000, + CRAM_SC = 0x02000000, + CRAM_BB = 0x04000000, + CRAM_BB_len = 0x08000000, + CRAM_QQ = 0x10000000, + CRAM_QQ_len = 0x20000000, + CRAM_aux= 0x40000000, + CRAM_ALL= 0x7fffffff, +}; + +// A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may +// encode a base difference, but we don't need to know what it is for CIGAR. +// If we have a soft-clip or insertion, we do need SC/IN though to know how +// long that array is. +#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ + CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) + +#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \ + CRAM_RL | CRAM_AP | CRAM_BB | CRAM_QQ) + +/* BF bitfields */ +/* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ +#define CRAM_FPAIRED 256 +#define CRAM_FPROPER_PAIR 128 +#define CRAM_FUNMAP 64 +#define CRAM_FREVERSE 32 +#define CRAM_FREAD1 16 +#define CRAM_FREAD2 8 +#define CRAM_FSECONDARY 4 +#define CRAM_FQCFAIL 2 +#define CRAM_FDUP 1 + +#define DS_aux_S "\001" +#define DS_aux_OQ_S "\002" +#define DS_aux_BQ_S "\003" +#define DS_aux_BD_S "\004" +#define DS_aux_BI_S "\005" +#define DS_aux_FZ_S "\006" +#define DS_aux_oq_S "\007" +#define DS_aux_os_S "\010" +#define DS_aux_oz_S "\011" + +#define CRAM_M_REVERSE 1 +#define CRAM_M_UNMAP 2 + + +/* CF bitfields */ +#define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) +#define CRAM_FLAG_DETACHED (1<<1) +#define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) + +#ifdef __cplusplus +} +#endif + +#endif /* _CRAM_STRUCTS_H_ */