Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_structs.h @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* | |
| 2 Copyright (c) 2012-2013 Genome Research Ltd. | |
| 3 Author: James Bonfield <jkb@sanger.ac.uk> | |
| 4 | |
| 5 Redistribution and use in source and binary forms, with or without | |
| 6 modification, are permitted provided that the following conditions are met: | |
| 7 | |
| 8 1. Redistributions of source code must retain the above copyright notice, | |
| 9 this list of conditions and the following disclaimer. | |
| 10 | |
| 11 2. Redistributions in binary form must reproduce the above copyright notice, | |
| 12 this list of conditions and the following disclaimer in the documentation | |
| 13 and/or other materials provided with the distribution. | |
| 14 | |
| 15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
| 16 Institute nor the names of its contributors may be used to endorse or promote | |
| 17 products derived from this software without specific prior written permission. | |
| 18 | |
| 19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
| 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
| 23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 */ | |
| 30 | |
| 31 #ifndef _CRAM_STRUCTS_H_ | |
| 32 #define _CRAM_STRUCTS_H_ | |
| 33 | |
| 34 #ifdef __cplusplus | |
| 35 extern "C" { | |
| 36 #endif | |
| 37 | |
| 38 /* | |
| 39 * Defines in-memory structs for the basic file-format objects in the | |
| 40 * CRAM format. | |
| 41 * | |
| 42 * The basic file format is: | |
| 43 * File-def SAM-hdr Container Container ... | |
| 44 * | |
| 45 * Container: | |
| 46 * Service-block data-block data-block ... | |
| 47 * | |
| 48 * Multiple blocks in a container are grouped together as slices, | |
| 49 * also sometimes referred to as landmarks in the spec. | |
| 50 */ | |
| 51 | |
| 52 | |
| 53 #include <stdint.h> | |
| 54 | |
| 55 #include "cram/thread_pool.h" | |
| 56 #include "cram/string_alloc.h" | |
| 57 #include "htslib/khash.h" | |
| 58 | |
| 59 // Generic hash-map integer -> integer | |
| 60 KHASH_MAP_INIT_INT(m_i2i, int) | |
| 61 | |
| 62 // Generic hash-set integer -> (existance) | |
| 63 KHASH_SET_INIT_INT(s_i2i) | |
| 64 | |
| 65 // For brevity | |
| 66 typedef unsigned char uc; | |
| 67 | |
| 68 /* | |
| 69 * A union for the preservation map. Required for khash. | |
| 70 */ | |
| 71 typedef union { | |
| 72 int i; | |
| 73 char *p; | |
| 74 } pmap_t; | |
| 75 | |
| 76 // Generates static functions here which isn't ideal, but we have no way | |
| 77 // currently to declare the kh_map_t structure here without also declaring a | |
| 78 // duplicate in the .c files due to the nature of the KHASH macros. | |
| 79 KHASH_MAP_INIT_STR(map, pmap_t) | |
| 80 | |
| 81 struct hFILE; | |
| 82 | |
| 83 #define SEQS_PER_SLICE 10000 | |
| 84 #define SLICE_PER_CNT 1 | |
| 85 | |
| 86 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" | |
| 87 | |
| 88 #define MAX_STAT_VAL 1024 | |
| 89 //#define MAX_STAT_VAL 16 | |
| 90 typedef struct { | |
| 91 int freqs[MAX_STAT_VAL]; | |
| 92 khash_t(m_i2i) *h; | |
| 93 int nsamp; // total number of values added | |
| 94 int nvals; // total number of unique values added | |
| 95 } cram_stats; | |
| 96 | |
| 97 /* NB: matches java impl, not the spec */ | |
| 98 enum cram_encoding { | |
| 99 E_NULL = 0, | |
| 100 E_EXTERNAL = 1, | |
| 101 E_GOLOMB = 2, | |
| 102 E_HUFFMAN = 3, | |
| 103 E_BYTE_ARRAY_LEN = 4, | |
| 104 E_BYTE_ARRAY_STOP = 5, | |
| 105 E_BETA = 6, | |
| 106 E_SUBEXP = 7, | |
| 107 E_GOLOMB_RICE = 8, | |
| 108 E_GAMMA = 9 | |
| 109 }; | |
| 110 | |
| 111 enum cram_external_type { | |
| 112 E_INT = 1, | |
| 113 E_LONG = 2, | |
| 114 E_BYTE = 3, | |
| 115 E_BYTE_ARRAY = 4, | |
| 116 E_BYTE_ARRAY_BLOCK = 5, | |
| 117 }; | |
| 118 | |
| 119 /* External IDs used by this implementation (only assumed during writing) */ | |
| 120 enum cram_DS_ID { | |
| 121 DS_CORE = 0, | |
| 122 DS_aux = 1, // aux_blk | |
| 123 DS_aux_OQ = 2, | |
| 124 DS_aux_BQ = 3, | |
| 125 DS_aux_BD = 4, | |
| 126 DS_aux_BI = 5, | |
| 127 DS_aux_FZ = 6, // also ZM:B | |
| 128 DS_aux_oq = 7, // other qualities | |
| 129 DS_aux_os = 8, // other sequences | |
| 130 DS_aux_oz = 9, // other strings | |
| 131 DS_ref, | |
| 132 DS_RN, // name_blk | |
| 133 DS_QS, // qual_blk | |
| 134 DS_IN, // base_blk | |
| 135 DS_SC, // soft_blk | |
| 136 | |
| 137 DS_BF, // start loop | |
| 138 DS_CF, | |
| 139 DS_AP, | |
| 140 DS_RG, | |
| 141 DS_MQ, | |
| 142 DS_NS, | |
| 143 DS_MF, | |
| 144 DS_TS, | |
| 145 DS_NP, | |
| 146 DS_NF, | |
| 147 DS_RL, | |
| 148 DS_FN, | |
| 149 DS_FC, | |
| 150 DS_FP, | |
| 151 DS_DL, | |
| 152 DS_BA, | |
| 153 DS_BS, | |
| 154 DS_TL, | |
| 155 DS_RI, | |
| 156 DS_RS, | |
| 157 DS_PD, | |
| 158 DS_HC, | |
| 159 DS_BB, | |
| 160 DS_QQ, | |
| 161 | |
| 162 DS_TN, // end loop | |
| 163 | |
| 164 DS_RN_len, | |
| 165 DS_SC_len, | |
| 166 DS_BB_len, | |
| 167 DS_QQ_len, | |
| 168 | |
| 169 DS_TC, // CRAM v1.0 tags | |
| 170 DS_TM, // test | |
| 171 DS_TV, // test | |
| 172 | |
| 173 DS_END, | |
| 174 }; | |
| 175 | |
| 176 /* "File Definition Structure" */ | |
| 177 typedef struct { | |
| 178 char magic[4]; | |
| 179 uint8_t major_version; | |
| 180 uint8_t minor_version; | |
| 181 char file_id[20]; // Filename or SHA1 checksum | |
| 182 } cram_file_def; | |
| 183 | |
| 184 #define CRAM_MAJOR_VERS(v) ((v) >> 8) | |
| 185 #define CRAM_MINOR_VERS(v) ((v) & 0xff) | |
| 186 | |
| 187 struct cram_slice; | |
| 188 | |
| 189 enum cram_block_method { | |
| 190 ERROR = -1, | |
| 191 RAW = 0, | |
| 192 GZIP = 1, | |
| 193 BZIP2 = 2, | |
| 194 LZMA = 3, | |
| 195 RANS = 4, // Generic; either order | |
| 196 RANS0 = 4, | |
| 197 RANS1 = 10, // Not externalised; stored as RANS (generic) | |
| 198 GZIP_RLE = 11, // NB: not externalised in CRAM | |
| 199 }; | |
| 200 | |
| 201 enum cram_content_type { | |
| 202 CT_ERROR = -1, | |
| 203 FILE_HEADER = 0, | |
| 204 COMPRESSION_HEADER = 1, | |
| 205 MAPPED_SLICE = 2, | |
| 206 UNMAPPED_SLICE = 3, // CRAM V1.0 only | |
| 207 EXTERNAL = 4, | |
| 208 CORE = 5, | |
| 209 }; | |
| 210 | |
| 211 /* Compression metrics */ | |
| 212 typedef struct { | |
| 213 // number of trials and time to next trial | |
| 214 int trial; | |
| 215 int next_trial; | |
| 216 | |
| 217 // aggregate sizes during trials | |
| 218 int sz_gz_rle; | |
| 219 int sz_gz_def; | |
| 220 int sz_rans0; | |
| 221 int sz_rans1; | |
| 222 int sz_bzip2; | |
| 223 int sz_lzma; | |
| 224 | |
| 225 // resultant method from trials | |
| 226 int method; | |
| 227 int strat; | |
| 228 | |
| 229 // Revisions of method, to allow culling of continually failing ones. | |
| 230 int gz_rle_cnt; | |
| 231 int gz_def_cnt; | |
| 232 int rans0_cnt; | |
| 233 int rans1_cnt; | |
| 234 int bzip2_cnt; | |
| 235 int lzma_cnt; | |
| 236 int revised_method; | |
| 237 | |
| 238 double gz_rle_extra; | |
| 239 double gz_def_extra; | |
| 240 double rans0_extra; | |
| 241 double rans1_extra; | |
| 242 double bzip2_extra; | |
| 243 double lzma_extra; | |
| 244 } cram_metrics; | |
| 245 | |
| 246 /* Block */ | |
| 247 typedef struct { | |
| 248 enum cram_block_method method, orig_method; | |
| 249 enum cram_content_type content_type; | |
| 250 int32_t content_id; | |
| 251 int32_t comp_size; | |
| 252 int32_t uncomp_size; | |
| 253 uint32_t crc32; | |
| 254 int32_t idx; /* offset into data */ | |
| 255 unsigned char *data; | |
| 256 | |
| 257 // For bit I/O | |
| 258 size_t alloc; | |
| 259 size_t byte; | |
| 260 int bit; | |
| 261 } cram_block; | |
| 262 | |
| 263 struct cram_codec; /* defined in cram_codecs.h */ | |
| 264 struct cram_map; | |
| 265 | |
| 266 #define CRAM_MAP_HASH 32 | |
| 267 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) | |
| 268 | |
| 269 /* Compression header block */ | |
| 270 typedef struct { | |
| 271 int32_t ref_seq_id; | |
| 272 int32_t ref_seq_start; | |
| 273 int32_t ref_seq_span; | |
| 274 int32_t num_records; | |
| 275 int32_t num_landmarks; | |
| 276 int32_t *landmark; | |
| 277 | |
| 278 /* Flags from preservation map */ | |
| 279 int mapped_qs_included; | |
| 280 int unmapped_qs_included; | |
| 281 int unmapped_placed; | |
| 282 int qs_included; | |
| 283 int read_names_included; | |
| 284 int AP_delta; | |
| 285 // indexed by ref-base and subst. code | |
| 286 char substitution_matrix[5][4]; | |
| 287 | |
| 288 // TD Dictionary as a concatenated block | |
| 289 cram_block *TD_blk; // Tag Dictionary | |
| 290 int nTL; // number of TL entries in TD | |
| 291 unsigned char **TL; // array of size nTL, pointer into TD_blk. | |
| 292 khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices | |
| 293 string_alloc_t *TD_keys; // Pooled keys for TD hash. | |
| 294 | |
| 295 khash_t(map) *preservation_map; | |
| 296 struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; | |
| 297 struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; | |
| 298 | |
| 299 struct cram_codec *codecs[DS_END]; | |
| 300 | |
| 301 char *uncomp; // A single block of uncompressed data | |
| 302 size_t uncomp_size, uncomp_alloc; | |
| 303 | |
| 304 unsigned int data_series; // See cram_fields enum below | |
| 305 } cram_block_compression_hdr; | |
| 306 | |
| 307 typedef struct cram_map { | |
| 308 int key; /* 0xe0 + 3 bytes */ | |
| 309 enum cram_encoding encoding; | |
| 310 int offset; /* Offset into a single block of memory */ | |
| 311 int size; /* Size */ | |
| 312 struct cram_codec *codec; | |
| 313 struct cram_map *next; // for noddy internal hash | |
| 314 } cram_map; | |
| 315 | |
| 316 /* Mapped or unmapped slice header block */ | |
| 317 typedef struct { | |
| 318 enum cram_content_type content_type; | |
| 319 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ | |
| 320 int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ | |
| 321 int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ | |
| 322 int32_t num_records; | |
| 323 int64_t record_counter; | |
| 324 int32_t num_blocks; | |
| 325 int32_t num_content_ids; | |
| 326 int32_t *block_content_ids; | |
| 327 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ | |
| 328 unsigned char md5[16]; | |
| 329 } cram_block_slice_hdr; | |
| 330 | |
| 331 struct ref_entry; | |
| 332 | |
| 333 /* | |
| 334 * Container. | |
| 335 * | |
| 336 * Conceptually a container is split into slices, and slices into blocks. | |
| 337 * However on disk it's just a list of blocks and we need to query the | |
| 338 * block types to identify the start/end points of the slices. | |
| 339 * | |
| 340 * OR... are landmarks the start/end points of slices? | |
| 341 */ | |
| 342 typedef struct { | |
| 343 int32_t length; | |
| 344 int32_t ref_seq_id; | |
| 345 int32_t ref_seq_start; | |
| 346 int32_t ref_seq_span; | |
| 347 int64_t record_counter; | |
| 348 int64_t num_bases; | |
| 349 int32_t num_records; | |
| 350 int32_t num_blocks; | |
| 351 int32_t num_landmarks; | |
| 352 int32_t *landmark; | |
| 353 | |
| 354 /* Size of container header above */ | |
| 355 size_t offset; | |
| 356 | |
| 357 /* Compression header is always the first block? */ | |
| 358 cram_block_compression_hdr *comp_hdr; | |
| 359 cram_block *comp_hdr_block; | |
| 360 | |
| 361 /* For construction purposes */ | |
| 362 int max_slice, curr_slice; // maximum number of slices | |
| 363 int max_rec, curr_rec; // current and max recs per slice | |
| 364 int max_c_rec, curr_c_rec; // current and max recs per container | |
| 365 int slice_rec; // rec no. for start of this slice | |
| 366 int curr_ref; // current ref ID. -2 for no previous | |
| 367 int last_pos; // last record position | |
| 368 struct cram_slice **slices, *slice; | |
| 369 int pos_sorted; // boolean, 1=>position sorted data | |
| 370 int max_apos; // maximum position, used if pos_sorted==0 | |
| 371 int last_slice; // number of reads in last slice (0 for 1st) | |
| 372 int multi_seq; // true if packing multi seqs per cont/slice | |
| 373 int unsorted; // true is AP_delta is 0. | |
| 374 | |
| 375 /* Copied from fd before encoding, to allow multi-threading */ | |
| 376 int ref_start, first_base, last_base, ref_id, ref_end; | |
| 377 char *ref; | |
| 378 //struct ref_entry *ref; | |
| 379 | |
| 380 /* For multi-threading */ | |
| 381 bam_seq_t **bams; | |
| 382 | |
| 383 /* Statistics for encoding */ | |
| 384 cram_stats *stats[DS_END]; | |
| 385 | |
| 386 khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map | |
| 387 int *refs_used; // array of frequency of ref seq IDs | |
| 388 | |
| 389 uint32_t crc32; // CRC32 | |
| 390 } cram_container; | |
| 391 | |
| 392 /* | |
| 393 * A single cram record | |
| 394 */ | |
| 395 typedef struct { | |
| 396 struct cram_slice *s; // Filled out by cram_decode only | |
| 397 | |
| 398 int32_t ref_id; // fixed for all recs in slice? | |
| 399 int32_t flags; // BF | |
| 400 int32_t cram_flags; // CF | |
| 401 int32_t len; // RL | |
| 402 int32_t apos; // AP | |
| 403 int32_t rg; // RG | |
| 404 int32_t name; // RN; idx to s->names_blk | |
| 405 int32_t name_len; | |
| 406 int32_t mate_line; // index to another cram_record | |
| 407 int32_t mate_ref_id; | |
| 408 int32_t mate_pos; // NP | |
| 409 int32_t tlen; // TS | |
| 410 | |
| 411 // Auxiliary data | |
| 412 int32_t ntags; // TC | |
| 413 int32_t aux; // idx to s->aux_blk | |
| 414 int32_t aux_size; // total size of packed ntags in aux_blk | |
| 415 #ifndef TN_external | |
| 416 int32_t TN_idx; // TN; idx to s->TN; | |
| 417 #else | |
| 418 int32_t tn; // idx to s->tn_blk | |
| 419 #endif | |
| 420 int TL; | |
| 421 | |
| 422 int32_t seq; // idx to s->seqs_blk | |
| 423 int32_t qual; // idx to s->qual_blk | |
| 424 int32_t cigar; // idx to s->cigar | |
| 425 int32_t ncigar; | |
| 426 int32_t aend; // alignment end | |
| 427 int32_t mqual; // MQ | |
| 428 | |
| 429 int32_t feature; // idx to s->feature | |
| 430 int32_t nfeature; // number of features | |
| 431 int32_t mate_flags; // MF | |
| 432 } cram_record; | |
| 433 | |
| 434 // Accessor macros as an analogue of the bam ones | |
| 435 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name]) | |
| 436 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq]) | |
| 437 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual]) | |
| 438 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux]) | |
| 439 #define cram_seqi(c,i) (cram_seq((c))[(i)]) | |
| 440 #define cram_name_len(c) ((c)->name_len) | |
| 441 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0) | |
| 442 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0) | |
| 443 #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar]) | |
| 444 | |
| 445 /* | |
| 446 * A feature is a base difference, used for the sequence reference encoding. | |
| 447 * (We generate these internally when writing CRAM.) | |
| 448 */ | |
| 449 typedef struct { | |
| 450 union { | |
| 451 struct { | |
| 452 int pos; | |
| 453 int code; | |
| 454 int base; // substitution code | |
| 455 } X; | |
| 456 struct { | |
| 457 int pos; | |
| 458 int code; | |
| 459 int base; // actual base & qual | |
| 460 int qual; | |
| 461 } B; | |
| 462 struct { | |
| 463 int pos; | |
| 464 int code; | |
| 465 int seq_idx; // index to s->seqs_blk | |
| 466 int len; | |
| 467 } b; | |
| 468 struct { | |
| 469 int pos; | |
| 470 int code; | |
| 471 int qual; | |
| 472 } Q; | |
| 473 struct { | |
| 474 int pos; | |
| 475 int code; | |
| 476 int len; | |
| 477 int seq_idx; // soft-clip multiple bases | |
| 478 } S; | |
| 479 struct { | |
| 480 int pos; | |
| 481 int code; | |
| 482 int len; | |
| 483 int seq_idx; // insertion multiple bases | |
| 484 } I; | |
| 485 struct { | |
| 486 int pos; | |
| 487 int code; | |
| 488 int base; // insertion single base | |
| 489 } i; | |
| 490 struct { | |
| 491 int pos; | |
| 492 int code; | |
| 493 int len; | |
| 494 } D; | |
| 495 struct { | |
| 496 int pos; | |
| 497 int code; | |
| 498 int len; | |
| 499 } N; | |
| 500 struct { | |
| 501 int pos; | |
| 502 int code; | |
| 503 int len; | |
| 504 } P; | |
| 505 struct { | |
| 506 int pos; | |
| 507 int code; | |
| 508 int len; | |
| 509 } H; | |
| 510 }; | |
| 511 } cram_feature; | |
| 512 | |
| 513 /* | |
| 514 * A slice is really just a set of blocks, but it | |
| 515 * is the logical unit for decoding a number of | |
| 516 * sequences. | |
| 517 */ | |
| 518 typedef struct cram_slice { | |
| 519 cram_block_slice_hdr *hdr; | |
| 520 cram_block *hdr_block; | |
| 521 cram_block **block; | |
| 522 cram_block **block_by_id; | |
| 523 | |
| 524 /* State used during encoding/decoding */ | |
| 525 int last_apos, max_apos; | |
| 526 | |
| 527 /* Array of decoded cram records */ | |
| 528 cram_record *crecs; | |
| 529 | |
| 530 /* An dynamically growing buffers for data pointed | |
| 531 * to by crecs[] array. | |
| 532 */ | |
| 533 uint32_t *cigar; | |
| 534 uint32_t cigar_alloc; | |
| 535 uint32_t ncigar; | |
| 536 | |
| 537 cram_feature *features; | |
| 538 int nfeatures; | |
| 539 int afeatures; // allocated size of features | |
| 540 | |
| 541 #ifndef TN_external | |
| 542 // TN field (Tag Name) | |
| 543 uint32_t *TN; | |
| 544 int nTN, aTN; // used and allocated size for TN[] | |
| 545 #else | |
| 546 cram_block *tn_blk; | |
| 547 int tn_id; | |
| 548 #endif | |
| 549 | |
| 550 // For variable sized elements which are always external blocks. | |
| 551 cram_block *name_blk; | |
| 552 cram_block *seqs_blk; | |
| 553 cram_block *qual_blk; | |
| 554 cram_block *base_blk; | |
| 555 cram_block *soft_blk; | |
| 556 cram_block *aux_blk; | |
| 557 cram_block *aux_OQ_blk; | |
| 558 cram_block *aux_BQ_blk; | |
| 559 cram_block *aux_BD_blk; | |
| 560 cram_block *aux_BI_blk; | |
| 561 cram_block *aux_FZ_blk; | |
| 562 cram_block *aux_oq_blk; | |
| 563 cram_block *aux_os_blk; | |
| 564 cram_block *aux_oz_blk; | |
| 565 | |
| 566 string_alloc_t *pair_keys; // Pooled keys for pair hash. | |
| 567 khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice. | |
| 568 | |
| 569 char *ref; // slice of current reference | |
| 570 int ref_start; // start position of current reference; | |
| 571 int ref_end; // end position of current reference; | |
| 572 int ref_id; | |
| 573 } cram_slice; | |
| 574 | |
| 575 /*----------------------------------------------------------------------------- | |
| 576 * Consider moving reference handling to cram_refs.[ch] | |
| 577 */ | |
| 578 // from fa.fai / samtools faidx files | |
| 579 typedef struct ref_entry { | |
| 580 char *name; | |
| 581 char *fn; | |
| 582 int64_t length; | |
| 583 int64_t offset; | |
| 584 int bases_per_line; | |
| 585 int line_length; | |
| 586 int64_t count; // for shared references so we know to dealloc seq | |
| 587 char *seq; | |
| 588 } ref_entry; | |
| 589 | |
| 590 KHASH_MAP_INIT_STR(refs, ref_entry*) | |
| 591 | |
| 592 // References structure. | |
| 593 typedef struct { | |
| 594 string_alloc_t *pool; // String pool for holding filenames and SN vals | |
| 595 | |
| 596 khash_t(refs) *h_meta; // ref_entry*, index by name | |
| 597 ref_entry **ref_id; // ref_entry*, index by ID | |
| 598 int nref; // number of ref_entry | |
| 599 | |
| 600 char *fn; // current file opened | |
| 601 BGZF *fp; // and the hFILE* to go with it. | |
| 602 | |
| 603 int count; // how many cram_fd sharing this refs struct | |
| 604 | |
| 605 pthread_mutex_t lock; // Mutex for multi-threaded updating | |
| 606 ref_entry *last; // Last queried sequence | |
| 607 int last_id; // Used in cram_ref_decr_locked to delay free | |
| 608 } refs_t; | |
| 609 | |
| 610 /*----------------------------------------------------------------------------- | |
| 611 * CRAM index | |
| 612 * | |
| 613 * Detect format by number of entries per line. | |
| 614 * 5 => 1.0 (refid, start, nseq, C offset, slice) | |
| 615 * 6 => 1.1 (refid, start, span, C offset, S offset, S size) | |
| 616 * | |
| 617 * Indices are stored in a nested containment list, which is trivial to set | |
| 618 * up as the indices are on sorted data so we're appending to the nclist | |
| 619 * in sorted order. Basically if a slice entirely fits within a previous | |
| 620 * slice then we append to that slices list. This is done recursively. | |
| 621 * | |
| 622 * Lists are sorted on two dimensions: ref id + slice coords. | |
| 623 */ | |
| 624 typedef struct cram_index { | |
| 625 int nslice, nalloc; // total number of slices | |
| 626 struct cram_index *e; // array of size nslice | |
| 627 | |
| 628 int refid; // 1.0 1.1 | |
| 629 int start; // 1.0 1.1 | |
| 630 int end; // 1.1 | |
| 631 int nseq; // 1.0 - undocumented | |
| 632 int slice; // 1.0 landmark index, 1.1 landmark value | |
| 633 int len; // 1.1 - size of slice in bytes | |
| 634 int64_t offset; // 1.0 1.1 | |
| 635 } cram_index; | |
| 636 | |
| 637 typedef struct { | |
| 638 int refid; | |
| 639 int start; | |
| 640 int end; | |
| 641 } cram_range; | |
| 642 | |
| 643 /*----------------------------------------------------------------------------- | |
| 644 */ | |
| 645 /* CRAM File handle */ | |
| 646 | |
| 647 typedef struct spare_bams { | |
| 648 bam_seq_t **bams; | |
| 649 struct spare_bams *next; | |
| 650 } spare_bams; | |
| 651 | |
| 652 typedef struct cram_fd { | |
| 653 struct hFILE *fp; | |
| 654 int mode; // 'r' or 'w' | |
| 655 int version; | |
| 656 cram_file_def *file_def; | |
| 657 SAM_hdr *header; | |
| 658 | |
| 659 char *prefix; | |
| 660 int64_t record_counter; | |
| 661 int err; | |
| 662 | |
| 663 // Most recent compression header decoded | |
| 664 //cram_block_compression_hdr *comp_hdr; | |
| 665 //cram_block_slice_hdr *slice_hdr; | |
| 666 | |
| 667 // Current container being processed. | |
| 668 cram_container *ctr; | |
| 669 | |
| 670 // positions for encoding or decoding | |
| 671 int first_base, last_base; | |
| 672 | |
| 673 // cached reference portion | |
| 674 refs_t *refs; // ref meta-data structure | |
| 675 char *ref, *ref_free; // current portion held in memory | |
| 676 int ref_id; | |
| 677 int ref_start; | |
| 678 int ref_end; | |
| 679 char *ref_fn; // reference fasta filename | |
| 680 | |
| 681 // compression level and metrics | |
| 682 int level; | |
| 683 cram_metrics *m[DS_END]; | |
| 684 | |
| 685 // options | |
| 686 int decode_md; // Whether to export MD and NM tags | |
| 687 int verbose; | |
| 688 int seqs_per_slice; | |
| 689 int slices_per_container; | |
| 690 int embed_ref; | |
| 691 int no_ref; | |
| 692 int ignore_md5; | |
| 693 int use_bz2; | |
| 694 int use_rans; | |
| 695 int use_lzma; | |
| 696 int shared_ref; | |
| 697 unsigned int required_fields; | |
| 698 cram_range range; | |
| 699 | |
| 700 // lookup tables, stored here so we can be trivially multi-threaded | |
| 701 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags | |
| 702 unsigned int cram_flag_swap[0x1000];// bam -> cram flags | |
| 703 unsigned char L1[256]; // ACGT{*} ->0123{4} | |
| 704 unsigned char L2[256]; // ACGTN{*}->01234{5} | |
| 705 char cram_sub_matrix[32][32]; // base substituion codes | |
| 706 | |
| 707 int index_sz; | |
| 708 cram_index *index; // array, sizeof index_sz | |
| 709 off_t first_container; | |
| 710 int eof; | |
| 711 int last_slice; // number of recs encoded in last slice | |
| 712 int multi_seq; | |
| 713 int unsorted; | |
| 714 int empty_container; // Marker for EOF block | |
| 715 | |
| 716 // thread pool | |
| 717 int own_pool; | |
| 718 t_pool *pool; | |
| 719 t_results_queue *rqueue; | |
| 720 pthread_mutex_t metrics_lock; | |
| 721 pthread_mutex_t ref_lock; | |
| 722 spare_bams *bl; | |
| 723 pthread_mutex_t bam_list_lock; | |
| 724 void *job_pending; | |
| 725 int ooc; // out of containers. | |
| 726 } cram_fd; | |
| 727 | |
| 728 // Translation of required fields to cram data series | |
| 729 enum cram_fields { | |
| 730 CRAM_BF = 0x00000001, | |
| 731 CRAM_AP = 0x00000002, | |
| 732 CRAM_FP = 0x00000004, | |
| 733 CRAM_RL = 0x00000008, | |
| 734 CRAM_DL = 0x00000010, | |
| 735 CRAM_NF = 0x00000020, | |
| 736 CRAM_BA = 0x00000040, | |
| 737 CRAM_QS = 0x00000080, | |
| 738 CRAM_FC = 0x00000100, | |
| 739 CRAM_FN = 0x00000200, | |
| 740 CRAM_BS = 0x00000400, | |
| 741 CRAM_IN = 0x00000800, | |
| 742 CRAM_RG = 0x00001000, | |
| 743 CRAM_MQ = 0x00002000, | |
| 744 CRAM_TL = 0x00004000, | |
| 745 CRAM_RN = 0x00008000, | |
| 746 CRAM_NS = 0x00010000, | |
| 747 CRAM_NP = 0x00020000, | |
| 748 CRAM_TS = 0x00040000, | |
| 749 CRAM_MF = 0x00080000, | |
| 750 CRAM_CF = 0x00100000, | |
| 751 CRAM_RI = 0x00200000, | |
| 752 CRAM_RS = 0x00400000, | |
| 753 CRAM_PD = 0x00800000, | |
| 754 CRAM_HC = 0x01000000, | |
| 755 CRAM_SC = 0x02000000, | |
| 756 CRAM_BB = 0x04000000, | |
| 757 CRAM_BB_len = 0x08000000, | |
| 758 CRAM_QQ = 0x10000000, | |
| 759 CRAM_QQ_len = 0x20000000, | |
| 760 CRAM_aux= 0x40000000, | |
| 761 CRAM_ALL= 0x7fffffff, | |
| 762 }; | |
| 763 | |
| 764 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may | |
| 765 // encode a base difference, but we don't need to know what it is for CIGAR. | |
| 766 // If we have a soft-clip or insertion, we do need SC/IN though to know how | |
| 767 // long that array is. | |
| 768 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ | |
| 769 CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) | |
| 770 | |
| 771 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \ | |
| 772 CRAM_RL | CRAM_AP | CRAM_BB | CRAM_QQ) | |
| 773 | |
| 774 /* BF bitfields */ | |
| 775 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ | |
| 776 #define CRAM_FPAIRED 256 | |
| 777 #define CRAM_FPROPER_PAIR 128 | |
| 778 #define CRAM_FUNMAP 64 | |
| 779 #define CRAM_FREVERSE 32 | |
| 780 #define CRAM_FREAD1 16 | |
| 781 #define CRAM_FREAD2 8 | |
| 782 #define CRAM_FSECONDARY 4 | |
| 783 #define CRAM_FQCFAIL 2 | |
| 784 #define CRAM_FDUP 1 | |
| 785 | |
| 786 #define DS_aux_S "\001" | |
| 787 #define DS_aux_OQ_S "\002" | |
| 788 #define DS_aux_BQ_S "\003" | |
| 789 #define DS_aux_BD_S "\004" | |
| 790 #define DS_aux_BI_S "\005" | |
| 791 #define DS_aux_FZ_S "\006" | |
| 792 #define DS_aux_oq_S "\007" | |
| 793 #define DS_aux_os_S "\010" | |
| 794 #define DS_aux_oz_S "\011" | |
| 795 | |
| 796 #define CRAM_M_REVERSE 1 | |
| 797 #define CRAM_M_UNMAP 2 | |
| 798 | |
| 799 | |
| 800 /* CF bitfields */ | |
| 801 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) | |
| 802 #define CRAM_FLAG_DETACHED (1<<1) | |
| 803 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) | |
| 804 | |
| 805 #ifdef __cplusplus | |
| 806 } | |
| 807 #endif | |
| 808 | |
| 809 #endif /* _CRAM_STRUCTS_H_ */ |
