Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_structs.h @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (c) 2012-2013 Genome Research Ltd. | |
3 Author: James Bonfield <jkb@sanger.ac.uk> | |
4 | |
5 Redistribution and use in source and binary forms, with or without | |
6 modification, are permitted provided that the following conditions are met: | |
7 | |
8 1. Redistributions of source code must retain the above copyright notice, | |
9 this list of conditions and the following disclaimer. | |
10 | |
11 2. Redistributions in binary form must reproduce the above copyright notice, | |
12 this list of conditions and the following disclaimer in the documentation | |
13 and/or other materials provided with the distribution. | |
14 | |
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
16 Institute nor the names of its contributors may be used to endorse or promote | |
17 products derived from this software without specific prior written permission. | |
18 | |
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 #ifndef _CRAM_STRUCTS_H_ | |
32 #define _CRAM_STRUCTS_H_ | |
33 | |
34 #ifdef __cplusplus | |
35 extern "C" { | |
36 #endif | |
37 | |
38 /* | |
39 * Defines in-memory structs for the basic file-format objects in the | |
40 * CRAM format. | |
41 * | |
42 * The basic file format is: | |
43 * File-def SAM-hdr Container Container ... | |
44 * | |
45 * Container: | |
46 * Service-block data-block data-block ... | |
47 * | |
48 * Multiple blocks in a container are grouped together as slices, | |
49 * also sometimes referred to as landmarks in the spec. | |
50 */ | |
51 | |
52 | |
53 #include <stdint.h> | |
54 | |
55 #include "cram/thread_pool.h" | |
56 #include "cram/string_alloc.h" | |
57 #include "htslib/khash.h" | |
58 | |
59 // Generic hash-map integer -> integer | |
60 KHASH_MAP_INIT_INT(m_i2i, int) | |
61 | |
62 // Generic hash-set integer -> (existance) | |
63 KHASH_SET_INIT_INT(s_i2i) | |
64 | |
/* Short alias for unsigned char, used for brevity. */
typedef unsigned char uc;
67 | |
/*
 * Value type held in the preservation map: either an integer or a
 * character pointer.  A single union type is required for khash.
 */
typedef union {
    int   i;   /* integer value */
    char *p;   /* pointer value */
} pmap_t;
75 | |
76 // Generates static functions here which isn't ideal, but we have no way | |
77 // currently to declare the kh_map_t structure here without also declaring a | |
78 // duplicate in the .c files due to the nature of the KHASH macros. | |
79 KHASH_MAP_INIT_STR(map, pmap_t) | |
80 | |
81 struct hFILE; | |
82 | |
/* Sequences-per-slice and slices-per-container constants. */
#define SEQS_PER_SLICE 10000
#define SLICE_PER_CNT  1

/* 20-character substitution matrix string. */
#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"

/* Number of entries in cram_stats.freqs[]. */
#define MAX_STAT_VAL 1024
//#define MAX_STAT_VAL 16
90 typedef struct { | |
91 int freqs[MAX_STAT_VAL]; | |
92 khash_t(m_i2i) *h; | |
93 int nsamp; // total number of values added | |
94 int nvals; // total number of unique values added | |
95 } cram_stats; | |
96 | |
/*
 * Encoding identifiers.
 * NB: the numbering matches the java implementation, not the spec.
 */
enum cram_encoding {
    E_NULL = 0,
    E_EXTERNAL,         /* 1 */
    E_GOLOMB,           /* 2 */
    E_HUFFMAN,          /* 3 */
    E_BYTE_ARRAY_LEN,   /* 4 */
    E_BYTE_ARRAY_STOP,  /* 5 */
    E_BETA,             /* 6 */
    E_SUBEXP,           /* 7 */
    E_GOLOMB_RICE,      /* 8 */
    E_GAMMA             /* 9 */
};
110 | |
/* Value types for externally stored data; numbering starts at 1. */
enum cram_external_type {
    E_INT = 1,
    E_LONG,              /* 2 */
    E_BYTE,              /* 3 */
    E_BYTE_ARRAY,        /* 4 */
    E_BYTE_ARRAY_BLOCK   /* 5 */
};
118 | |
/*
 * External IDs used by this implementation (only assumed during writing).
 * Values are consecutive starting at 0; DS_END is the number of IDs and is
 * used to size per-data-series arrays.
 */
enum cram_DS_ID {
    DS_CORE = 0,
    DS_aux,        /* 1: aux_blk */
    DS_aux_OQ,     /* 2 */
    DS_aux_BQ,     /* 3 */
    DS_aux_BD,     /* 4 */
    DS_aux_BI,     /* 5 */
    DS_aux_FZ,     /* 6: also ZM:B */
    DS_aux_oq,     /* 7: other qualities */
    DS_aux_os,     /* 8: other sequences */
    DS_aux_oz,     /* 9: other strings */
    DS_ref,
    DS_RN,         /* name_blk */
    DS_QS,         /* qual_blk */
    DS_IN,         /* base_blk */
    DS_SC,         /* soft_blk */

    DS_BF,         /* start loop */
    DS_CF,
    DS_AP,
    DS_RG,
    DS_MQ,
    DS_NS,
    DS_MF,
    DS_TS,
    DS_NP,
    DS_NF,
    DS_RL,
    DS_FN,
    DS_FC,
    DS_FP,
    DS_DL,
    DS_BA,
    DS_BS,
    DS_TL,
    DS_RI,
    DS_RS,
    DS_PD,
    DS_HC,
    DS_BB,
    DS_QQ,

    DS_TN,         /* end loop */

    DS_RN_len,
    DS_SC_len,
    DS_BB_len,
    DS_QQ_len,

    DS_TC,         /* CRAM v1.0 tags */
    DS_TM,         /* test */
    DS_TV,         /* test */

    DS_END
};
175 | |
/* "File Definition Structure" */
typedef struct {
    char    magic[4];
    uint8_t major_version;
    uint8_t minor_version;
    char    file_id[20];    /* Filename or SHA1 checksum */
} cram_file_def;
183 | |
/*
 * Split a packed version number: the major version is everything above
 * the low 8 bits, the minor version is the low 8 bits.
 */
#define CRAM_MAJOR_VERS(v)  ((v) >> 8)
#define CRAM_MINOR_VERS(v)  ((v) & 0xFF)
186 | |
187 struct cram_slice; | |
188 | |
/* Block compression methods. */
enum cram_block_method {
    ERROR = -1,
    RAW,               /* 0 */
    GZIP,              /* 1 */
    BZIP2,             /* 2 */
    LZMA,              /* 3 */
    RANS,              /* 4: Generic; either order */
    RANS0 = RANS,      /* 4 */
    RANS1 = 10,        /* Not externalised; stored as RANS (generic) */
    GZIP_RLE           /* 11; NB: not externalised in CRAM */
};
200 | |
/* Block content types. */
enum cram_content_type {
    CT_ERROR = -1,
    FILE_HEADER,          /* 0 */
    COMPRESSION_HEADER,   /* 1 */
    MAPPED_SLICE,         /* 2 */
    UNMAPPED_SLICE,       /* 3: CRAM V1.0 only */
    EXTERNAL,             /* 4 */
    CORE                  /* 5 */
};
210 | |
/*
 * Compression metrics.
 *
 * Member order is part of the layout and must not be changed.
 */
typedef struct {
    /* Number of trials and time to next trial */
    int trial, next_trial;

    /* Aggregate sizes during trials */
    int sz_gz_rle, sz_gz_def, sz_rans0, sz_rans1, sz_bzip2, sz_lzma;

    /* Resultant method from trials */
    int method, strat;

    /* Revisions of method, to allow culling of continually failing ones */
    int gz_rle_cnt, gz_def_cnt, rans0_cnt, rans1_cnt, bzip2_cnt, lzma_cnt;
    int revised_method;

    /* Per-method "extra" values, one for each method counted above */
    double gz_rle_extra, gz_def_extra;
    double rans0_extra, rans1_extra;
    double bzip2_extra, lzma_extra;
} cram_metrics;
245 | |
246 /* Block */ | |
247 typedef struct { | |
248 enum cram_block_method method, orig_method; | |
249 enum cram_content_type content_type; | |
250 int32_t content_id; | |
251 int32_t comp_size; | |
252 int32_t uncomp_size; | |
253 uint32_t crc32; | |
254 int32_t idx; /* offset into data */ | |
255 unsigned char *data; | |
256 | |
257 // For bit I/O | |
258 size_t alloc; | |
259 size_t byte; | |
260 int bit; | |
261 } cram_block; | |
262 | |
263 struct cram_codec; /* defined in cram_codecs.h */ | |
264 struct cram_map; | |
265 | |
/*
 * Size of the encoding-map bucket arrays, and the function that folds a
 * pair (a, b) into a bucket index in [0, CRAM_MAP_HASH).  The mask only
 * works because CRAM_MAP_HASH is a power of two.
 */
#define CRAM_MAP_HASH  32
#define CRAM_MAP(a,b)  (((a)*3+(b))&(CRAM_MAP_HASH-1))
268 | |
269 /* Compression header block */ | |
270 typedef struct { | |
271 int32_t ref_seq_id; | |
272 int32_t ref_seq_start; | |
273 int32_t ref_seq_span; | |
274 int32_t num_records; | |
275 int32_t num_landmarks; | |
276 int32_t *landmark; | |
277 | |
278 /* Flags from preservation map */ | |
279 int mapped_qs_included; | |
280 int unmapped_qs_included; | |
281 int unmapped_placed; | |
282 int qs_included; | |
283 int read_names_included; | |
284 int AP_delta; | |
285 // indexed by ref-base and subst. code | |
286 char substitution_matrix[5][4]; | |
287 | |
288 // TD Dictionary as a concatenated block | |
289 cram_block *TD_blk; // Tag Dictionary | |
290 int nTL; // number of TL entries in TD | |
291 unsigned char **TL; // array of size nTL, pointer into TD_blk. | |
292 khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices | |
293 string_alloc_t *TD_keys; // Pooled keys for TD hash. | |
294 | |
295 khash_t(map) *preservation_map; | |
296 struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; | |
297 struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; | |
298 | |
299 struct cram_codec *codecs[DS_END]; | |
300 | |
301 char *uncomp; // A single block of uncompressed data | |
302 size_t uncomp_size, uncomp_alloc; | |
303 | |
304 unsigned int data_series; // See cram_fields enum below | |
305 } cram_block_compression_hdr; | |
306 | |
307 typedef struct cram_map { | |
308 int key; /* 0xe0 + 3 bytes */ | |
309 enum cram_encoding encoding; | |
310 int offset; /* Offset into a single block of memory */ | |
311 int size; /* Size */ | |
312 struct cram_codec *codec; | |
313 struct cram_map *next; // for noddy internal hash | |
314 } cram_map; | |
315 | |
316 /* Mapped or unmapped slice header block */ | |
317 typedef struct { | |
318 enum cram_content_type content_type; | |
319 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ | |
320 int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ | |
321 int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ | |
322 int32_t num_records; | |
323 int64_t record_counter; | |
324 int32_t num_blocks; | |
325 int32_t num_content_ids; | |
326 int32_t *block_content_ids; | |
327 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ | |
328 unsigned char md5[16]; | |
329 } cram_block_slice_hdr; | |
330 | |
331 struct ref_entry; | |
332 | |
333 /* | |
334 * Container. | |
335 * | |
336 * Conceptually a container is split into slices, and slices into blocks. | |
337 * However on disk it's just a list of blocks and we need to query the | |
338 * block types to identify the start/end points of the slices. | |
339 * | |
340 * OR... are landmarks the start/end points of slices? | |
341 */ | |
342 typedef struct { | |
343 int32_t length; | |
344 int32_t ref_seq_id; | |
345 int32_t ref_seq_start; | |
346 int32_t ref_seq_span; | |
347 int64_t record_counter; | |
348 int64_t num_bases; | |
349 int32_t num_records; | |
350 int32_t num_blocks; | |
351 int32_t num_landmarks; | |
352 int32_t *landmark; | |
353 | |
354 /* Size of container header above */ | |
355 size_t offset; | |
356 | |
357 /* Compression header is always the first block? */ | |
358 cram_block_compression_hdr *comp_hdr; | |
359 cram_block *comp_hdr_block; | |
360 | |
361 /* For construction purposes */ | |
362 int max_slice, curr_slice; // maximum number of slices | |
363 int max_rec, curr_rec; // current and max recs per slice | |
364 int max_c_rec, curr_c_rec; // current and max recs per container | |
365 int slice_rec; // rec no. for start of this slice | |
366 int curr_ref; // current ref ID. -2 for no previous | |
367 int last_pos; // last record position | |
368 struct cram_slice **slices, *slice; | |
369 int pos_sorted; // boolean, 1=>position sorted data | |
370 int max_apos; // maximum position, used if pos_sorted==0 | |
371 int last_slice; // number of reads in last slice (0 for 1st) | |
372 int multi_seq; // true if packing multi seqs per cont/slice | |
373 int unsorted; // true is AP_delta is 0. | |
374 | |
375 /* Copied from fd before encoding, to allow multi-threading */ | |
376 int ref_start, first_base, last_base, ref_id, ref_end; | |
377 char *ref; | |
378 //struct ref_entry *ref; | |
379 | |
380 /* For multi-threading */ | |
381 bam_seq_t **bams; | |
382 | |
383 /* Statistics for encoding */ | |
384 cram_stats *stats[DS_END]; | |
385 | |
386 khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map | |
387 int *refs_used; // array of frequency of ref seq IDs | |
388 | |
389 uint32_t crc32; // CRC32 | |
390 } cram_container; | |
391 | |
/*
 * A single cram record.
 *
 * Variable-length data is not stored inline: most members below are
 * indices into buffers owned by the slice 's'.
 */
typedef struct {
    struct cram_slice *s;   /* Filled out by cram_decode only */

    int32_t ref_id;         /* fixed for all recs in slice? */
    int32_t flags;          /* BF */
    int32_t cram_flags;     /* CF */
    int32_t len;            /* RL */
    int32_t apos;           /* AP */
    int32_t rg;             /* RG */
    int32_t name;           /* RN; idx to s->names_blk */
    int32_t name_len;
    int32_t mate_line;      /* index to another cram_record */
    int32_t mate_ref_id;
    int32_t mate_pos;       /* NP */
    int32_t tlen;           /* TS */

    /* Auxiliary data */
    int32_t ntags;          /* TC */
    int32_t aux;            /* idx to s->aux_blk */
    int32_t aux_size;       /* total size of packed ntags in aux_blk */
#ifndef TN_external
    int32_t TN_idx;         /* TN; idx to s->TN; */
#else
    int32_t tn;             /* idx to s->tn_blk */
#endif
    int     TL;

    int32_t seq;            /* idx to s->seqs_blk */
    int32_t qual;           /* idx to s->qual_blk */
    int32_t cigar;          /* idx to s->cigar */
    int32_t ncigar;
    int32_t aend;           /* alignment end */
    int32_t mqual;          /* MQ */

    int32_t feature;        /* idx to s->feature */
    int32_t nfeature;       /* number of features */
    int32_t mate_flags;     /* MF */
} cram_record;
433 | |
/*
 * Accessor macros as an analogue of the bam ones.
 *
 * 'c' is a pointer to a cram_record whose 's' member points at the owning
 * slice.  The record stores indices; these macros turn them into pointers
 * into the slice's buffers.  'c' is evaluated more than once, so it must
 * not have side effects.
 */
#define cram_qname(c)    (&(c)->s->name_blk->data[(c)->name])
#define cram_seq(c)      (&(c)->s->seqs_blk->data[(c)->seq])
#define cram_qual(c)     (&(c)->s->qual_blk->data[(c)->qual])
#define cram_aux(c)      (&(c)->s->aux_blk->data[(c)->aux])
#define cram_seqi(c,i)   (cram_seq((c))[(i)])
#define cram_name_len(c) ((c)->name_len)
#define cram_strand(c)   (((c)->flags & BAM_FREVERSE) != 0)
#define cram_mstrand(c)  (((c)->flags & BAM_FMREVERSE) != 0)
/* Fixed: previously dereferenced an undeclared 'cr' instead of parameter 'c'. */
#define cram_cigar(c)    (&((c)->s->cigar)[(c)->cigar])
444 | |
/*
 * A feature is a base difference, used for the sequence reference encoding.
 * (We generate these internally when writing CRAM.)
 *
 * Every variant begins with the same two members, 'pos' and 'code',
 * followed by up to two variant-specific ints.
 */
typedef struct {
    union {
        struct { int pos, code, base;         } X; /* base: substitution code */
        struct { int pos, code, base, qual;   } B; /* actual base & qual */
        struct { int pos, code, seq_idx, len; } b; /* seq_idx: index to s->seqs_blk */
        struct { int pos, code, qual;         } Q;
        struct { int pos, code, len, seq_idx; } S; /* soft-clip multiple bases */
        struct { int pos, code, len, seq_idx; } I; /* insertion multiple bases */
        struct { int pos, code, base;         } i; /* insertion single base */
        struct { int pos, code, len;          } D;
        struct { int pos, code, len;          } N;
        struct { int pos, code, len;          } P;
        struct { int pos, code, len;          } H;
    };
} cram_feature;
512 | |
513 /* | |
514 * A slice is really just a set of blocks, but it | |
515 * is the logical unit for decoding a number of | |
516 * sequences. | |
517 */ | |
518 typedef struct cram_slice { | |
519 cram_block_slice_hdr *hdr; | |
520 cram_block *hdr_block; | |
521 cram_block **block; | |
522 cram_block **block_by_id; | |
523 | |
524 /* State used during encoding/decoding */ | |
525 int last_apos, max_apos; | |
526 | |
527 /* Array of decoded cram records */ | |
528 cram_record *crecs; | |
529 | |
530 /* An dynamically growing buffers for data pointed | |
531 * to by crecs[] array. | |
532 */ | |
533 uint32_t *cigar; | |
534 uint32_t cigar_alloc; | |
535 uint32_t ncigar; | |
536 | |
537 cram_feature *features; | |
538 int nfeatures; | |
539 int afeatures; // allocated size of features | |
540 | |
541 #ifndef TN_external | |
542 // TN field (Tag Name) | |
543 uint32_t *TN; | |
544 int nTN, aTN; // used and allocated size for TN[] | |
545 #else | |
546 cram_block *tn_blk; | |
547 int tn_id; | |
548 #endif | |
549 | |
550 // For variable sized elements which are always external blocks. | |
551 cram_block *name_blk; | |
552 cram_block *seqs_blk; | |
553 cram_block *qual_blk; | |
554 cram_block *base_blk; | |
555 cram_block *soft_blk; | |
556 cram_block *aux_blk; | |
557 cram_block *aux_OQ_blk; | |
558 cram_block *aux_BQ_blk; | |
559 cram_block *aux_BD_blk; | |
560 cram_block *aux_BI_blk; | |
561 cram_block *aux_FZ_blk; | |
562 cram_block *aux_oq_blk; | |
563 cram_block *aux_os_blk; | |
564 cram_block *aux_oz_blk; | |
565 | |
566 string_alloc_t *pair_keys; // Pooled keys for pair hash. | |
567 khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice. | |
568 | |
569 char *ref; // slice of current reference | |
570 int ref_start; // start position of current reference; | |
571 int ref_end; // end position of current reference; | |
572 int ref_id; | |
573 } cram_slice; | |
574 | |
575 /*----------------------------------------------------------------------------- | |
576 * Consider moving reference handling to cram_refs.[ch] | |
577 */ | |
/* One reference sequence entry; from fa.fai / samtools faidx files. */
typedef struct ref_entry {
    char    *name;
    char    *fn;
    int64_t  length;
    int64_t  offset;
    int      bases_per_line;
    int      line_length;
    int64_t  count;          /* for shared references so we know to dealloc seq */
    char    *seq;
} ref_entry;
589 | |
590 KHASH_MAP_INIT_STR(refs, ref_entry*) | |
591 | |
592 // References structure. | |
593 typedef struct { | |
594 string_alloc_t *pool; // String pool for holding filenames and SN vals | |
595 | |
596 khash_t(refs) *h_meta; // ref_entry*, index by name | |
597 ref_entry **ref_id; // ref_entry*, index by ID | |
598 int nref; // number of ref_entry | |
599 | |
600 char *fn; // current file opened | |
601 BGZF *fp; // and the hFILE* to go with it. | |
602 | |
603 int count; // how many cram_fd sharing this refs struct | |
604 | |
605 pthread_mutex_t lock; // Mutex for multi-threaded updating | |
606 ref_entry *last; // Last queried sequence | |
607 int last_id; // Used in cram_ref_decr_locked to delay free | |
608 } refs_t; | |
609 | |
610 /*----------------------------------------------------------------------------- | |
611 * CRAM index | |
612 * | |
613 * Detect format by number of entries per line. | |
614 * 5 => 1.0 (refid, start, nseq, C offset, slice) | |
615 * 6 => 1.1 (refid, start, span, C offset, S offset, S size) | |
616 * | |
617 * Indices are stored in a nested containment list, which is trivial to set | |
618 * up as the indices are on sorted data so we're appending to the nclist | |
619 * in sorted order. Basically if a slice entirely fits within a previous | |
620 * slice then we append to that slices list. This is done recursively. | |
621 * | |
622 * Lists are sorted on two dimensions: ref id + slice coords. | |
623 */ | |
/*
 * One index entry.  Entries nest: 'e' points at an array of child entries.
 * The trailing notes give the index format versions (1.0 / 1.1) that
 * supply each field.
 */
typedef struct cram_index {
    int     nslice, nalloc;   /* total number of slices (nalloc: presumably
                               * the allocated capacity of e[] - confirm) */
    struct cram_index *e;     /* array of size nslice */

    int     refid;            /* 1.0                 1.1 */
    int     start;            /* 1.0                 1.1 */
    int     end;              /*                     1.1 */
    int     nseq;             /* 1.0 - undocumented */
    int     slice;            /* 1.0 landmark index, 1.1 landmark value */
    int     len;              /*                     1.1 - size of slice in bytes */
    int64_t offset;           /* 1.0                 1.1 */
} cram_index;
636 | |
/* A range on one reference: reference ID plus start and end coordinates. */
typedef struct {
    int refid;
    int start;
    int end;
} cram_range;
642 | |
643 /*----------------------------------------------------------------------------- | |
644 */ | |
645 /* CRAM File handle */ | |
646 | |
647 typedef struct spare_bams { | |
648 bam_seq_t **bams; | |
649 struct spare_bams *next; | |
650 } spare_bams; | |
651 | |
652 typedef struct cram_fd { | |
653 struct hFILE *fp; | |
654 int mode; // 'r' or 'w' | |
655 int version; | |
656 cram_file_def *file_def; | |
657 SAM_hdr *header; | |
658 | |
659 char *prefix; | |
660 int64_t record_counter; | |
661 int err; | |
662 | |
663 // Most recent compression header decoded | |
664 //cram_block_compression_hdr *comp_hdr; | |
665 //cram_block_slice_hdr *slice_hdr; | |
666 | |
667 // Current container being processed. | |
668 cram_container *ctr; | |
669 | |
670 // positions for encoding or decoding | |
671 int first_base, last_base; | |
672 | |
673 // cached reference portion | |
674 refs_t *refs; // ref meta-data structure | |
675 char *ref, *ref_free; // current portion held in memory | |
676 int ref_id; | |
677 int ref_start; | |
678 int ref_end; | |
679 char *ref_fn; // reference fasta filename | |
680 | |
681 // compression level and metrics | |
682 int level; | |
683 cram_metrics *m[DS_END]; | |
684 | |
685 // options | |
686 int decode_md; // Whether to export MD and NM tags | |
687 int verbose; | |
688 int seqs_per_slice; | |
689 int slices_per_container; | |
690 int embed_ref; | |
691 int no_ref; | |
692 int ignore_md5; | |
693 int use_bz2; | |
694 int use_rans; | |
695 int use_lzma; | |
696 int shared_ref; | |
697 unsigned int required_fields; | |
698 cram_range range; | |
699 | |
700 // lookup tables, stored here so we can be trivially multi-threaded | |
701 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags | |
702 unsigned int cram_flag_swap[0x1000];// bam -> cram flags | |
703 unsigned char L1[256]; // ACGT{*} ->0123{4} | |
704 unsigned char L2[256]; // ACGTN{*}->01234{5} | |
705 char cram_sub_matrix[32][32]; // base substituion codes | |
706 | |
707 int index_sz; | |
708 cram_index *index; // array, sizeof index_sz | |
709 off_t first_container; | |
710 int eof; | |
711 int last_slice; // number of recs encoded in last slice | |
712 int multi_seq; | |
713 int unsorted; | |
714 int empty_container; // Marker for EOF block | |
715 | |
716 // thread pool | |
717 int own_pool; | |
718 t_pool *pool; | |
719 t_results_queue *rqueue; | |
720 pthread_mutex_t metrics_lock; | |
721 pthread_mutex_t ref_lock; | |
722 spare_bams *bl; | |
723 pthread_mutex_t bam_list_lock; | |
724 void *job_pending; | |
725 int ooc; // out of containers. | |
726 } cram_fd; | |
727 | |
/*
 * Translation of required fields to cram data series.
 * One bit per data series; CRAM_ALL has all 31 bits set.
 */
enum cram_fields {
    CRAM_BF     = 1 << 0,
    CRAM_AP     = 1 << 1,
    CRAM_FP     = 1 << 2,
    CRAM_RL     = 1 << 3,
    CRAM_DL     = 1 << 4,
    CRAM_NF     = 1 << 5,
    CRAM_BA     = 1 << 6,
    CRAM_QS     = 1 << 7,
    CRAM_FC     = 1 << 8,
    CRAM_FN     = 1 << 9,
    CRAM_BS     = 1 << 10,
    CRAM_IN     = 1 << 11,
    CRAM_RG     = 1 << 12,
    CRAM_MQ     = 1 << 13,
    CRAM_TL     = 1 << 14,
    CRAM_RN     = 1 << 15,
    CRAM_NS     = 1 << 16,
    CRAM_NP     = 1 << 17,
    CRAM_TS     = 1 << 18,
    CRAM_MF     = 1 << 19,
    CRAM_CF     = 1 << 20,
    CRAM_RI     = 1 << 21,
    CRAM_RS     = 1 << 22,
    CRAM_PD     = 1 << 23,
    CRAM_HC     = 1 << 24,
    CRAM_SC     = 1 << 25,
    CRAM_BB     = 1 << 26,
    CRAM_BB_len = 1 << 27,
    CRAM_QQ     = 1 << 28,
    CRAM_QQ_len = 1 << 29,
    CRAM_aux    = 1 << 30,
    CRAM_ALL    = 0x7fffffff
};
763 | |
/*
 * CRAM_CIGAR: the data series needed for a CIGAR opcode, but not
 * necessarily the implications of it.  Eg FC/FP may encode a base
 * difference, but we don't need to know what it is for CIGAR.  If we have
 * a soft-clip or insertion, we do need SC/IN though to know how long that
 * array is.
 */
#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
                    CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)

/* CRAM_SEQ: everything in CRAM_CIGAR plus the series listed here. */
#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \
                  CRAM_RL    | CRAM_AP | CRAM_BB | CRAM_QQ)
773 | |
/*
 * BF bitfields.
 * Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1
 */
#define CRAM_FPAIRED      0x100
#define CRAM_FPROPER_PAIR 0x080
#define CRAM_FUNMAP       0x040
#define CRAM_FREVERSE     0x020
#define CRAM_FREAD1       0x010
#define CRAM_FREAD2       0x008
#define CRAM_FSECONDARY   0x004
#define CRAM_FQCFAIL      0x002
#define CRAM_FDUP         0x001
785 | |
/*
 * One-character string forms of the DS_aux* IDs: each literal holds the
 * single byte whose value (octal escape) equals the matching cram_DS_ID
 * value, 1 through 9.
 */
#define DS_aux_S    "\001"
#define DS_aux_OQ_S "\002"
#define DS_aux_BQ_S "\003"
#define DS_aux_BD_S "\004"
#define DS_aux_BI_S "\005"
#define DS_aux_FZ_S "\006"
#define DS_aux_oq_S "\007"
#define DS_aux_os_S "\010"
#define DS_aux_oz_S "\011"
795 | |
/* CRAM_M_* bit values (presumably the MF / mate_flags bits - confirm) */
#define CRAM_M_REVERSE  1
#define CRAM_M_UNMAP    2


/* CF bitfields */
#define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
#define CRAM_FLAG_DETACHED             (1<<1)
#define CRAM_FLAG_MATE_DOWNSTREAM      (1<<2)
804 | |
805 #ifdef __cplusplus | |
806 } | |
807 #endif | |
808 | |
809 #endif /* _CRAM_STRUCTS_H_ */ |