comparison ezBAMQC/src/htslib/cram/cram_structs.h @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #ifndef _CRAM_STRUCTS_H_
32 #define _CRAM_STRUCTS_H_
33
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37
38 /*
39 * Defines in-memory structs for the basic file-format objects in the
40 * CRAM format.
41 *
42 * The basic file format is:
43 * File-def SAM-hdr Container Container ...
44 *
45 * Container:
46 * Service-block data-block data-block ...
47 *
48 * Multiple blocks in a container are grouped together as slices,
49 * also sometimes referred to as landmarks in the spec.
50 */
51
52
53 #include <stdint.h>
54
55 #include "cram/thread_pool.h"
56 #include "cram/string_alloc.h"
57 #include "htslib/khash.h"
58
59 // Generic hash-map integer -> integer
60 KHASH_MAP_INIT_INT(m_i2i, int)
61
62 // Generic hash-set integer -> (existance)
63 KHASH_SET_INIT_INT(s_i2i)
64
// Shorthand alias for unsigned char, used for brevity.
typedef unsigned char uc;
67
/*
 * Value type for the preservation map: holds either an integer or a
 * character pointer.  It is a union because khash requires a single
 * value type per map.
 */
typedef union {
    int   i;
    char *p;
} pmap_t;
75
76 // Generates static functions here which isn't ideal, but we have no way
77 // currently to declare the kh_map_t structure here without also declaring a
78 // duplicate in the .c files due to the nature of the KHASH macros.
79 KHASH_MAP_INIT_STR(map, pmap_t)
80
81 struct hFILE;
82
// Slice / container sizing constants.
// NOTE(review): presumably the defaults for cram_fd.seqs_per_slice and
// cram_fd.slices_per_container -- confirm against the code that sets them.
#define SEQS_PER_SLICE 10000
#define SLICE_PER_CNT  1

// 20-character substitution matrix string (five groups of four bases).
#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
87
88 #define MAX_STAT_VAL 1024
89 //#define MAX_STAT_VAL 16
90 typedef struct {
91 int freqs[MAX_STAT_VAL];
92 khash_t(m_i2i) *h;
93 int nsamp; // total number of values added
94 int nvals; // total number of unique values added
95 } cram_stats;
96
/*
 * Encoding identifiers.
 * NB: the numeric values match the Java implementation, not the spec.
 */
enum cram_encoding {
    E_NULL            = 0,
    E_EXTERNAL        = 1,
    E_GOLOMB          = 2,
    E_HUFFMAN         = 3,
    E_BYTE_ARRAY_LEN  = 4,
    E_BYTE_ARRAY_STOP = 5,
    E_BETA            = 6,
    E_SUBEXP          = 7,
    E_GOLOMB_RICE     = 8,
    E_GAMMA           = 9
};
110
/* Value types for externally stored data; numbering starts at 1. */
enum cram_external_type {
    E_INT              = 1,
    E_LONG             = 2,
    E_BYTE             = 3,
    E_BYTE_ARRAY       = 4,
    E_BYTE_ARRAY_BLOCK = 5,
};
118
/*
 * External IDs used by this implementation (only assumed during writing).
 *
 * The first ten entries have explicit values; everything after DS_aux_oz
 * is numbered implicitly, so inserting or reordering entries changes the
 * IDs that follow.  DS_END is the total count and is used to size
 * per-data-series arrays.
 */
enum cram_DS_ID {
    DS_CORE   = 0,
    DS_aux    = 1,  // aux_blk
    DS_aux_OQ = 2,
    DS_aux_BQ = 3,
    DS_aux_BD = 4,
    DS_aux_BI = 5,
    DS_aux_FZ = 6,  // also ZM:B
    DS_aux_oq = 7,  // other qualities
    DS_aux_os = 8,  // other sequences
    DS_aux_oz = 9,  // other strings
    DS_ref,         // 10
    DS_RN,          // name_blk
    DS_QS,          // qual_blk
    DS_IN,          // base_blk
    DS_SC,          // soft_blk

    DS_BF,          // start loop (15)
    DS_CF,
    DS_AP,
    DS_RG,
    DS_MQ,
    DS_NS,
    DS_MF,
    DS_TS,
    DS_NP,
    DS_NF,
    DS_RL,
    DS_FN,
    DS_FC,
    DS_FP,
    DS_DL,
    DS_BA,
    DS_BS,
    DS_TL,
    DS_RI,
    DS_RS,
    DS_PD,
    DS_HC,
    DS_BB,
    DS_QQ,

    DS_TN,          // end loop (39)

    DS_RN_len,
    DS_SC_len,
    DS_BB_len,
    DS_QQ_len,

    DS_TC,          // CRAM v1.0 tags
    DS_TM,          // test
    DS_TV,          // test

    DS_END,         // number of IDs (47)
};
175
/*
 * "File Definition Structure": magic bytes, format version and a
 * 20-byte file identifier.
 */
typedef struct {
    char    magic[4];
    uint8_t major_version;
    uint8_t minor_version;
    char    file_id[20];  // Filename or SHA1 checksum
} cram_file_def;
183
// Split a packed version number: major in the bits above the low byte,
// minor in the low byte.
#define CRAM_MAJOR_VERS(v) ((v) >> 8)
#define CRAM_MINOR_VERS(v) ((v) & 0xff)
186
// Forward declaration; the full definition appears later in this header.
struct cram_slice;

/*
 * Block compression methods.
 *
 * RANS and RANS0 deliberately share the value 4.  RANS1 and GZIP_RLE are
 * internal-only values that are not externalised in CRAM files.
 */
enum cram_block_method {
    ERROR    = -1,
    RAW      = 0,
    GZIP     = 1,
    BZIP2    = 2,
    LZMA     = 3,
    RANS     = 4,   // Generic; either order
    RANS0    = 4,
    RANS1    = 10,  // Not externalised; stored as RANS (generic)
    GZIP_RLE = 11,  // NB: not externalised in CRAM
};
200
/* Block content types; CT_ERROR (-1) is the error value. */
enum cram_content_type {
    CT_ERROR           = -1,
    FILE_HEADER        = 0,
    COMPRESSION_HEADER = 1,
    MAPPED_SLICE       = 2,
    UNMAPPED_SLICE     = 3,  // CRAM V1.0 only
    EXTERNAL           = 4,
    CORE               = 5,
};
210
/*
 * Compression metrics: per-method bookkeeping used while trialling
 * compression methods and recording which one was chosen.
 */
typedef struct {
    // number of trials and time to next trial
    int trial;
    int next_trial;

    // aggregate sizes during trials
    int sz_gz_rle;
    int sz_gz_def;
    int sz_rans0;
    int sz_rans1;
    int sz_bzip2;
    int sz_lzma;

    // resultant method from trials
    int method;
    int strat;

    // Revisions of method, to allow culling of continually failing ones.
    int gz_rle_cnt;
    int gz_def_cnt;
    int rans0_cnt;
    int rans1_cnt;
    int bzip2_cnt;
    int lzma_cnt;
    int revised_method;

    // One "extra" value per method.
    // NOTE(review): meaning is not evident from this header -- confirm.
    double gz_rle_extra;
    double gz_def_extra;
    double rans0_extra;
    double rans1_extra;
    double bzip2_extra;
    double lzma_extra;
} cram_metrics;
245
246 /* Block */
247 typedef struct {
248 enum cram_block_method method, orig_method;
249 enum cram_content_type content_type;
250 int32_t content_id;
251 int32_t comp_size;
252 int32_t uncomp_size;
253 uint32_t crc32;
254 int32_t idx; /* offset into data */
255 unsigned char *data;
256
257 // For bit I/O
258 size_t alloc;
259 size_t byte;
260 int bit;
261 } cram_block;
262
struct cram_codec;  /* defined in cram_codecs.h */
struct cram_map;

// Number of buckets in the rec/tag encoding maps.  CRAM_MAP() masks with
// (CRAM_MAP_HASH - 1), so this must remain a power of two.
#define CRAM_MAP_HASH 32

// Bucket index for a pair of values: (a*3 + b) mod CRAM_MAP_HASH.
#define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
268
269 /* Compression header block */
270 typedef struct {
271 int32_t ref_seq_id;
272 int32_t ref_seq_start;
273 int32_t ref_seq_span;
274 int32_t num_records;
275 int32_t num_landmarks;
276 int32_t *landmark;
277
278 /* Flags from preservation map */
279 int mapped_qs_included;
280 int unmapped_qs_included;
281 int unmapped_placed;
282 int qs_included;
283 int read_names_included;
284 int AP_delta;
285 // indexed by ref-base and subst. code
286 char substitution_matrix[5][4];
287
288 // TD Dictionary as a concatenated block
289 cram_block *TD_blk; // Tag Dictionary
290 int nTL; // number of TL entries in TD
291 unsigned char **TL; // array of size nTL, pointer into TD_blk.
292 khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices
293 string_alloc_t *TD_keys; // Pooled keys for TD hash.
294
295 khash_t(map) *preservation_map;
296 struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
297 struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
298
299 struct cram_codec *codecs[DS_END];
300
301 char *uncomp; // A single block of uncompressed data
302 size_t uncomp_size, uncomp_alloc;
303
304 unsigned int data_series; // See cram_fields enum below
305 } cram_block_compression_hdr;
306
307 typedef struct cram_map {
308 int key; /* 0xe0 + 3 bytes */
309 enum cram_encoding encoding;
310 int offset; /* Offset into a single block of memory */
311 int size; /* Size */
312 struct cram_codec *codec;
313 struct cram_map *next; // for noddy internal hash
314 } cram_map;
315
316 /* Mapped or unmapped slice header block */
317 typedef struct {
318 enum cram_content_type content_type;
319 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */
320 int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */
321 int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */
322 int32_t num_records;
323 int64_t record_counter;
324 int32_t num_blocks;
325 int32_t num_content_ids;
326 int32_t *block_content_ids;
327 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */
328 unsigned char md5[16];
329 } cram_block_slice_hdr;
330
331 struct ref_entry;
332
333 /*
334 * Container.
335 *
336 * Conceptually a container is split into slices, and slices into blocks.
337 * However on disk it's just a list of blocks and we need to query the
338 * block types to identify the start/end points of the slices.
339 *
340 * OR... are landmarks the start/end points of slices?
341 */
342 typedef struct {
343 int32_t length;
344 int32_t ref_seq_id;
345 int32_t ref_seq_start;
346 int32_t ref_seq_span;
347 int64_t record_counter;
348 int64_t num_bases;
349 int32_t num_records;
350 int32_t num_blocks;
351 int32_t num_landmarks;
352 int32_t *landmark;
353
354 /* Size of container header above */
355 size_t offset;
356
357 /* Compression header is always the first block? */
358 cram_block_compression_hdr *comp_hdr;
359 cram_block *comp_hdr_block;
360
361 /* For construction purposes */
362 int max_slice, curr_slice; // maximum number of slices
363 int max_rec, curr_rec; // current and max recs per slice
364 int max_c_rec, curr_c_rec; // current and max recs per container
365 int slice_rec; // rec no. for start of this slice
366 int curr_ref; // current ref ID. -2 for no previous
367 int last_pos; // last record position
368 struct cram_slice **slices, *slice;
369 int pos_sorted; // boolean, 1=>position sorted data
370 int max_apos; // maximum position, used if pos_sorted==0
371 int last_slice; // number of reads in last slice (0 for 1st)
372 int multi_seq; // true if packing multi seqs per cont/slice
373 int unsorted; // true is AP_delta is 0.
374
375 /* Copied from fd before encoding, to allow multi-threading */
376 int ref_start, first_base, last_base, ref_id, ref_end;
377 char *ref;
378 //struct ref_entry *ref;
379
380 /* For multi-threading */
381 bam_seq_t **bams;
382
383 /* Statistics for encoding */
384 cram_stats *stats[DS_END];
385
386 khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map
387 int *refs_used; // array of frequency of ref seq IDs
388
389 uint32_t crc32; // CRC32
390 } cram_container;
391
/*
 * A single cram record.
 *
 * Variable-length data is not stored inline: the "idx" fields are
 * offsets into buffers owned by the slice `s`.
 */
typedef struct {
    struct cram_slice *s;  // Filled out by cram_decode only

    int32_t ref_id;        // fixed for all recs in slice?
    int32_t flags;         // BF
    int32_t cram_flags;    // CF
    int32_t len;           // RL
    int32_t apos;          // AP
    int32_t rg;            // RG
    int32_t name;          // RN; idx to s->names_blk
    int32_t name_len;
    int32_t mate_line;     // index to another cram_record
    int32_t mate_ref_id;
    int32_t mate_pos;      // NP
    int32_t tlen;          // TS

    // Auxiliary data
    int32_t ntags;         // TC
    int32_t aux;           // idx to s->aux_blk
    int32_t aux_size;      // total size of packed ntags in aux_blk
#ifndef TN_external
    int32_t TN_idx;        // TN; idx to s->TN;
#else
    int32_t tn;            // idx to s->tn_blk
#endif
    int     TL;

    int32_t seq;           // idx to s->seqs_blk
    int32_t qual;          // idx to s->qual_blk
    int32_t cigar;         // idx to s->cigar
    int32_t ncigar;
    int32_t aend;          // alignment end
    int32_t mqual;         // MQ

    int32_t feature;       // idx to s->feature
    int32_t nfeature;      // number of features
    int32_t mate_flags;    // MF
} cram_record;
433
// Accessor macros as an analogue of the bam ones.
//
// Each takes a pointer `c` to a cram_record whose `s` (slice) pointer is
// set, and yields a pointer into (or a value from) the slice's buffers.
// The argument is evaluated more than once, so avoid side effects in it.
#define cram_qname(c)    (&(c)->s->name_blk->data[(c)->name])
#define cram_seq(c)      (&(c)->s->seqs_blk->data[(c)->seq])
#define cram_qual(c)     (&(c)->s->qual_blk->data[(c)->qual])
#define cram_aux(c)      (&(c)->s->aux_blk->data[(c)->aux])
#define cram_seqi(c,i)   (cram_seq((c))[(i)])
#define cram_name_len(c) ((c)->name_len)
#define cram_strand(c)   (((c)->flags & BAM_FREVERSE) != 0)
#define cram_mstrand(c)  (((c)->flags & BAM_FMREVERSE) != 0)
// Uses the macro parameter `c` for the slice lookup.  The previous
// expansion referenced an unrelated identifier `cr`, so it only compiled
// where the caller happened to have a variable of that name.
#define cram_cigar(c)    (&((c)->s->cigar)[(c)->cigar])
444
/*
 * A feature is a base difference, used for the sequence reference encoding.
 * (We generate these internally when writing CRAM.)
 *
 * Every variant begins with the same two ints (pos, code), so those can
 * be read through any member.  The union is anonymous, so members are
 * accessed directly as e.g. f.X.pos (requires C11 or a compiler
 * extension).
 */
typedef struct {
    union {
        struct {
            int pos;
            int code;
            int base;     // substitution code
        } X;
        struct {
            int pos;
            int code;
            int base;     // actual base & qual
            int qual;
        } B;
        struct {
            int pos;
            int code;
            int seq_idx;  // index to s->seqs_blk
            int len;
        } b;
        struct {
            int pos;
            int code;
            int qual;
        } Q;
        struct {
            int pos;
            int code;
            int len;
            int seq_idx;  // soft-clip multiple bases
        } S;
        struct {
            int pos;
            int code;
            int len;
            int seq_idx;  // insertion multiple bases
        } I;
        struct {
            int pos;
            int code;
            int base;     // insertion single base
        } i;
        struct {
            int pos;
            int code;
            int len;
        } D;
        struct {
            int pos;
            int code;
            int len;
        } N;
        struct {
            int pos;
            int code;
            int len;
        } P;
        struct {
            int pos;
            int code;
            int len;
        } H;
    };
} cram_feature;
512
513 /*
514 * A slice is really just a set of blocks, but it
515 * is the logical unit for decoding a number of
516 * sequences.
517 */
518 typedef struct cram_slice {
519 cram_block_slice_hdr *hdr;
520 cram_block *hdr_block;
521 cram_block **block;
522 cram_block **block_by_id;
523
524 /* State used during encoding/decoding */
525 int last_apos, max_apos;
526
527 /* Array of decoded cram records */
528 cram_record *crecs;
529
530 /* An dynamically growing buffers for data pointed
531 * to by crecs[] array.
532 */
533 uint32_t *cigar;
534 uint32_t cigar_alloc;
535 uint32_t ncigar;
536
537 cram_feature *features;
538 int nfeatures;
539 int afeatures; // allocated size of features
540
541 #ifndef TN_external
542 // TN field (Tag Name)
543 uint32_t *TN;
544 int nTN, aTN; // used and allocated size for TN[]
545 #else
546 cram_block *tn_blk;
547 int tn_id;
548 #endif
549
550 // For variable sized elements which are always external blocks.
551 cram_block *name_blk;
552 cram_block *seqs_blk;
553 cram_block *qual_blk;
554 cram_block *base_blk;
555 cram_block *soft_blk;
556 cram_block *aux_blk;
557 cram_block *aux_OQ_blk;
558 cram_block *aux_BQ_blk;
559 cram_block *aux_BD_blk;
560 cram_block *aux_BI_blk;
561 cram_block *aux_FZ_blk;
562 cram_block *aux_oq_blk;
563 cram_block *aux_os_blk;
564 cram_block *aux_oz_blk;
565
566 string_alloc_t *pair_keys; // Pooled keys for pair hash.
567 khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice.
568
569 char *ref; // slice of current reference
570 int ref_start; // start position of current reference;
571 int ref_end; // end position of current reference;
572 int ref_id;
573 } cram_slice;
574
/*-----------------------------------------------------------------------------
 * Consider moving reference handling to cram_refs.[ch]
 */

/*
 * One reference sequence.  The name/length/offset/line-layout fields
 * come from fa.fai / samtools faidx files.
 */
typedef struct ref_entry {
    char   *name;
    char   *fn;
    int64_t length;
    int64_t offset;
    int     bases_per_line;
    int     line_length;
    int64_t count;  // for shared references so we know to dealloc seq
    char   *seq;
} ref_entry;
589
590 KHASH_MAP_INIT_STR(refs, ref_entry*)
591
592 // References structure.
593 typedef struct {
594 string_alloc_t *pool; // String pool for holding filenames and SN vals
595
596 khash_t(refs) *h_meta; // ref_entry*, index by name
597 ref_entry **ref_id; // ref_entry*, index by ID
598 int nref; // number of ref_entry
599
600 char *fn; // current file opened
601 BGZF *fp; // and the hFILE* to go with it.
602
603 int count; // how many cram_fd sharing this refs struct
604
605 pthread_mutex_t lock; // Mutex for multi-threaded updating
606 ref_entry *last; // Last queried sequence
607 int last_id; // Used in cram_ref_decr_locked to delay free
608 } refs_t;
609
/*-----------------------------------------------------------------------------
 * CRAM index
 *
 * Detect format by number of entries per line.
 * 5 => 1.0 (refid, start, nseq, C offset, slice)
 * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
 *
 * Indices are stored in a nested containment list, which is trivial to set
 * up as the indices are on sorted data so we're appending to the nclist
 * in sorted order. Basically if a slice entirely fits within a previous
 * slice then we append to that slice's list. This is done recursively.
 *
 * Lists are sorted on two dimensions: ref id + slice coords.
 *
 * The "1.0" / "1.1" notes on the fields below say which index format
 * versions populate each field.
 */
typedef struct cram_index {
    int nslice;            // total number of slices
    int nalloc;
    struct cram_index *e;  // array of size nslice

    int     refid;   // 1.0 1.1
    int     start;   // 1.0 1.1
    int     end;     //     1.1
    int     nseq;    // 1.0 - undocumented
    int     slice;   // 1.0 landmark index, 1.1 landmark value
    int     len;     //     1.1 - size of slice in bytes
    int64_t offset;  // 1.0 1.1
} cram_index;
636
/* A region: reference ID plus start and end coordinates. */
typedef struct {
    int refid;
    int start;
    int end;
} cram_range;
642
643 /*-----------------------------------------------------------------------------
644 */
645 /* CRAM File handle */
646
647 typedef struct spare_bams {
648 bam_seq_t **bams;
649 struct spare_bams *next;
650 } spare_bams;
651
652 typedef struct cram_fd {
653 struct hFILE *fp;
654 int mode; // 'r' or 'w'
655 int version;
656 cram_file_def *file_def;
657 SAM_hdr *header;
658
659 char *prefix;
660 int64_t record_counter;
661 int err;
662
663 // Most recent compression header decoded
664 //cram_block_compression_hdr *comp_hdr;
665 //cram_block_slice_hdr *slice_hdr;
666
667 // Current container being processed.
668 cram_container *ctr;
669
670 // positions for encoding or decoding
671 int first_base, last_base;
672
673 // cached reference portion
674 refs_t *refs; // ref meta-data structure
675 char *ref, *ref_free; // current portion held in memory
676 int ref_id;
677 int ref_start;
678 int ref_end;
679 char *ref_fn; // reference fasta filename
680
681 // compression level and metrics
682 int level;
683 cram_metrics *m[DS_END];
684
685 // options
686 int decode_md; // Whether to export MD and NM tags
687 int verbose;
688 int seqs_per_slice;
689 int slices_per_container;
690 int embed_ref;
691 int no_ref;
692 int ignore_md5;
693 int use_bz2;
694 int use_rans;
695 int use_lzma;
696 int shared_ref;
697 unsigned int required_fields;
698 cram_range range;
699
700 // lookup tables, stored here so we can be trivially multi-threaded
701 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
702 unsigned int cram_flag_swap[0x1000];// bam -> cram flags
703 unsigned char L1[256]; // ACGT{*} ->0123{4}
704 unsigned char L2[256]; // ACGTN{*}->01234{5}
705 char cram_sub_matrix[32][32]; // base substituion codes
706
707 int index_sz;
708 cram_index *index; // array, sizeof index_sz
709 off_t first_container;
710 int eof;
711 int last_slice; // number of recs encoded in last slice
712 int multi_seq;
713 int unsorted;
714 int empty_container; // Marker for EOF block
715
716 // thread pool
717 int own_pool;
718 t_pool *pool;
719 t_results_queue *rqueue;
720 pthread_mutex_t metrics_lock;
721 pthread_mutex_t ref_lock;
722 spare_bams *bl;
723 pthread_mutex_t bam_list_lock;
724 void *job_pending;
725 int ooc; // out of containers.
726 } cram_fd;
727
// Translation of required fields to cram data series.
// One bit per data series; CRAM_ALL is the union of all 31 bits.
enum cram_fields {
    CRAM_BF     = 0x00000001,
    CRAM_AP     = 0x00000002,
    CRAM_FP     = 0x00000004,
    CRAM_RL     = 0x00000008,
    CRAM_DL     = 0x00000010,
    CRAM_NF     = 0x00000020,
    CRAM_BA     = 0x00000040,
    CRAM_QS     = 0x00000080,
    CRAM_FC     = 0x00000100,
    CRAM_FN     = 0x00000200,
    CRAM_BS     = 0x00000400,
    CRAM_IN     = 0x00000800,
    CRAM_RG     = 0x00001000,
    CRAM_MQ     = 0x00002000,
    CRAM_TL     = 0x00004000,
    CRAM_RN     = 0x00008000,
    CRAM_NS     = 0x00010000,
    CRAM_NP     = 0x00020000,
    CRAM_TS     = 0x00040000,
    CRAM_MF     = 0x00080000,
    CRAM_CF     = 0x00100000,
    CRAM_RI     = 0x00200000,
    CRAM_RS     = 0x00400000,
    CRAM_PD     = 0x00800000,
    CRAM_HC     = 0x01000000,
    CRAM_SC     = 0x02000000,
    CRAM_BB     = 0x04000000,
    CRAM_BB_len = 0x08000000,
    CRAM_QQ     = 0x10000000,
    CRAM_QQ_len = 0x20000000,
    CRAM_aux    = 0x40000000,
    CRAM_ALL    = 0x7fffffff,
};
763
// A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
// encode a base difference, but we don't need to know what it is for CIGAR.
// If we have a soft-clip or insertion, we do need SC/IN though to know how
// long that array is.
#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
                    CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)

// Everything in CRAM_CIGAR plus the additional data-series bits listed
// below.
#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \
                  CRAM_RL    | CRAM_AP | CRAM_BB | CRAM_QQ)
773
/* BF bitfields */
/* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
#define CRAM_FPAIRED      256
#define CRAM_FPROPER_PAIR 128
#define CRAM_FUNMAP        64
#define CRAM_FREVERSE      32
#define CRAM_FREAD1        16
#define CRAM_FREAD2         8
#define CRAM_FSECONDARY     4
#define CRAM_FQCFAIL        2
#define CRAM_FDUP           1
785
// Single-character string forms of the aux data-series IDs: each string
// holds one byte whose value equals the matching DS_aux* entry of
// enum cram_DS_ID (1..9), written in octal.
#define DS_aux_S    "\001"
#define DS_aux_OQ_S "\002"
#define DS_aux_BQ_S "\003"
#define DS_aux_BD_S "\004"
#define DS_aux_BI_S "\005"
#define DS_aux_FZ_S "\006"
#define DS_aux_oq_S "\007"
#define DS_aux_os_S "\010"
#define DS_aux_oz_S "\011"
795
// Mate flag bits.
// NOTE(review): presumably the bits of cram_record.mate_flags (MF) -- confirm.
#define CRAM_M_REVERSE 1
#define CRAM_M_UNMAP   2


/* CF bitfields */
#define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
#define CRAM_FLAG_DETACHED             (1<<1)
#define CRAM_FLAG_MATE_DOWNSTREAM      (1<<2)
804
805 #ifdef __cplusplus
806 }
807 #endif
808
809 #endif /* _CRAM_STRUCTS_H_ */