Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bam.h @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 /* The MIT License | |
2 | |
3 Copyright (c) 2008-2010 Genome Research Ltd (GRL). | |
4 | |
5 Permission is hereby granted, free of charge, to any person obtaining | |
6 a copy of this software and associated documentation files (the | |
7 "Software"), to deal in the Software without restriction, including | |
8 without limitation the rights to use, copy, modify, merge, publish, | |
9 distribute, sublicense, and/or sell copies of the Software, and to | |
10 permit persons to whom the Software is furnished to do so, subject to | |
11 the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be | |
14 included in all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
23 SOFTWARE. | |
24 */ | |
25 | |
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */ | |
27 | |
28 #ifndef BAM_BAM_H | |
29 #define BAM_BAM_H | |
30 | |
31 /*! | |
32 @header | |
33 | |
34 BAM library provides I/O and various operations on manipulating files | |
35 in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) | |
36 format. It now supports importing from or exporting to SAM, sorting, | |
37 merging, generating pileup, and quickly retrieval of reads overlapped | |
38 with a specified region. | |
39 | |
40 @copyright Genome Research Ltd. | |
41 */ | |
42 | |
43 #define BAM_VERSION "0.1.19-44428cd" | |
44 | |
45 #include <stdint.h> | |
46 #include <stdlib.h> | |
47 #include <string.h> | |
48 #include <stdio.h> | |
49 | |
50 #ifndef BAM_LITE | |
51 #define BAM_VIRTUAL_OFFSET16 | |
52 #include "bgzf.h" | |
53 /*! @abstract BAM file handler */ | |
54 typedef BGZF *bamFile; | |
55 #define bam_open(fn, mode) bgzf_open(fn, mode) | |
56 #define bam_dopen(fd, mode) bgzf_fdopen(fd, mode) | |
57 #define bam_close(fp) bgzf_close(fp) | |
58 #define bam_read(fp, buf, size) bgzf_read(fp, buf, size) | |
59 #define bam_write(fp, buf, size) bgzf_write(fp, buf, size) | |
60 #define bam_tell(fp) bgzf_tell(fp) | |
61 #define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) | |
62 #else | |
63 #define BAM_TRUE_OFFSET | |
64 #include <zlib.h> | |
65 typedef gzFile bamFile; | |
66 #define bam_open(fn, mode) gzopen(fn, mode) | |
67 #define bam_dopen(fd, mode) gzdopen(fd, mode) | |
68 #define bam_close(fp) gzclose(fp) | |
69 #define bam_read(fp, buf, size) gzread(fp, buf, size) | |
70 /* no bam_write/bam_tell/bam_seek() here */ | |
71 #endif | |
72 | |
73 /*! @typedef | |
74 @abstract Structure for the alignment header. | |
75 @field n_targets number of reference sequences | |
76 @field target_name names of the reference sequences | |
77 @field target_len lengths of the referene sequences | |
78 @field dict header dictionary | |
79 @field hash hash table for fast name lookup | |
80 @field rg2lib hash table for @RG-ID -> LB lookup | |
81 @field l_text length of the plain text in the header | |
82 @field text plain text | |
83 | |
84 @discussion Field hash points to null by default. It is a private | |
85 member. | |
86 */ | |
87 typedef struct { | |
88 int32_t n_targets; | |
89 char **target_name; | |
90 uint32_t *target_len; | |
91 void *dict, *hash, *rg2lib; | |
92 uint32_t l_text, n_text; | |
93 char *text; | |
94 } bam_header_t; | |
95 | |
96 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ | |
97 #define BAM_FPAIRED 1 | |
98 /*! @abstract the read is mapped in a proper pair */ | |
99 #define BAM_FPROPER_PAIR 2 | |
100 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ | |
101 #define BAM_FUNMAP 4 | |
102 /*! @abstract the mate is unmapped */ | |
103 #define BAM_FMUNMAP 8 | |
104 /*! @abstract the read is mapped to the reverse strand */ | |
105 #define BAM_FREVERSE 16 | |
106 /*! @abstract the mate is mapped to the reverse strand */ | |
107 #define BAM_FMREVERSE 32 | |
108 /*! @abstract this is read1 */ | |
109 #define BAM_FREAD1 64 | |
110 /*! @abstract this is read2 */ | |
111 #define BAM_FREAD2 128 | |
112 /*! @abstract not primary alignment */ | |
113 #define BAM_FSECONDARY 256 | |
114 /*! @abstract QC failure */ | |
115 #define BAM_FQCFAIL 512 | |
116 /*! @abstract optical or PCR duplicate */ | |
117 #define BAM_FDUP 1024 | |
118 | |
119 #define BAM_OFDEC 0 | |
120 #define BAM_OFHEX 1 | |
121 #define BAM_OFSTR 2 | |
122 | |
123 /*! @abstract defautl mask for pileup */ | |
124 #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) | |
125 | |
126 #define BAM_CORE_SIZE sizeof(bam1_core_t) | |
127 | |
128 /** | |
129 * Describing how CIGAR operation/length is packed in a 32-bit integer. | |
130 */ | |
131 #define BAM_CIGAR_SHIFT 4 | |
132 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) | |
133 | |
134 /* | |
135 CIGAR operations. | |
136 */ | |
137 /*! @abstract CIGAR: M = match or mismatch*/ | |
138 #define BAM_CMATCH 0 | |
139 /*! @abstract CIGAR: I = insertion to the reference */ | |
140 #define BAM_CINS 1 | |
141 /*! @abstract CIGAR: D = deletion from the reference */ | |
142 #define BAM_CDEL 2 | |
143 /*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ | |
144 #define BAM_CREF_SKIP 3 | |
145 /*! @abstract CIGAR: S = clip on the read with clipped sequence | |
146 present in qseq */ | |
147 #define BAM_CSOFT_CLIP 4 | |
148 /*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ | |
149 #define BAM_CHARD_CLIP 5 | |
150 /*! @abstract CIGAR: P = padding */ | |
151 #define BAM_CPAD 6 | |
152 /*! @abstract CIGAR: equals = match */ | |
153 #define BAM_CEQUAL 7 | |
154 /*! @abstract CIGAR: X = mismatch */ | |
155 #define BAM_CDIFF 8 | |
156 #define BAM_CBACK 9 | |
157 | |
158 #define BAM_CIGAR_STR "MIDNSHP=XB" | |
159 #define BAM_CIGAR_TYPE 0x3C1A7 | |
160 | |
161 #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) | |
162 #define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) | |
163 #define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) | |
164 #define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o)) | |
165 #define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference | |
166 | |
167 /*! @typedef | |
168 @abstract Structure for core alignment information. | |
169 @field tid chromosome ID, defined by bam_header_t | |
170 @field pos 0-based leftmost coordinate | |
171 @field bin bin calculated by bam_reg2bin() | |
172 @field qual mapping quality | |
173 @field l_qname length of the query name | |
174 @field flag bitwise flag | |
175 @field n_cigar number of CIGAR operations | |
176 @field l_qseq length of the query sequence (read) | |
177 */ | |
178 typedef struct { | |
179 int32_t tid; | |
180 int32_t pos; | |
181 uint32_t bin:16, qual:8, l_qname:8; | |
182 uint32_t flag:16, n_cigar:16; | |
183 int32_t l_qseq; | |
184 int32_t mtid; | |
185 int32_t mpos; | |
186 int32_t isize; | |
187 } bam1_core_t; | |
188 | |
189 /*! @typedef | |
190 @abstract Structure for one alignment. | |
191 @field core core information about the alignment | |
192 @field l_aux length of auxiliary data | |
193 @field data_len current length of bam1_t::data | |
194 @field m_data maximum length of bam1_t::data | |
195 @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux | |
196 | |
197 @discussion Notes: | |
198 | |
199 1. qname is zero tailing and core.l_qname includes the tailing '\0'. | |
200 2. l_qseq is calculated from the total length of an alignment block | |
201 on reading or from CIGAR. | |
202 3. cigar data is encoded 4 bytes per CIGAR operation. | |
203 4. seq is nybble-encoded according to bam_nt16_table. | |
204 */ | |
205 typedef struct { | |
206 bam1_core_t core; | |
207 int l_aux, data_len, m_data; | |
208 uint8_t *data; | |
209 } bam1_t; | |
210 | |
211 typedef struct __bam_iter_t *bam_iter_t; | |
212 | |
213 #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) | |
214 #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) | |
215 | |
216 /*! @function | |
217 @abstract Get the CIGAR array | |
218 @param b pointer to an alignment | |
219 @return pointer to the CIGAR array | |
220 | |
221 @discussion In the CIGAR array, each element is a 32-bit integer. The | |
222 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the | |
223 length of a CIGAR. | |
224 */ | |
225 #define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) | |
226 | |
227 /*! @function | |
228 @abstract Get the name of the query | |
229 @param b pointer to an alignment | |
230 @return pointer to the name string, null terminated | |
231 */ | |
232 #define bam1_qname(b) ((char*)((b)->data)) | |
233 | |
234 /*! @function | |
235 @abstract Get query sequence | |
236 @param b pointer to an alignment | |
237 @return pointer to sequence | |
238 | |
239 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, | |
240 8 for T and 15 for N. Two bases are packed in one byte with the base | |
241 at the higher 4 bits having smaller coordinate on the read. It is | |
242 recommended to use bam1_seqi() macro to get the base. | |
243 */ | |
244 #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) | |
245 | |
246 /*! @function | |
247 @abstract Get query quality | |
248 @param b pointer to an alignment | |
249 @return pointer to quality string | |
250 */ | |
251 #define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) | |
252 | |
253 /*! @function | |
254 @abstract Get a base on read | |
255 @param s Query sequence returned by bam1_seq() | |
256 @param i The i-th position, 0-based | |
257 @return 4-bit integer representing the base. | |
258 */ | |
259 //#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) | |
260 #define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) | |
261 | |
262 #define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) ) | |
263 | |
264 /*! @function | |
265 @abstract Get query sequence and quality | |
266 @param b pointer to an alignment | |
267 @return pointer to the concatenated auxiliary data | |
268 */ | |
269 #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) | |
270 | |
271 #ifndef kroundup32 | |
272 /*! @function | |
273 @abstract Round an integer to the next closest power-2 integer. | |
274 @param x integer to be rounded (in place) | |
275 @discussion x will be modified. | |
276 */ | |
277 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
278 #endif | |
279 | |
280 /*! | |
281 @abstract Whether the machine is big-endian; modified only in | |
282 bam_header_init(). | |
283 */ | |
284 extern int bam_is_be; | |
285 | |
286 /*! | |
287 @abstract Verbose level between 0 and 3; 0 is supposed to disable all | |
288 debugging information, though this may not have been implemented. | |
289 */ | |
290 extern int bam_verbose; | |
291 | |
292 extern int bam_no_B; | |
293 | |
294 /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ | |
295 extern unsigned char bam_nt16_table[256]; | |
296 | |
297 /*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */ | |
298 extern char *bam_nt16_rev_table; | |
299 | |
300 extern char bam_nt16_nt4_table[]; | |
301 | |
302 #ifdef __cplusplus | |
303 extern "C" { | |
304 #endif | |
305 | |
306 /********************* | |
307 * Low-level SAM I/O * | |
308 *********************/ | |
309 | |
310 /*! @abstract TAM file handler */ | |
311 typedef struct __tamFile_t *tamFile; | |
312 | |
313 /*! | |
314 @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib. | |
315 @param fn SAM file name | |
316 @return SAM file handler | |
317 */ | |
318 tamFile sam_open(const char *fn); | |
319 | |
320 /*! | |
321 @abstract Close a SAM file handler | |
322 @param fp SAM file handler | |
323 */ | |
324 void sam_close(tamFile fp); | |
325 | |
326 /*! | |
327 @abstract Read one alignment from a SAM file handler | |
328 @param fp SAM file handler | |
329 @param header header information (ordered names of chromosomes) | |
330 @param b read alignment; all members in b will be updated | |
331 @return 0 if successful; otherwise negative | |
332 */ | |
333 int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); | |
334 | |
335 /*! | |
336 @abstract Read header information from a TAB-delimited list file. | |
337 @param fn_list file name for the list | |
338 @return a pointer to the header structure | |
339 | |
340 @discussion Each line in this file consists of chromosome name and | |
341 the length of chromosome. | |
342 */ | |
343 bam_header_t *sam_header_read2(const char *fn_list); | |
344 | |
345 /*! | |
346 @abstract Read header from a SAM file (if present) | |
347 @param fp SAM file handler | |
348 @return pointer to header struct; 0 if no @SQ lines available | |
349 */ | |
350 bam_header_t *sam_header_read(tamFile fp); | |
351 | |
352 /*! | |
353 @abstract Parse @SQ lines a update a header struct | |
354 @param h pointer to the header struct to be updated | |
355 @return number of target sequences | |
356 | |
357 @discussion bam_header_t::{n_targets,target_len,target_name} will | |
358 be destroyed in the first place. | |
359 */ | |
360 int sam_header_parse(bam_header_t *h); | |
361 int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); | |
362 | |
363 /*! | |
364 @abstract Parse @RG lines a update a header struct | |
365 @param h pointer to the header struct to be updated | |
366 @return number of @RG lines | |
367 | |
368 @discussion bam_header_t::rg2lib will be destroyed in the first | |
369 place. | |
370 */ | |
371 int sam_header_parse_rg(bam_header_t *h); | |
372 | |
373 #define sam_write1(header, b) bam_view1(header, b) | |
374 | |
375 | |
376 /******************************** | |
377 * APIs for string dictionaries * | |
378 ********************************/ | |
379 | |
380 int bam_strmap_put(void *strmap, const char *rg, const char *lib); | |
381 const char *bam_strmap_get(const void *strmap, const char *rg); | |
382 void *bam_strmap_dup(const void*); | |
383 void *bam_strmap_init(); | |
384 void bam_strmap_destroy(void *strmap); | |
385 | |
386 | |
387 /********************* | |
388 * Low-level BAM I/O * | |
389 *********************/ | |
390 | |
391 /*! | |
392 @abstract Initialize a header structure. | |
393 @return the pointer to the header structure | |
394 | |
395 @discussion This function also modifies the global variable | |
396 bam_is_be. | |
397 */ | |
398 bam_header_t *bam_header_init(); | |
399 | |
400 /*! | |
401 @abstract Destroy a header structure. | |
402 @param header pointer to the header | |
403 */ | |
404 void bam_header_destroy(bam_header_t *header); | |
405 | |
406 /*! | |
407 @abstract Read a header structure from BAM. | |
408 @param fp BAM file handler, opened by bam_open() | |
409 @return pointer to the header structure | |
410 | |
411 @discussion The file position indicator must be placed at the | |
412 beginning of the file. Upon success, the position indicator will | |
413 be set at the start of the first alignment. | |
414 */ | |
415 bam_header_t *bam_header_read(bamFile fp); | |
416 | |
417 /*! | |
418 @abstract Write a header structure to BAM. | |
419 @param fp BAM file handler | |
420 @param header pointer to the header structure | |
421 @return always 0 currently | |
422 */ | |
423 int bam_header_write(bamFile fp, const bam_header_t *header); | |
424 | |
425 /*! | |
426 @abstract Read an alignment from BAM. | |
427 @param fp BAM file handler | |
428 @param b read alignment; all members are updated. | |
429 @return number of bytes read from the file | |
430 | |
431 @discussion The file position indicator must be | |
432 placed right before an alignment. Upon success, this function | |
433 will set the position indicator to the start of the next | |
434 alignment. This function is not affected by the machine | |
435 endianness. | |
436 */ | |
437 int bam_read1(bamFile fp, bam1_t *b); | |
438 | |
439 int bam_remove_B(bam1_t *b); | |
440 | |
441 /*! | |
442 @abstract Write an alignment to BAM. | |
443 @param fp BAM file handler | |
444 @param c pointer to the bam1_core_t structure | |
445 @param data_len total length of variable size data related to | |
446 the alignment | |
447 @param data pointer to the concatenated data | |
448 @return number of bytes written to the file | |
449 | |
450 @discussion This function is not affected by the machine | |
451 endianness. | |
452 */ | |
453 int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data); | |
454 | |
455 /*! | |
456 @abstract Write an alignment to BAM. | |
457 @param fp BAM file handler | |
458 @param b alignment to write | |
459 @return number of bytes written to the file | |
460 | |
461 @abstract It is equivalent to: | |
462 bam_write1_core(fp, &b->core, b->data_len, b->data) | |
463 */ | |
464 int bam_write1(bamFile fp, const bam1_t *b); | |
465 | |
466 /*! @function | |
467 @abstract Initiate a pointer to bam1_t struct | |
468 */ | |
469 #define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) | |
470 | |
471 /*! @function | |
472 @abstract Free the memory allocated for an alignment. | |
473 @param b pointer to an alignment | |
474 */ | |
475 #define bam_destroy1(b) do { \ | |
476 if (b) { free((b)->data); free(b); } \ | |
477 } while (0) | |
478 | |
479 /*! | |
480 @abstract Format a BAM record in the SAM format | |
481 @param header pointer to the header structure | |
482 @param b alignment to print | |
483 @return a pointer to the SAM string | |
484 */ | |
485 char *bam_format1(const bam_header_t *header, const bam1_t *b); | |
486 | |
487 char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of); | |
488 | |
489 /*! | |
490 @abstract Check whether a BAM record is plausibly valid | |
491 @param header associated header structure, or NULL if unavailable | |
492 @param b alignment to validate | |
493 @return 0 if the alignment is invalid; non-zero otherwise | |
494 | |
495 @discussion Simple consistency check of some of the fields of the | |
496 alignment record. If the header is provided, several additional checks | |
497 are made. Not all fields are checked, so a non-zero result is not a | |
498 guarantee that the record is valid. However it is usually good enough | |
499 to detect when bam_seek() has been called with a virtual file offset | |
500 that is not the offset of an alignment record. | |
501 */ | |
502 int bam_validate1(const bam_header_t *header, const bam1_t *b); | |
503 | |
504 const char *bam_get_library(bam_header_t *header, const bam1_t *b); | |
505 | |
506 | |
507 /*************** | |
508 * pileup APIs * | |
509 ***************/ | |
510 | |
511 /*! @typedef | |
512 @abstract Structure for one alignment covering the pileup position. | |
513 @field b pointer to the alignment | |
514 @field qpos position of the read base at the pileup site, 0-based | |
515 @field indel indel length; 0 for no indel, positive for ins and negative for del | |
516 @field is_del 1 iff the base on the padded read is a deletion | |
517 @field level the level of the read in the "viewer" mode | |
518 | |
519 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The | |
520 difference between the two functions is that the former does not | |
521 set bam_pileup1_t::level, while the later does. Level helps the | |
522 implementation of alignment viewers, but calculating this has some | |
523 overhead. | |
524 */ | |
525 typedef struct { | |
526 bam1_t *b; | |
527 int32_t qpos; | |
528 int indel, level; | |
529 uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; | |
530 } bam_pileup1_t; | |
531 | |
532 typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); | |
533 | |
534 struct __bam_plp_t; | |
535 typedef struct __bam_plp_t *bam_plp_t; | |
536 | |
537 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); | |
538 int bam_plp_push(bam_plp_t iter, const bam1_t *b); | |
539 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); | |
540 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); | |
541 void bam_plp_set_mask(bam_plp_t iter, int mask); | |
542 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); | |
543 void bam_plp_reset(bam_plp_t iter); | |
544 void bam_plp_destroy(bam_plp_t iter); | |
545 | |
546 struct __bam_mplp_t; | |
547 typedef struct __bam_mplp_t *bam_mplp_t; | |
548 | |
549 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); | |
550 void bam_mplp_destroy(bam_mplp_t iter); | |
551 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); | |
552 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); | |
553 | |
554 /*! @typedef | |
555 @abstract Type of function to be called by bam_plbuf_push(). | |
556 @param tid chromosome ID as is defined in the header | |
557 @param pos start coordinate of the alignment, 0-based | |
558 @param n number of elements in pl array | |
559 @param pl array of alignments | |
560 @param data user provided data | |
561 @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t. | |
562 */ | |
563 typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); | |
564 | |
565 typedef struct { | |
566 bam_plp_t iter; | |
567 bam_pileup_f func; | |
568 void *data; | |
569 } bam_plbuf_t; | |
570 | |
571 void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); | |
572 void bam_plbuf_reset(bam_plbuf_t *buf); | |
573 bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); | |
574 void bam_plbuf_destroy(bam_plbuf_t *buf); | |
575 int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); | |
576 | |
577 int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); | |
578 | |
579 struct __bam_lplbuf_t; | |
580 typedef struct __bam_lplbuf_t bam_lplbuf_t; | |
581 | |
582 void bam_lplbuf_reset(bam_lplbuf_t *buf); | |
583 | |
584 /*! @abstract bam_plbuf_init() equivalent with level calculated. */ | |
585 bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data); | |
586 | |
587 /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */ | |
588 void bam_lplbuf_destroy(bam_lplbuf_t *tv); | |
589 | |
590 /*! @abstract bam_plbuf_push() equivalent with level calculated. */ | |
591 int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); | |
592 | |
593 | |
594 /********************* | |
595 * BAM indexing APIs * | |
596 *********************/ | |
597 | |
598 struct __bam_index_t; | |
599 typedef struct __bam_index_t bam_index_t; | |
600 | |
601 /*! | |
602 @abstract Build index for a BAM file. | |
603 @discussion Index file "fn.bai" will be created. | |
604 @param fn name of the BAM file | |
605 @return always 0 currently | |
606 */ | |
607 int bam_index_build(const char *fn); | |
608 | |
609 /*! | |
610 @abstract Load index from file "fn.bai". | |
611 @param fn name of the BAM file (NOT the index file) | |
612 @return pointer to the index structure | |
613 */ | |
614 bam_index_t *bam_index_load(const char *fn); | |
615 | |
616 /*! | |
617 @abstract Destroy an index structure. | |
618 @param idx pointer to the index structure | |
619 */ | |
620 void bam_index_destroy(bam_index_t *idx); | |
621 | |
622 /*! @typedef | |
623 @abstract Type of function to be called by bam_fetch(). | |
624 @param b the alignment | |
625 @param data user provided data | |
626 */ | |
627 typedef int (*bam_fetch_f)(const bam1_t *b, void *data); | |
628 | |
629 /*! | |
630 @abstract Retrieve the alignments that are overlapped with the | |
631 specified region. | |
632 | |
633 @discussion A user defined function will be called for each | |
634 retrieved alignment ordered by its start position. | |
635 | |
636 @param fp BAM file handler | |
637 @param idx pointer to the alignment index | |
638 @param tid chromosome ID as is defined in the header | |
639 @param beg start coordinate, 0-based | |
640 @param end end coordinate, 0-based | |
641 @param data user provided data (will be transferred to func) | |
642 @param func user defined function | |
643 */ | |
644 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); | |
645 | |
646 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end); | |
647 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b); | |
648 void bam_iter_destroy(bam_iter_t iter); | |
649 | |
650 /*! | |
651 @abstract Parse a region in the format: "chr2:100,000-200,000". | |
652 @discussion bam_header_t::hash will be initialized if empty. | |
653 @param header pointer to the header structure | |
654 @param str string to be parsed | |
655 @param ref_id the returned chromosome ID | |
656 @param begin the returned start coordinate | |
657 @param end the returned end coordinate | |
658 @return 0 on success; -1 on failure | |
659 */ | |
660 int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); | |
661 | |
662 | |
663 /************************** | |
664 * APIs for optional tags * | |
665 **************************/ | |
666 | |
667 /*! | |
668 @abstract Retrieve data of a tag | |
669 @param b pointer to an alignment struct | |
670 @param tag two-character tag to be retrieved | |
671 | |
672 @return pointer to the type and data. The first character is the | |
673 type that can be 'iIsScCdfAZH'. | |
674 | |
675 @discussion Use bam_aux2?() series to convert the returned data to | |
676 the corresponding type. | |
677 */ | |
678 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); | |
679 | |
680 int32_t bam_aux2i(const uint8_t *s); | |
681 float bam_aux2f(const uint8_t *s); | |
682 double bam_aux2d(const uint8_t *s); | |
683 char bam_aux2A(const uint8_t *s); | |
684 char *bam_aux2Z(const uint8_t *s); | |
685 | |
686 int bam_aux_del(bam1_t *b, uint8_t *s); | |
687 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); | |
688 uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() | |
689 | |
690 | |
691 /***************** | |
692 * Miscellaneous * | |
693 *****************/ | |
694 | |
695 /*! | |
696 @abstract Calculate the rightmost coordinate of an alignment on the | |
697 reference genome. | |
698 | |
699 @param c pointer to the bam1_core_t structure | |
700 @param cigar the corresponding CIGAR array (from bam1_t::cigar) | |
701 @return the rightmost coordinate, 0-based | |
702 */ | |
703 uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar); | |
704 | |
705 /*! | |
706 @abstract Calculate the length of the query sequence from CIGAR. | |
707 @param c pointer to the bam1_core_t structure | |
708 @param cigar the corresponding CIGAR array (from bam1_t::cigar) | |
709 @return length of the query sequence | |
710 */ | |
711 int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar); | |
712 | |
713 #ifdef __cplusplus | |
714 } | |
715 #endif | |
716 | |
717 /*! | |
718 @abstract Calculate the minimum bin that contains a region [beg,end). | |
719 @param beg start of the region, 0-based | |
720 @param end end of the region, 0-based | |
721 @return bin | |
722 */ | |
723 static inline int bam_reg2bin(uint32_t beg, uint32_t end) | |
724 { | |
725 --end; | |
726 if (beg>>14 == end>>14) return 4681 + (beg>>14); | |
727 if (beg>>17 == end>>17) return 585 + (beg>>17); | |
728 if (beg>>20 == end>>20) return 73 + (beg>>20); | |
729 if (beg>>23 == end>>23) return 9 + (beg>>23); | |
730 if (beg>>26 == end>>26) return 1 + (beg>>26); | |
731 return 0; | |
732 } | |
733 | |
734 /*! | |
735 @abstract Copy an alignment | |
736 @param bdst destination alignment struct | |
737 @param bsrc source alignment struct | |
738 @return pointer to the destination alignment struct | |
739 */ | |
740 static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) | |
741 { | |
742 uint8_t *data = bdst->data; | |
743 int m_data = bdst->m_data; // backup data and m_data | |
744 if (m_data < bsrc->data_len) { // double the capacity | |
745 m_data = bsrc->data_len; kroundup32(m_data); | |
746 data = (uint8_t*)realloc(data, m_data); | |
747 } | |
748 memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data | |
749 *bdst = *bsrc; // copy the rest | |
750 // restore the backup | |
751 bdst->m_data = m_data; | |
752 bdst->data = data; | |
753 return bdst; | |
754 } | |
755 | |
756 /*! | |
757 @abstract Duplicate an alignment | |
758 @param src source alignment struct | |
759 @return pointer to the destination alignment struct | |
760 */ | |
761 static inline bam1_t *bam_dup1(const bam1_t *src) | |
762 { | |
763 bam1_t *b; | |
764 b = bam_init1(); | |
765 *b = *src; | |
766 b->m_data = b->data_len; | |
767 b->data = (uint8_t*)calloc(b->data_len, 1); | |
768 memcpy(b->data, src->data, b->data_len); | |
769 return b; | |
770 } | |
771 | |
772 static inline int bam_aux_type2size(int x) | |
773 { | |
774 if (x == 'C' || x == 'c' || x == 'A') return 1; | |
775 else if (x == 'S' || x == 's') return 2; | |
776 else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4; | |
777 else return 0; | |
778 } | |
779 | |
780 /********************************* | |
781 *** Compatibility with htslib *** | |
782 *********************************/ | |
783 | |
784 typedef bam_header_t bam_hdr_t; | |
785 | |
786 #define bam_get_qname(b) bam1_qname(b) | |
787 #define bam_get_cigar(b) bam1_cigar(b) | |
788 | |
789 #define bam_hdr_read(fp) bam_header_read(fp) | |
790 #define bam_hdr_write(fp, h) bam_header_write(fp, h) | |
791 #define bam_hdr_destroy(fp) bam_header_destroy(fp) | |
792 | |
793 #endif |