0
|
1 /* The MIT License
|
|
2
|
|
3 Copyright (c) 2008-2010 Genome Research Ltd (GRL).
|
|
4
|
|
5 Permission is hereby granted, free of charge, to any person obtaining
|
|
6 a copy of this software and associated documentation files (the
|
|
7 "Software"), to deal in the Software without restriction, including
|
|
8 without limitation the rights to use, copy, modify, merge, publish,
|
|
9 distribute, sublicense, and/or sell copies of the Software, and to
|
|
10 permit persons to whom the Software is furnished to do so, subject to
|
|
11 the following conditions:
|
|
12
|
|
13 The above copyright notice and this permission notice shall be
|
|
14 included in all copies or substantial portions of the Software.
|
|
15
|
|
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23 SOFTWARE.
|
|
24 */
|
|
25
|
|
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
|
|
27
|
|
28 #ifndef BAM_BAM_H
|
|
29 #define BAM_BAM_H
|
|
30
|
|
31 /*!
|
|
32 @header
|
|
33
|
|
34 BAM library provides I/O and various operations on manipulating files
|
|
35 in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
|
|
36 format. It now supports importing from or exporting to SAM, sorting,
|
|
37 merging, generating pileup, and quickly retrieval of reads overlapped
|
|
38 with a specified region.
|
|
39
|
|
40 @copyright Genome Research Ltd.
|
|
41 */
|
|
42
|
|
43 #define BAM_VERSION "0.1.16 (r963:234)"
|
|
44
|
|
45 #include <stdint.h>
|
|
46 #include <stdlib.h>
|
|
47 #include <string.h>
|
|
48 #include <stdio.h>
|
|
49
|
|
50 #ifndef BAM_LITE
|
|
51 #define BAM_VIRTUAL_OFFSET16
|
|
52 #include "bgzf.h"
|
|
53 /*! @abstract BAM file handler */
|
|
54 typedef BGZF *bamFile;
|
|
55 #define bam_open(fn, mode) bgzf_open(fn, mode)
|
|
56 #define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
|
|
57 #define bam_close(fp) bgzf_close(fp)
|
|
58 #define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
|
|
59 #define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
|
|
60 #define bam_tell(fp) bgzf_tell(fp)
|
|
61 #define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
|
|
62 #else
|
|
63 #define BAM_TRUE_OFFSET
|
|
64 #include <zlib.h>
|
|
65 typedef gzFile bamFile;
|
|
66 #define bam_open(fn, mode) gzopen(fn, mode)
|
|
67 #define bam_dopen(fd, mode) gzdopen(fd, mode)
|
|
68 #define bam_close(fp) gzclose(fp)
|
|
69 #define bam_read(fp, buf, size) gzread(fp, buf, size)
|
|
70 /* no bam_write/bam_tell/bam_seek() here */
|
|
71 #endif
|
|
72
|
|
73 /*! @typedef
|
|
74 @abstract Structure for the alignment header.
|
|
75 @field n_targets number of reference sequences
|
|
76 @field target_name names of the reference sequences
|
|
77 @field target_len lengths of the referene sequences
|
|
78 @field dict header dictionary
|
|
79 @field hash hash table for fast name lookup
|
|
80 @field rg2lib hash table for @RG-ID -> LB lookup
|
|
81 @field l_text length of the plain text in the header
|
|
82 @field text plain text
|
|
83
|
|
84 @discussion Field hash points to null by default. It is a private
|
|
85 member.
|
|
86 */
|
|
87 typedef struct {
|
|
88 int32_t n_targets;
|
|
89 char **target_name;
|
|
90 uint32_t *target_len;
|
|
91 void *dict, *hash, *rg2lib;
|
|
92 size_t l_text, n_text;
|
|
93 char *text;
|
|
94 } bam_header_t;
|
|
95
|
|
96 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
|
|
97 #define BAM_FPAIRED 1
|
|
98 /*! @abstract the read is mapped in a proper pair */
|
|
99 #define BAM_FPROPER_PAIR 2
|
|
100 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
|
|
101 #define BAM_FUNMAP 4
|
|
102 /*! @abstract the mate is unmapped */
|
|
103 #define BAM_FMUNMAP 8
|
|
104 /*! @abstract the read is mapped to the reverse strand */
|
|
105 #define BAM_FREVERSE 16
|
|
106 /*! @abstract the mate is mapped to the reverse strand */
|
|
107 #define BAM_FMREVERSE 32
|
|
108 /*! @abstract this is read1 */
|
|
109 #define BAM_FREAD1 64
|
|
110 /*! @abstract this is read2 */
|
|
111 #define BAM_FREAD2 128
|
|
112 /*! @abstract not primary alignment */
|
|
113 #define BAM_FSECONDARY 256
|
|
114 /*! @abstract QC failure */
|
|
115 #define BAM_FQCFAIL 512
|
|
116 /*! @abstract optical or PCR duplicate */
|
|
117 #define BAM_FDUP 1024
|
|
118
|
|
119 #define BAM_OFDEC 0
|
|
120 #define BAM_OFHEX 1
|
|
121 #define BAM_OFSTR 2
|
|
122
|
|
123 /*! @abstract defautl mask for pileup */
|
|
124 #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
|
|
125
|
|
126 #define BAM_CORE_SIZE sizeof(bam1_core_t)
|
|
127
|
|
128 /**
|
|
129 * Describing how CIGAR operation/length is packed in a 32-bit integer.
|
|
130 */
|
|
131 #define BAM_CIGAR_SHIFT 4
|
|
132 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
|
|
133
|
|
134 /*
|
|
135 CIGAR operations.
|
|
136 */
|
|
137 /*! @abstract CIGAR: match */
|
|
138 #define BAM_CMATCH 0
|
|
139 /*! @abstract CIGAR: insertion to the reference */
|
|
140 #define BAM_CINS 1
|
|
141 /*! @abstract CIGAR: deletion from the reference */
|
|
142 #define BAM_CDEL 2
|
|
143 /*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
|
|
144 #define BAM_CREF_SKIP 3
|
|
145 /*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
|
|
146 #define BAM_CSOFT_CLIP 4
|
|
147 /*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
|
|
148 #define BAM_CHARD_CLIP 5
|
|
149 /*! @abstract CIGAR: padding */
|
|
150 #define BAM_CPAD 6
|
|
151
|
|
152 /*! @typedef
|
|
153 @abstract Structure for core alignment information.
|
|
154 @field tid chromosome ID, defined by bam_header_t
|
|
155 @field pos 0-based leftmost coordinate
|
|
156 @field strand strand; 0 for forward and 1 otherwise
|
|
157 @field bin bin calculated by bam_reg2bin()
|
|
158 @field qual mapping quality
|
|
159 @field l_qname length of the query name
|
|
160 @field flag bitwise flag
|
|
161 @field n_cigar number of CIGAR operations
|
|
162 @field l_qseq length of the query sequence (read)
|
|
163 */
|
|
164 typedef struct {
|
|
165 int32_t tid;
|
|
166 int32_t pos;
|
|
167 uint32_t bin:16, qual:8, l_qname:8;
|
|
168 uint32_t flag:16, n_cigar:16;
|
|
169 int32_t l_qseq;
|
|
170 int32_t mtid;
|
|
171 int32_t mpos;
|
|
172 int32_t isize;
|
|
173 } bam1_core_t;
|
|
174
|
|
175 /*! @typedef
|
|
176 @abstract Structure for one alignment.
|
|
177 @field core core information about the alignment
|
|
178 @field l_aux length of auxiliary data
|
|
179 @field data_len current length of bam1_t::data
|
|
180 @field m_data maximum length of bam1_t::data
|
|
181 @field data all variable-length data, concatenated; structure: cigar-qname-seq-qual-aux
|
|
182
|
|
183 @discussion Notes:
|
|
184
|
|
185 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
|
|
186 2. l_qseq is calculated from the total length of an alignment block
|
|
187 on reading or from CIGAR.
|
|
188 */
|
|
189 typedef struct {
|
|
190 bam1_core_t core;
|
|
191 int l_aux, data_len, m_data;
|
|
192 uint8_t *data;
|
|
193 } bam1_t;
|
|
194
|
|
195 typedef struct __bam_iter_t *bam_iter_t;
|
|
196
|
|
197 #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
|
|
198 #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
|
|
199
|
|
200 /*! @function
|
|
201 @abstract Get the CIGAR array
|
|
202 @param b pointer to an alignment
|
|
203 @return pointer to the CIGAR array
|
|
204
|
|
205 @discussion In the CIGAR array, each element is a 32-bit integer. The
|
|
206 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
|
|
207 length of a CIGAR.
|
|
208 */
|
|
209 #define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
|
|
210
|
|
211 /*! @function
|
|
212 @abstract Get the name of the query
|
|
213 @param b pointer to an alignment
|
|
214 @return pointer to the name string, null terminated
|
|
215 */
|
|
216 #define bam1_qname(b) ((char*)((b)->data))
|
|
217
|
|
218 /*! @function
|
|
219 @abstract Get query sequence
|
|
220 @param b pointer to an alignment
|
|
221 @return pointer to sequence
|
|
222
|
|
223 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
|
|
224 8 for T and 15 for N. Two bases are packed in one byte with the base
|
|
225 at the higher 4 bits having smaller coordinate on the read. It is
|
|
226 recommended to use bam1_seqi() macro to get the base.
|
|
227 */
|
|
228 #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
|
|
229
|
|
230 /*! @function
|
|
231 @abstract Get query quality
|
|
232 @param b pointer to an alignment
|
|
233 @return pointer to quality string
|
|
234 */
|
|
235 #define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
|
|
236
|
|
237 /*! @function
|
|
238 @abstract Get a base on read
|
|
239 @param s Query sequence returned by bam1_seq()
|
|
240 @param i The i-th position, 0-based
|
|
241 @return 4-bit integer representing the base.
|
|
242 */
|
|
243 #define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
|
|
244
|
|
245 /*! @function
|
|
246 @abstract Get query sequence and quality
|
|
247 @param b pointer to an alignment
|
|
248 @return pointer to the concatenated auxiliary data
|
|
249 */
|
|
250 #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
|
|
251
|
|
252 #ifndef kroundup32
|
|
253 /*! @function
|
|
254 @abstract Round an integer to the next closest power-2 integer.
|
|
255 @param x integer to be rounded (in place)
|
|
256 @discussion x will be modified.
|
|
257 */
|
|
258 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
|
259 #endif
|
|
260
|
|
261 /*!
|
|
262 @abstract Whether the machine is big-endian; modified only in
|
|
263 bam_header_init().
|
|
264 */
|
|
265 extern int bam_is_be;
|
|
266
|
|
267 /*!
|
|
268 @abstract Verbose level between 0 and 3; 0 is supposed to disable all
|
|
269 debugging information, though this may not have been implemented.
|
|
270 */
|
|
271 extern int bam_verbose;
|
|
272
|
|
273 /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
|
|
274 extern unsigned char bam_nt16_table[256];
|
|
275
|
|
276 /*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
|
|
277 extern char *bam_nt16_rev_table;
|
|
278
|
|
279 extern char bam_nt16_nt4_table[];
|
|
280
|
|
281 #ifdef __cplusplus
|
|
282 extern "C" {
|
|
283 #endif
|
|
284
|
|
285 /*********************
|
|
286 * Low-level SAM I/O *
|
|
287 *********************/
|
|
288
|
|
289 /*! @abstract TAM file handler */
|
|
290 typedef struct __tamFile_t *tamFile;
|
|
291
|
|
292 /*!
|
|
293 @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
|
|
294 @param fn SAM file name
|
|
295 @return SAM file handler
|
|
296 */
|
|
297 tamFile sam_open(const char *fn);
|
|
298
|
|
299 /*!
|
|
300 @abstract Close a SAM file handler
|
|
301 @param fp SAM file handler
|
|
302 */
|
|
303 void sam_close(tamFile fp);
|
|
304
|
|
305 /*!
|
|
306 @abstract Read one alignment from a SAM file handler
|
|
307 @param fp SAM file handler
|
|
308 @param header header information (ordered names of chromosomes)
|
|
309 @param b read alignment; all members in b will be updated
|
|
310 @return 0 if successful; otherwise negative
|
|
311 */
|
|
312 int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
|
|
313
|
|
314 /*!
|
|
315 @abstract Read header information from a TAB-delimited list file.
|
|
316 @param fn_list file name for the list
|
|
317 @return a pointer to the header structure
|
|
318
|
|
319 @discussion Each line in this file consists of chromosome name and
|
|
320 the length of chromosome.
|
|
321 */
|
|
322 bam_header_t *sam_header_read2(const char *fn_list);
|
|
323
|
|
324 /*!
|
|
325 @abstract Read header from a SAM file (if present)
|
|
326 @param fp SAM file handler
|
|
327 @return pointer to header struct; 0 if no @SQ lines available
|
|
328 */
|
|
329 bam_header_t *sam_header_read(tamFile fp);
|
|
330
|
|
331 /*!
|
|
332 @abstract Parse @SQ lines a update a header struct
|
|
333 @param h pointer to the header struct to be updated
|
|
334 @return number of target sequences
|
|
335
|
|
336 @discussion bam_header_t::{n_targets,target_len,target_name} will
|
|
337 be destroyed in the first place.
|
|
338 */
|
|
339 int sam_header_parse(bam_header_t *h);
|
|
340 int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
|
|
341
|
|
342 /*!
|
|
343 @abstract Parse @RG lines a update a header struct
|
|
344 @param h pointer to the header struct to be updated
|
|
345 @return number of @RG lines
|
|
346
|
|
347 @discussion bam_header_t::rg2lib will be destroyed in the first
|
|
348 place.
|
|
349 */
|
|
350 int sam_header_parse_rg(bam_header_t *h);
|
|
351
|
|
352 #define sam_write1(header, b) bam_view1(header, b)
|
|
353
|
|
354
|
|
355 /********************************
|
|
356 * APIs for string dictionaries *
|
|
357 ********************************/
|
|
358
|
|
359 int bam_strmap_put(void *strmap, const char *rg, const char *lib);
|
|
360 const char *bam_strmap_get(const void *strmap, const char *rg);
|
|
361 void *bam_strmap_dup(const void*);
|
|
362 void *bam_strmap_init();
|
|
363 void bam_strmap_destroy(void *strmap);
|
|
364
|
|
365
|
|
366 /*********************
|
|
367 * Low-level BAM I/O *
|
|
368 *********************/
|
|
369
|
|
370 /*!
|
|
371 @abstract Initialize a header structure.
|
|
372 @return the pointer to the header structure
|
|
373
|
|
374 @discussion This function also modifies the global variable
|
|
375 bam_is_be.
|
|
376 */
|
|
377 bam_header_t *bam_header_init();
|
|
378
|
|
379 /*!
|
|
380 @abstract Destroy a header structure.
|
|
381 @param header pointer to the header
|
|
382 */
|
|
383 void bam_header_destroy(bam_header_t *header);
|
|
384
|
|
385 /*!
|
|
386 @abstract Read a header structure from BAM.
|
|
387 @param fp BAM file handler, opened by bam_open()
|
|
388 @return pointer to the header structure
|
|
389
|
|
390 @discussion The file position indicator must be placed at the
|
|
391 beginning of the file. Upon success, the position indicator will
|
|
392 be set at the start of the first alignment.
|
|
393 */
|
|
394 bam_header_t *bam_header_read(bamFile fp);
|
|
395
|
|
396 /*!
|
|
397 @abstract Write a header structure to BAM.
|
|
398 @param fp BAM file handler
|
|
399 @param header pointer to the header structure
|
|
400 @return always 0 currently
|
|
401 */
|
|
402 int bam_header_write(bamFile fp, const bam_header_t *header);
|
|
403
|
|
404 /*!
|
|
405 @abstract Read an alignment from BAM.
|
|
406 @param fp BAM file handler
|
|
407 @param b read alignment; all members are updated.
|
|
408 @return number of bytes read from the file
|
|
409
|
|
410 @discussion The file position indicator must be
|
|
411 placed right before an alignment. Upon success, this function
|
|
412 will set the position indicator to the start of the next
|
|
413 alignment. This function is not affected by the machine
|
|
414 endianness.
|
|
415 */
|
|
416 int bam_read1(bamFile fp, bam1_t *b);
|
|
417
|
|
418 /*!
|
|
419 @abstract Write an alignment to BAM.
|
|
420 @param fp BAM file handler
|
|
421 @param c pointer to the bam1_core_t structure
|
|
422 @param data_len total length of variable size data related to
|
|
423 the alignment
|
|
424 @param data pointer to the concatenated data
|
|
425 @return number of bytes written to the file
|
|
426
|
|
427 @discussion This function is not affected by the machine
|
|
428 endianness.
|
|
429 */
|
|
430 int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
|
|
431
|
|
432 /*!
|
|
433 @abstract Write an alignment to BAM.
|
|
434 @param fp BAM file handler
|
|
435 @param b alignment to write
|
|
436 @return number of bytes written to the file
|
|
437
|
|
438 @abstract It is equivalent to:
|
|
439 bam_write1_core(fp, &b->core, b->data_len, b->data)
|
|
440 */
|
|
441 int bam_write1(bamFile fp, const bam1_t *b);
|
|
442
|
|
443 /*! @function
|
|
444 @abstract Initiate a pointer to bam1_t struct
|
|
445 */
|
|
446 #define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
|
|
447
|
|
448 /*! @function
|
|
449 @abstract Free the memory allocated for an alignment.
|
|
450 @param b pointer to an alignment
|
|
451 */
|
|
452 #define bam_destroy1(b) do { \
|
|
453 if (b) { free((b)->data); free(b); } \
|
|
454 } while (0)
|
|
455
|
|
456 /*!
|
|
457 @abstract Format a BAM record in the SAM format
|
|
458 @param header pointer to the header structure
|
|
459 @param b alignment to print
|
|
460 @return a pointer to the SAM string
|
|
461 */
|
|
462 char *bam_format1(const bam_header_t *header, const bam1_t *b);
|
|
463
|
|
464 char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
|
|
465
|
|
466 /*!
|
|
467 @abstract Check whether a BAM record is plausibly valid
|
|
468 @param header associated header structure, or NULL if unavailable
|
|
469 @param b alignment to validate
|
|
470 @return 0 if the alignment is invalid; non-zero otherwise
|
|
471
|
|
472 @discussion Simple consistency check of some of the fields of the
|
|
473 alignment record. If the header is provided, several additional checks
|
|
474 are made. Not all fields are checked, so a non-zero result is not a
|
|
475 guarantee that the record is valid. However it is usually good enough
|
|
476 to detect when bam_seek() has been called with a virtual file offset
|
|
477 that is not the offset of an alignment record.
|
|
478 */
|
|
479 int bam_validate1(const bam_header_t *header, const bam1_t *b);
|
|
480
|
|
481 const char *bam_get_library(bam_header_t *header, const bam1_t *b);
|
|
482
|
|
483
|
|
484 /***************
|
|
485 * pileup APIs *
|
|
486 ***************/
|
|
487
|
|
488 /*! @typedef
|
|
489 @abstract Structure for one alignment covering the pileup position.
|
|
490 @field b pointer to the alignment
|
|
491 @field qpos position of the read base at the pileup site, 0-based
|
|
492 @field indel indel length; 0 for no indel, positive for ins and negative for del
|
|
493 @field is_del 1 iff the base on the padded read is a deletion
|
|
494 @field level the level of the read in the "viewer" mode
|
|
495
|
|
496 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
|
|
497 difference between the two functions is that the former does not
|
|
498 set bam_pileup1_t::level, while the later does. Level helps the
|
|
499 implementation of alignment viewers, but calculating this has some
|
|
500 overhead.
|
|
501 */
|
|
502 typedef struct {
|
|
503 bam1_t *b;
|
|
504 int32_t qpos;
|
|
505 int indel, level;
|
|
506 uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28;
|
|
507 } bam_pileup1_t;
|
|
508
|
|
509 typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
|
|
510
|
|
511 struct __bam_plp_t;
|
|
512 typedef struct __bam_plp_t *bam_plp_t;
|
|
513
|
|
514 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
|
|
515 int bam_plp_push(bam_plp_t iter, const bam1_t *b);
|
|
516 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
|
|
517 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
|
|
518 void bam_plp_set_mask(bam_plp_t iter, int mask);
|
|
519 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
|
|
520 void bam_plp_reset(bam_plp_t iter);
|
|
521 void bam_plp_destroy(bam_plp_t iter);
|
|
522
|
|
523 struct __bam_mplp_t;
|
|
524 typedef struct __bam_mplp_t *bam_mplp_t;
|
|
525
|
|
526 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
|
|
527 void bam_mplp_destroy(bam_mplp_t iter);
|
|
528 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
|
|
529 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
|
|
530
|
|
531 /*! @typedef
|
|
532 @abstract Type of function to be called by bam_plbuf_push().
|
|
533 @param tid chromosome ID as is defined in the header
|
|
534 @param pos start coordinate of the alignment, 0-based
|
|
535 @param n number of elements in pl array
|
|
536 @param pl array of alignments
|
|
537 @param data user provided data
|
|
538 @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
|
|
539 */
|
|
540 typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
|
|
541
|
|
542 typedef struct {
|
|
543 bam_plp_t iter;
|
|
544 bam_pileup_f func;
|
|
545 void *data;
|
|
546 } bam_plbuf_t;
|
|
547
|
|
548 void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
|
|
549 void bam_plbuf_reset(bam_plbuf_t *buf);
|
|
550 bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
|
|
551 void bam_plbuf_destroy(bam_plbuf_t *buf);
|
|
552 int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
|
|
553
|
|
554 int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
|
|
555
|
|
556 struct __bam_lplbuf_t;
|
|
557 typedef struct __bam_lplbuf_t bam_lplbuf_t;
|
|
558
|
|
559 void bam_lplbuf_reset(bam_lplbuf_t *buf);
|
|
560
|
|
561 /*! @abstract bam_plbuf_init() equivalent with level calculated. */
|
|
562 bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
|
|
563
|
|
564 /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */
|
|
565 void bam_lplbuf_destroy(bam_lplbuf_t *tv);
|
|
566
|
|
567 /*! @abstract bam_plbuf_push() equivalent with level calculated. */
|
|
568 int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
|
|
569
|
|
570
|
|
571 /*********************
|
|
572 * BAM indexing APIs *
|
|
573 *********************/
|
|
574
|
|
575 struct __bam_index_t;
|
|
576 typedef struct __bam_index_t bam_index_t;
|
|
577
|
|
578 /*!
|
|
579 @abstract Build index for a BAM file.
|
|
580 @discussion Index file "fn.bai" will be created.
|
|
581 @param fn name of the BAM file
|
|
582 @return always 0 currently
|
|
583 */
|
|
584 int bam_index_build(const char *fn);
|
|
585
|
|
586 /*!
|
|
587 @abstract Load index from file "fn.bai".
|
|
588 @param fn name of the BAM file (NOT the index file)
|
|
589 @return pointer to the index structure
|
|
590 */
|
|
591 bam_index_t *bam_index_load(const char *fn);
|
|
592
|
|
593 /*!
|
|
594 @abstract Destroy an index structure.
|
|
595 @param idx pointer to the index structure
|
|
596 */
|
|
597 void bam_index_destroy(bam_index_t *idx);
|
|
598
|
|
599 /*! @typedef
|
|
600 @abstract Type of function to be called by bam_fetch().
|
|
601 @param b the alignment
|
|
602 @param data user provided data
|
|
603 */
|
|
604 typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
|
|
605
|
|
606 /*!
|
|
607 @abstract Retrieve the alignments that are overlapped with the
|
|
608 specified region.
|
|
609
|
|
610 @discussion A user defined function will be called for each
|
|
611 retrieved alignment ordered by its start position.
|
|
612
|
|
613 @param fp BAM file handler
|
|
614 @param idx pointer to the alignment index
|
|
615 @param tid chromosome ID as is defined in the header
|
|
616 @param beg start coordinate, 0-based
|
|
617 @param end end coordinate, 0-based
|
|
618 @param data user provided data (will be transferred to func)
|
|
619 @param func user defined function
|
|
620 */
|
|
621 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
|
|
622
|
|
623 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end);
|
|
624 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);
|
|
625 void bam_iter_destroy(bam_iter_t iter);
|
|
626
|
|
627 /*!
|
|
628 @abstract Parse a region in the format: "chr2:100,000-200,000".
|
|
629 @discussion bam_header_t::hash will be initialized if empty.
|
|
630 @param header pointer to the header structure
|
|
631 @param str string to be parsed
|
|
632 @param ref_id the returned chromosome ID
|
|
633 @param begin the returned start coordinate
|
|
634 @param end the returned end coordinate
|
|
635 @return 0 on success; -1 on failure
|
|
636 */
|
|
637 int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
|
|
638
|
|
639
|
|
640 /**************************
|
|
641 * APIs for optional tags *
|
|
642 **************************/
|
|
643
|
|
644 /*!
|
|
645 @abstract Retrieve data of a tag
|
|
646 @param b pointer to an alignment struct
|
|
647 @param tag two-character tag to be retrieved
|
|
648
|
|
649 @return pointer to the type and data. The first character is the
|
|
650 type that can be 'iIsScCdfAZH'.
|
|
651
|
|
652 @discussion Use bam_aux2?() series to convert the returned data to
|
|
653 the corresponding type.
|
|
654 */
|
|
655 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
|
|
656
|
|
657 int32_t bam_aux2i(const uint8_t *s);
|
|
658 float bam_aux2f(const uint8_t *s);
|
|
659 double bam_aux2d(const uint8_t *s);
|
|
660 char bam_aux2A(const uint8_t *s);
|
|
661 char *bam_aux2Z(const uint8_t *s);
|
|
662
|
|
663 int bam_aux_del(bam1_t *b, uint8_t *s);
|
|
664 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
|
|
665 uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
|
|
666
|
|
667
|
|
668 /*****************
|
|
669 * Miscellaneous *
|
|
670 *****************/
|
|
671
|
|
672 /*!
|
|
673 @abstract Calculate the rightmost coordinate of an alignment on the
|
|
674 reference genome.
|
|
675
|
|
676 @param c pointer to the bam1_core_t structure
|
|
677 @param cigar the corresponding CIGAR array (from bam1_t::cigar)
|
|
678 @return the rightmost coordinate, 0-based
|
|
679 */
|
|
680 uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
|
|
681
|
|
682 /*!
|
|
683 @abstract Calculate the length of the query sequence from CIGAR.
|
|
684 @param c pointer to the bam1_core_t structure
|
|
685 @param cigar the corresponding CIGAR array (from bam1_t::cigar)
|
|
686 @return length of the query sequence
|
|
687 */
|
|
688 int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
|
|
689
|
|
690 #ifdef __cplusplus
|
|
691 }
|
|
692 #endif
|
|
693
|
|
694 /*!
|
|
695 @abstract Calculate the minimum bin that contains a region [beg,end).
|
|
696 @param beg start of the region, 0-based
|
|
697 @param end end of the region, 0-based
|
|
698 @return bin
|
|
699 */
|
|
700 static inline int bam_reg2bin(uint32_t beg, uint32_t end)
|
|
701 {
|
|
702 --end;
|
|
703 if (beg>>14 == end>>14) return 4681 + (beg>>14);
|
|
704 if (beg>>17 == end>>17) return 585 + (beg>>17);
|
|
705 if (beg>>20 == end>>20) return 73 + (beg>>20);
|
|
706 if (beg>>23 == end>>23) return 9 + (beg>>23);
|
|
707 if (beg>>26 == end>>26) return 1 + (beg>>26);
|
|
708 return 0;
|
|
709 }
|
|
710
|
|
711 /*!
|
|
712 @abstract Copy an alignment
|
|
713 @param bdst destination alignment struct
|
|
714 @param bsrc source alignment struct
|
|
715 @return pointer to the destination alignment struct
|
|
716 */
|
|
717 static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
|
|
718 {
|
|
719 uint8_t *data = bdst->data;
|
|
720 int m_data = bdst->m_data; // backup data and m_data
|
|
721 if (m_data < bsrc->data_len) { // double the capacity
|
|
722 m_data = bsrc->data_len; kroundup32(m_data);
|
|
723 data = (uint8_t*)realloc(data, m_data);
|
|
724 }
|
|
725 memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
|
|
726 *bdst = *bsrc; // copy the rest
|
|
727 // restore the backup
|
|
728 bdst->m_data = m_data;
|
|
729 bdst->data = data;
|
|
730 return bdst;
|
|
731 }
|
|
732
|
|
733 /*!
|
|
734 @abstract Duplicate an alignment
|
|
735 @param src source alignment struct
|
|
736 @return pointer to the destination alignment struct
|
|
737 */
|
|
738 static inline bam1_t *bam_dup1(const bam1_t *src)
|
|
739 {
|
|
740 bam1_t *b;
|
|
741 b = bam_init1();
|
|
742 *b = *src;
|
|
743 b->m_data = b->data_len;
|
|
744 b->data = (uint8_t*)calloc(b->data_len, 1);
|
|
745 memcpy(b->data, src->data, b->data_len);
|
|
746 return b;
|
|
747 }
|
|
748
|
|
749 static inline int bam_aux_type2size(int x)
|
|
750 {
|
|
751 if (x == 'C' || x == 'c' || x == 'A') return 1;
|
|
752 else if (x == 'S' || x == 's') return 2;
|
|
753 else if (x == 'I' || x == 'i' || x == 'f') return 4;
|
|
754 else return 0;
|
|
755 }
|
|
756
|
|
757
|
|
758 #endif
|