0
|
1 /* The MIT License
|
|
2
|
|
3 Copyright (c) 2008 Genome Research Ltd (GRL).
|
|
4
|
|
5 Permission is hereby granted, free of charge, to any person obtaining
|
|
6 a copy of this software and associated documentation files (the
|
|
7 "Software"), to deal in the Software without restriction, including
|
|
8 without limitation the rights to use, copy, modify, merge, publish,
|
|
9 distribute, sublicense, and/or sell copies of the Software, and to
|
|
10 permit persons to whom the Software is furnished to do so, subject to
|
|
11 the following conditions:
|
|
12
|
|
13 The above copyright notice and this permission notice shall be
|
|
14 included in all copies or substantial portions of the Software.
|
|
15
|
|
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23 SOFTWARE.
|
|
24 */
|
|
25
|
|
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
|
|
27
|
|
28 #ifndef BAM_BAM_H
|
|
29 #define BAM_BAM_H
|
|
30
|
|
31 /*!
|
|
32 @header
|
|
33
|
|
34 BAM library provides I/O and various operations on manipulating files
|
|
35 in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
|
|
36 format. It now supports importing from or exporting to TAM, sorting,
|
|
37 merging, generating pileup, and quickly retrieval of reads overlapped
|
|
38 with a specified region.
|
|
39
|
|
40 @copyright Genome Research Ltd.
|
|
41 */
|
|
42
|
|
43 #include <stdint.h>
|
|
44 #include <stdlib.h>
|
|
45 #include <string.h>
|
|
46 #include <stdio.h>
|
|
47
|
|
48 #ifndef BAM_LITE
|
|
49 #define BAM_VIRTUAL_OFFSET16
|
|
50 #include "bgzf.h"
|
|
51 /*! @abstract BAM file handler */
|
|
52 typedef BGZF *bamFile;
|
|
53 #define bam_open(fn, mode) bgzf_open(fn, mode)
|
|
54 #define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
|
|
55 #define bam_close(fp) bgzf_close(fp)
|
|
56 #define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
|
|
57 #define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
|
|
58 #define bam_tell(fp) bgzf_tell(fp)
|
|
59 #define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
|
|
60 #else
|
|
61 #define BAM_TRUE_OFFSET
|
|
62 #include <zlib.h>
|
|
63 typedef gzFile bamFile;
|
|
64 #define bam_open(fn, mode) gzopen(fn, mode)
|
|
65 #define bam_dopen(fd, mode) gzdopen(fd, mode)
|
|
66 #define bam_close(fp) gzclose(fp)
|
|
67 #define bam_read(fp, buf, size) gzread(fp, buf, size)
|
|
68 /* no bam_write/bam_tell/bam_seek() here */
|
|
69 #endif
|
|
70
|
|
71 /*! @typedef
|
|
72 @abstract Structure for the alignment header.
|
|
73 @field n_targets number of reference sequences
|
|
74 @field target_name names of the reference sequences
|
|
75 @field target_len lengths of the referene sequences
|
|
76 @field hash hash table for fast name lookup
|
|
77 @field rg2lib hash table for @RG-ID -> LB lookup
|
|
78 @field l_text length of the plain text in the header
|
|
79 @field text plain text
|
|
80
|
|
81 @discussion Field hash points to null by default. It is a private
|
|
82 member.
|
|
83 */
|
|
84 typedef struct {
|
|
85 int32_t n_targets;
|
|
86 char **target_name;
|
|
87 uint32_t *target_len;
|
|
88 void *hash, *rg2lib;
|
|
89 int l_text;
|
|
90 char *text;
|
|
91 } bam_header_t;
|
|
92
|
|
93 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
|
|
94 #define BAM_FPAIRED 1
|
|
95 /*! @abstract the read is mapped in a proper pair */
|
|
96 #define BAM_FPROPER_PAIR 2
|
|
97 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
|
|
98 #define BAM_FUNMAP 4
|
|
99 /*! @abstract the mate is unmapped */
|
|
100 #define BAM_FMUNMAP 8
|
|
101 /*! @abstract the read is mapped to the reverse strand */
|
|
102 #define BAM_FREVERSE 16
|
|
103 /*! @abstract the mate is mapped to the reverse strand */
|
|
104 #define BAM_FMREVERSE 32
|
|
105 /*! @abstract this is read1 */
|
|
106 #define BAM_FREAD1 64
|
|
107 /*! @abstract this is read2 */
|
|
108 #define BAM_FREAD2 128
|
|
109 /*! @abstract not primary alignment */
|
|
110 #define BAM_FSECONDARY 256
|
|
111 /*! @abstract QC failure */
|
|
112 #define BAM_FQCFAIL 512
|
|
113 /*! @abstract optical or PCR duplicate */
|
|
114 #define BAM_FDUP 1024
|
|
115
|
|
116 #define BAM_OFDEC 0
|
|
117 #define BAM_OFHEX 1
|
|
118 #define BAM_OFSTR 2
|
|
119
|
|
120 /*! @abstract defautl mask for pileup */
|
|
121 #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
|
|
122
|
|
123 #define BAM_CORE_SIZE sizeof(bam1_core_t)
|
|
124
|
|
125 /**
|
|
126 * Describing how CIGAR operation/length is packed in a 32-bit integer.
|
|
127 */
|
|
128 #define BAM_CIGAR_SHIFT 4
|
|
129 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
|
|
130
|
|
131 /*
|
|
132 CIGAR operations.
|
|
133 */
|
|
134 /*! @abstract CIGAR: match */
|
|
135 #define BAM_CMATCH 0
|
|
136 /*! @abstract CIGAR: insertion to the reference */
|
|
137 #define BAM_CINS 1
|
|
138 /*! @abstract CIGAR: deletion from the reference */
|
|
139 #define BAM_CDEL 2
|
|
140 /*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
|
|
141 #define BAM_CREF_SKIP 3
|
|
142 /*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
|
|
143 #define BAM_CSOFT_CLIP 4
|
|
144 /*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
|
|
145 #define BAM_CHARD_CLIP 5
|
|
146 /*! @abstract CIGAR: padding */
|
|
147 #define BAM_CPAD 6
|
|
148
|
|
149 /*! @typedef
|
|
150 @abstract Structure for core alignment information.
|
|
151 @field tid chromosome ID, defined by bam_header_t
|
|
152 @field pos 0-based leftmost coordinate
|
|
153 @field strand strand; 0 for forward and 1 otherwise
|
|
154 @field bin bin calculated by bam_reg2bin()
|
|
155 @field qual mapping quality
|
|
156 @field l_qname length of the query name
|
|
157 @field flag bitwise flag
|
|
158 @field n_cigar number of CIGAR operations
|
|
159 @field l_qseq length of the query sequence (read)
|
|
160 */
|
|
161 typedef struct {
|
|
162 int32_t tid;
|
|
163 int32_t pos;
|
|
164 uint32_t bin:16, qual:8, l_qname:8;
|
|
165 uint32_t flag:16, n_cigar:16;
|
|
166 int32_t l_qseq;
|
|
167 int32_t mtid;
|
|
168 int32_t mpos;
|
|
169 int32_t isize;
|
|
170 } bam1_core_t;
|
|
171
|
|
172 /*! @typedef
|
|
173 @abstract Structure for one alignment.
|
|
174 @field core core information about the alignment
|
|
175 @field l_aux length of auxiliary data
|
|
176 @field data_len current length of bam1_t::data
|
|
177 @field m_data maximum length of bam1_t::data
|
|
178 @field data all variable-length data, concatenated; structure: cigar-qname-seq-qual-aux
|
|
179
|
|
180 @discussion Notes:
|
|
181
|
|
182 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
|
|
183 2. l_qseq is calculated from the total length of an alignment block
|
|
184 on reading or from CIGAR.
|
|
185 */
|
|
186 typedef struct {
|
|
187 bam1_core_t core;
|
|
188 int l_aux, data_len, m_data;
|
|
189 uint8_t *data;
|
|
190 } bam1_t;
|
|
191
|
|
192 #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
|
|
193 #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
|
|
194
|
|
195 /*! @function
|
|
196 @abstract Get the CIGAR array
|
|
197 @param b pointer to an alignment
|
|
198 @return pointer to the CIGAR array
|
|
199
|
|
200 @discussion In the CIGAR array, each element is a 32-bit integer. The
|
|
201 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
|
|
202 length of a CIGAR.
|
|
203 */
|
|
204 #define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
|
|
205
|
|
206 /*! @function
|
|
207 @abstract Get the name of the query
|
|
208 @param b pointer to an alignment
|
|
209 @return pointer to the name string, null terminated
|
|
210 */
|
|
211 #define bam1_qname(b) ((char*)((b)->data))
|
|
212
|
|
213 /*! @function
|
|
214 @abstract Get query sequence
|
|
215 @param b pointer to an alignment
|
|
216 @return pointer to sequence
|
|
217
|
|
218 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
|
|
219 8 for T and 15 for N. Two bases are packed in one byte with the base
|
|
220 at the higher 4 bits having smaller coordinate on the read. It is
|
|
221 recommended to use bam1_seqi() macro to get the base.
|
|
222 */
|
|
223 #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
|
|
224
|
|
225 /*! @function
|
|
226 @abstract Get query quality
|
|
227 @param b pointer to an alignment
|
|
228 @return pointer to quality string
|
|
229 */
|
|
230 #define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2)
|
|
231
|
|
232 /*! @function
|
|
233 @abstract Get a base on read
|
|
234 @param s Query sequence returned by bam1_seq()
|
|
235 @param i The i-th position, 0-based
|
|
236 @return 4-bit integer representing the base.
|
|
237 */
|
|
238 #define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
|
|
239
|
|
240 /*! @function
|
|
241 @abstract Get query sequence and quality
|
|
242 @param b pointer to an alignment
|
|
243 @return pointer to the concatenated auxiliary data
|
|
244 */
|
|
245 #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
|
|
246
|
|
247 #ifndef kroundup32
|
|
248 /*! @function
|
|
249 @abstract Round an integer to the next closest power-2 integer.
|
|
250 @param x integer to be rounded (in place)
|
|
251 @discussion x will be modified.
|
|
252 */
|
|
253 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
|
254 #endif
|
|
255
|
|
256 /*!
|
|
257 @abstract Whether the machine is big-endian; modified only in
|
|
258 bam_header_init().
|
|
259 */
|
|
260 extern int bam_is_be;
|
|
261
|
|
262 /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
|
|
263 extern unsigned char bam_nt16_table[256];
|
|
264
|
|
265 /*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
|
|
266 extern char *bam_nt16_rev_table;
|
|
267
|
|
268 extern char bam_nt16_nt4_table[];
|
|
269
|
|
270 #ifdef __cplusplus
|
|
271 extern "C" {
|
|
272 #endif
|
|
273
|
|
274 /*! @abstract TAM file handler */
|
|
275 typedef struct __tamFile_t *tamFile;
|
|
276
|
|
277 /*!
|
|
278 @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
|
|
279 @param fn SAM file name
|
|
280 @return SAM file handler
|
|
281 */
|
|
282 tamFile sam_open(const char *fn);
|
|
283
|
|
284 /*!
|
|
285 @abstract Close a SAM file handler
|
|
286 @param fp SAM file handler
|
|
287 */
|
|
288 void sam_close(tamFile fp);
|
|
289
|
|
290 /*!
|
|
291 @abstract Read one alignment from a SAM file handler
|
|
292 @param fp SAM file handler
|
|
293 @param header header information (ordered names of chromosomes)
|
|
294 @param b read alignment; all members in b will be updated
|
|
295 @return 0 if successful; otherwise negative
|
|
296 */
|
|
297 int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
|
|
298
|
|
299 /*!
|
|
300 @abstract Read header information from a TAB-delimited list file.
|
|
301 @param fn_list file name for the list
|
|
302 @return a pointer to the header structure
|
|
303
|
|
304 @discussion Each line in this file consists of chromosome name and
|
|
305 the length of chromosome.
|
|
306 */
|
|
307 bam_header_t *sam_header_read2(const char *fn_list);
|
|
308
|
|
309 /*!
|
|
310 @abstract Read header from a SAM file (if present)
|
|
311 @param fp SAM file handler
|
|
312 @return pointer to header struct; 0 if no @SQ lines available
|
|
313 */
|
|
314 bam_header_t *sam_header_read(tamFile fp);
|
|
315
|
|
316 /*!
|
|
317 @abstract Parse @SQ lines a update a header struct
|
|
318 @param h pointer to the header struct to be updated
|
|
319 @return number of target sequences
|
|
320
|
|
321 @discussion bam_header_t::{n_targets,target_len,target_name} will
|
|
322 be destroyed in the first place.
|
|
323 */
|
|
324 int sam_header_parse(bam_header_t *h);
|
|
325
|
|
326 /*!
|
|
327 @abstract Parse @RG lines a update a header struct
|
|
328 @param h pointer to the header struct to be updated
|
|
329 @return number of @RG lines
|
|
330
|
|
331 @discussion bam_header_t::rg2lib will be destroyed in the first
|
|
332 place.
|
|
333 */
|
|
334 int sam_header_parse_rg(bam_header_t *h);
|
|
335
|
|
336 #define sam_write1(header, b) bam_view1(header, b)
|
|
337
|
|
338 int bam_strmap_put(void *strmap, const char *rg, const char *lib);
|
|
339 const char *bam_strmap_get(const void *strmap, const char *rg);
|
|
340 void *bam_strmap_dup(const void*);
|
|
341 void *bam_strmap_init();
|
|
342 void bam_strmap_destroy(void *strmap);
|
|
343
|
|
344 /*!
|
|
345 @abstract Initialize a header structure.
|
|
346 @return the pointer to the header structure
|
|
347
|
|
348 @discussion This function also modifies the global variable
|
|
349 bam_is_be.
|
|
350 */
|
|
351 bam_header_t *bam_header_init();
|
|
352
|
|
353 /*!
|
|
354 @abstract Destroy a header structure.
|
|
355 @param header pointer to the header
|
|
356 */
|
|
357 void bam_header_destroy(bam_header_t *header);
|
|
358
|
|
359 /*!
|
|
360 @abstract Read a header structure from BAM.
|
|
361 @param fp BAM file handler, opened by bam_open()
|
|
362 @return pointer to the header structure
|
|
363
|
|
364 @discussion The file position indicator must be placed at the
|
|
365 beginning of the file. Upon success, the position indicator will
|
|
366 be set at the start of the first alignment.
|
|
367 */
|
|
368 bam_header_t *bam_header_read(bamFile fp);
|
|
369
|
|
370 /*!
|
|
371 @abstract Write a header structure to BAM.
|
|
372 @param fp BAM file handler
|
|
373 @param header pointer to the header structure
|
|
374 @return always 0 currently
|
|
375 */
|
|
376 int bam_header_write(bamFile fp, const bam_header_t *header);
|
|
377
|
|
378 /*!
|
|
379 @abstract Read an alignment from BAM.
|
|
380 @param fp BAM file handler
|
|
381 @param b read alignment; all members are updated.
|
|
382 @return number of bytes read from the file
|
|
383
|
|
384 @discussion The file position indicator must be
|
|
385 placed right before an alignment. Upon success, this function
|
|
386 will set the position indicator to the start of the next
|
|
387 alignment. This function is not affected by the machine
|
|
388 endianness.
|
|
389 */
|
|
390 int bam_read1(bamFile fp, bam1_t *b);
|
|
391
|
|
392 /*!
|
|
393 @abstract Write an alignment to BAM.
|
|
394 @param fp BAM file handler
|
|
395 @param c pointer to the bam1_core_t structure
|
|
396 @param data_len total length of variable size data related to
|
|
397 the alignment
|
|
398 @param data pointer to the concatenated data
|
|
399 @return number of bytes written to the file
|
|
400
|
|
401 @discussion This function is not affected by the machine
|
|
402 endianness.
|
|
403 */
|
|
404 int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
|
|
405
|
|
406 /*!
|
|
407 @abstract Write an alignment to BAM.
|
|
408 @param fp BAM file handler
|
|
409 @param b alignment to write
|
|
410 @return number of bytes written to the file
|
|
411
|
|
412 @abstract It is equivalent to:
|
|
413 bam_write1_core(fp, &b->core, b->data_len, b->data)
|
|
414 */
|
|
415 int bam_write1(bamFile fp, const bam1_t *b);
|
|
416
|
|
417 /*! @function
|
|
418 @abstract Initiate a pointer to bam1_t struct
|
|
419 */
|
|
420 #define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
|
|
421
|
|
422 /*! @function
|
|
423 @abstract Free the memory allocated for an alignment.
|
|
424 @param b pointer to an alignment
|
|
425 */
|
|
426 #define bam_destroy1(b) do { \
|
|
427 free((b)->data); free(b); \
|
|
428 } while (0)
|
|
429
|
|
430 /*!
|
|
431 @abstract Format a BAM record in the SAM format
|
|
432 @param header pointer to the header structure
|
|
433 @param b alignment to print
|
|
434 @return a pointer to the SAM string
|
|
435 */
|
|
436 char *bam_format1(const bam_header_t *header, const bam1_t *b);
|
|
437
|
|
438 char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
|
|
439
|
|
440 /*! @typedef
|
|
441 @abstract Structure for one alignment covering the pileup position.
|
|
442 @field b pointer to the alignment
|
|
443 @field qpos position of the read base at the pileup site, 0-based
|
|
444 @field indel indel length; 0 for no indel, positive for ins and negative for del
|
|
445 @field is_del 1 iff the base on the padded read is a deletion
|
|
446 @field level the level of the read in the "viewer" mode
|
|
447
|
|
448 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
|
|
449 difference between the two functions is that the former does not
|
|
450 set bam_pileup1_t::level, while the later does. Level helps the
|
|
451 implementation of alignment viewers, but calculating this has some
|
|
452 overhead.
|
|
453 */
|
|
454 typedef struct {
|
|
455 bam1_t *b;
|
|
456 int32_t qpos;
|
|
457 int indel, level;
|
|
458 uint32_t is_del:1, is_head:1, is_tail:1;
|
|
459 } bam_pileup1_t;
|
|
460
|
|
461 struct __bam_plbuf_t;
|
|
462 /*! @abstract pileup buffer */
|
|
463 typedef struct __bam_plbuf_t bam_plbuf_t;
|
|
464
|
|
465 void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
|
|
466
|
|
467 /*! @typedef
|
|
468 @abstract Type of function to be called by bam_plbuf_push().
|
|
469 @param tid chromosome ID as is defined in the header
|
|
470 @param pos start coordinate of the alignment, 0-based
|
|
471 @param n number of elements in pl array
|
|
472 @param pl array of alignments
|
|
473 @param data user provided data
|
|
474 @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
|
|
475 */
|
|
476 typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
|
|
477
|
|
478 /*!
|
|
479 @abstract Reset a pileup buffer for another pileup process
|
|
480 @param buf the pileup buffer to be reset
|
|
481 */
|
|
482 void bam_plbuf_reset(bam_plbuf_t *buf);
|
|
483
|
|
484 /*!
|
|
485 @abstract Initialize a buffer for pileup.
|
|
486 @param func fucntion to be called by bam_pileup_core()
|
|
487 @param data user provided data
|
|
488 @return pointer to the pileup buffer
|
|
489 */
|
|
490 bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
|
|
491
|
|
492 /*!
|
|
493 @abstract Destroy a pileup buffer.
|
|
494 @param buf pointer to the pileup buffer
|
|
495 */
|
|
496 void bam_plbuf_destroy(bam_plbuf_t *buf);
|
|
497
|
|
498 /*!
|
|
499 @abstract Push an alignment to the pileup buffer.
|
|
500 @param b alignment to be pushed
|
|
501 @param buf pileup buffer
|
|
502 @see bam_plbuf_init()
|
|
503 @return always 0 currently
|
|
504
|
|
505 @discussion If all the alignments covering a particular site have
|
|
506 been collected, this function will call the user defined function
|
|
507 as is provided to bam_plbuf_init(). The coordinate of the site and
|
|
508 all the alignments will be transferred to the user defined
|
|
509 function as function parameters.
|
|
510
|
|
511 When all the alignments are pushed to the buffer, this function
|
|
512 needs to be called with b equal to NULL. This will flush the
|
|
513 buffer. A pileup buffer can only be reused when bam_plbuf_reset()
|
|
514 is called.
|
|
515 */
|
|
516 int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
|
|
517
|
|
518 int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
|
|
519
|
|
520 struct __bam_lplbuf_t;
|
|
521 typedef struct __bam_lplbuf_t bam_lplbuf_t;
|
|
522
|
|
523 void bam_lplbuf_reset(bam_lplbuf_t *buf);
|
|
524
|
|
525 /*! @abstract bam_plbuf_init() equivalent with level calculated. */
|
|
526 bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
|
|
527
|
|
528 /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */
|
|
529 void bam_lplbuf_destroy(bam_lplbuf_t *tv);
|
|
530
|
|
531 /*! @abstract bam_plbuf_push() equivalent with level calculated. */
|
|
532 int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
|
|
533
|
|
534 struct __bam_index_t;
|
|
535 typedef struct __bam_index_t bam_index_t;
|
|
536
|
|
537 /*!
|
|
538 @abstract Build index for a BAM file.
|
|
539 @discussion Index file "fn.bai" will be created.
|
|
540 @param fn name of the BAM file
|
|
541 @return always 0 currently
|
|
542 */
|
|
543 int bam_index_build(const char *fn);
|
|
544
|
|
545 /*!
|
|
546 @abstract Load index from file "fn.bai".
|
|
547 @param fn name of the BAM file (NOT the index file)
|
|
548 @return pointer to the index structure
|
|
549 */
|
|
550 bam_index_t *bam_index_load(const char *fn);
|
|
551
|
|
552 /*!
|
|
553 @abstract Destroy an index structure.
|
|
554 @param idx pointer to the index structure
|
|
555 */
|
|
556 void bam_index_destroy(bam_index_t *idx);
|
|
557
|
|
558 /*! @typedef
|
|
559 @abstract Type of function to be called by bam_fetch().
|
|
560 @param b the alignment
|
|
561 @param data user provided data
|
|
562 */
|
|
563 typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
|
|
564
|
|
565 /*!
|
|
566 @abstract Retrieve the alignments that are overlapped with the
|
|
567 specified region.
|
|
568
|
|
569 @discussion A user defined function will be called for each
|
|
570 retrieved alignment ordered by its start position.
|
|
571
|
|
572 @param fp BAM file handler
|
|
573 @param idx pointer to the alignment index
|
|
574 @param tid chromosome ID as is defined in the header
|
|
575 @param beg start coordinate, 0-based
|
|
576 @param end end coordinate, 0-based
|
|
577 @param data user provided data (will be transferred to func)
|
|
578 @param func user defined function
|
|
579 */
|
|
580 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
|
|
581
|
|
582 /*!
|
|
583 @abstract Parse a region in the format: "chr2:100,000-200,000".
|
|
584 @discussion bam_header_t::hash will be initialized if empty.
|
|
585 @param header pointer to the header structure
|
|
586 @param str string to be parsed
|
|
587 @param ref_id the returned chromosome ID
|
|
588 @param begin the returned start coordinate
|
|
589 @param end the returned end coordinate
|
|
590 @return 0 on success; -1 on failure
|
|
591 */
|
|
592 int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
|
|
593
|
|
594 /*!
|
|
595 @abstract Retrieve data of a tag
|
|
596 @param b pointer to an alignment struct
|
|
597 @param tag two-character tag to be retrieved
|
|
598
|
|
599 @return pointer to the type and data. The first character is the
|
|
600 type that can be 'iIsScCdfAZH'.
|
|
601
|
|
602 @discussion Use bam_aux2?() series to convert the returned data to
|
|
603 the corresponding type.
|
|
604 */
|
|
605 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
|
|
606
|
|
607 int32_t bam_aux2i(const uint8_t *s);
|
|
608 float bam_aux2f(const uint8_t *s);
|
|
609 double bam_aux2d(const uint8_t *s);
|
|
610 char bam_aux2A(const uint8_t *s);
|
|
611 char *bam_aux2Z(const uint8_t *s);
|
|
612
|
|
613 int bam_aux_del(bam1_t *b, uint8_t *s);
|
|
614 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
|
|
615 uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
|
|
616
|
|
617 /*!
|
|
618 @abstract Calculate the rightmost coordinate of an alignment on the
|
|
619 reference genome.
|
|
620
|
|
621 @param c pointer to the bam1_core_t structure
|
|
622 @param cigar the corresponding CIGAR array (from bam1_t::cigar)
|
|
623 @return the rightmost coordinate, 0-based
|
|
624 */
|
|
625 uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
|
|
626
|
|
627 /*!
|
|
628 @abstract Calculate the length of the query sequence from CIGAR.
|
|
629 @param c pointer to the bam1_core_t structure
|
|
630 @param cigar the corresponding CIGAR array (from bam1_t::cigar)
|
|
631 @return length of the query sequence
|
|
632 */
|
|
633 int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
|
|
634
|
|
635 typedef struct {
|
|
636 int32_t qbeg, qend;
|
|
637 int32_t tbeg, tend;
|
|
638 int32_t cbeg, cend;
|
|
639 } bam_segreg_t;
|
|
640
|
|
641 int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg);
|
|
642
|
|
643 #ifdef __cplusplus
|
|
644 }
|
|
645 #endif
|
|
646
|
|
647 /*!
|
|
648 @abstract Calculate the minimum bin that contains a region [beg,end).
|
|
649 @param beg start of the region, 0-based
|
|
650 @param end end of the region, 0-based
|
|
651 @return bin
|
|
652 */
|
|
653 static inline int bam_reg2bin(uint32_t beg, uint32_t end)
|
|
654 {
|
|
655 --end;
|
|
656 if (beg>>14 == end>>14) return 4681 + (beg>>14);
|
|
657 if (beg>>17 == end>>17) return 585 + (beg>>17);
|
|
658 if (beg>>20 == end>>20) return 73 + (beg>>20);
|
|
659 if (beg>>23 == end>>23) return 9 + (beg>>23);
|
|
660 if (beg>>26 == end>>26) return 1 + (beg>>26);
|
|
661 return 0;
|
|
662 }
|
|
663
|
|
664 /*!
|
|
665 @abstract Copy an alignment
|
|
666 @param bdst destination alignment struct
|
|
667 @param bsrc source alignment struct
|
|
668 @return pointer to the destination alignment struct
|
|
669 */
|
|
670 static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
|
|
671 {
|
|
672 uint8_t *data = bdst->data;
|
|
673 int m_data = bdst->m_data; // backup data and m_data
|
|
674 if (m_data < bsrc->m_data) { // double the capacity
|
|
675 m_data = bsrc->m_data; kroundup32(m_data);
|
|
676 data = (uint8_t*)realloc(data, m_data);
|
|
677 }
|
|
678 memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
|
|
679 *bdst = *bsrc; // copy the rest
|
|
680 // restore the backup
|
|
681 bdst->m_data = m_data;
|
|
682 bdst->data = data;
|
|
683 return bdst;
|
|
684 }
|
|
685
|
|
686 /*!
|
|
687 @abstract Duplicate an alignment
|
|
688 @param src source alignment struct
|
|
689 @return pointer to the destination alignment struct
|
|
690 */
|
|
691 static inline bam1_t *bam_dup1(const bam1_t *src)
|
|
692 {
|
|
693 bam1_t *b;
|
|
694 b = bam_init1();
|
|
695 *b = *src;
|
|
696 b->m_data = b->data_len;
|
|
697 b->data = (uint8_t*)calloc(b->data_len, 1);
|
|
698 memcpy(b->data, src->data, b->data_len);
|
|
699 return b;
|
|
700 }
|
|
701
|
|
702 #endif
|