comparison SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/samtools-0.1.6/bam_index.c @ 0:74f5ea818cea

Uploaded
author ryanmorin
date Wed, 12 Oct 2011 19:50:38 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:74f5ea818cea
1 #include <ctype.h>
2 #include <assert.h>
3 #include "bam.h"
4 #include "khash.h"
5 #include "ksort.h"
6 #include "bam_endian.h"
7 #ifdef _USE_KNETFILE
8 #include "knetfile.h"
9 #endif
10
11 /*!
12 @header
13
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distance, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
19 short alignments.
20
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
29
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
38
39 */
40
#define BAM_MIN_CHUNK_GAP 32768
// 1<<14 is the size of minimum bin.
#define BAM_LIDX_SHIFT 14

// One chunk of alignments belonging to a bin: [u,v) are file offsets
// (virtual BGZF offsets or true offsets, depending on the build).
typedef struct {
	uint64_t u, v;
} pair64_t;

#define pair64_lt(a,b) ((a).u < (b).u)
KSORT_INIT(off, pair64_t, pair64_lt)

// Chunk list of a single bin: n chunks used, m allocated.
typedef struct {
	uint32_t m, n;
	pair64_t *list;
} bam_binlist_t;

// Linear index of one reference: offset[i] is the smallest file offset
// recorded for the 16kbp window starting at i<<BAM_LIDX_SHIFT.
typedef struct {
	int32_t n, m;
	uint64_t *offset;
} bam_lidx_t;

// bin number -> chunk list
KHASH_MAP_INIT_INT(i, bam_binlist_t)

// Whole index: one bin hash (binning index) and one linear index per
// reference sequence; n is the number of references.
struct __bam_index_t {
	int32_t n;
	khash_t(i) **index;
	bam_lidx_t *index2;
};
69
70 // requirement: len <= LEN_MASK
71 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
72 {
73 khint_t k;
74 bam_binlist_t *l;
75 int ret;
76 k = kh_put(i, h, bin, &ret);
77 l = &kh_value(h, k);
78 if (ret) { // not present
79 l->m = 1; l->n = 0;
80 l->list = (pair64_t*)calloc(l->m, 16);
81 }
82 if (l->n == l->m) {
83 l->m <<= 1;
84 l->list = (pair64_t*)realloc(l->list, l->m * 16);
85 }
86 l->list[l->n].u = beg; l->list[l->n++].v = end;
87 }
88
89 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
90 {
91 int i, beg, end;
92 beg = b->core.pos >> BAM_LIDX_SHIFT;
93 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
94 if (index2->m < end + 1) {
95 int old_m = index2->m;
96 index2->m = end + 1;
97 kroundup32(index2->m);
98 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
99 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
100 }
101 for (i = beg + 1; i <= end; ++i)
102 if (index2->offset[i] == 0) index2->offset[i] = offset;
103 index2->n = end + 1;
104 }
105
/* Merge nearby chunks within every bin of every reference so fewer seeks
 * are needed at fetch time.  Two consecutive chunks are merged when they
 * are within BAM_MIN_CHUNK_GAP bytes (true-offset builds) or start in the
 * same BGZF block (virtual-offset builds: identical upper 48 bits). */
static void merge_chunks(bam_index_t *idx)
{
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
	khash_t(i) *index;
	int i, l, m;
	khint_t k;
	for (i = 0; i < idx->n; ++i) {
		index = idx->index[i];
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			bam_binlist_t *p;
			if (!kh_exist(index, k)) continue;
			p = &kh_value(index, k);
			m = 0; // index of the last chunk kept after merging
			for (l = 1; l < p->n; ++l) {
#ifdef BAM_TRUE_OFFSET
				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
#else
				// virtual offsets: same BGZF block iff upper 48 bits match
				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
#endif
				else p->list[++m] = p->list[l];
			} // ~for(l)
			p->n = m + 1; // chunks [0..m] survive
		} // ~for(k)
	} // ~for(i)
#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
}
132
/* Core indexing routine: stream every alignment of a coordinate-sorted BAM
 * from `fp` and build both the binning index and the linear index.
 * Exits the process if the input is not sorted.  Caller owns the returned
 * index and frees it with bam_index_destroy(). */
bam_index_t *bam_index_core(bamFile fp)
{
	bam1_t *b;
	bam_header_t *h;
	int i, ret;
	bam_index_t *idx;
	uint32_t last_bin, save_bin;
	int32_t last_coor, last_tid, save_tid;
	bam1_core_t *c;
	uint64_t save_off, last_off;

	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
	b = (bam1_t*)calloc(1, sizeof(bam1_t));
	h = bam_header_read(fp);
	c = &b->core;

	idx->n = h->n_targets;
	bam_header_destroy(h);
	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));

	// 0xffffffffu is the "no previous record yet" sentinel for bins/tids
	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
	while ((ret = bam_read1(fp, b)) >= 0) {
		if (last_tid != c->tid) { // change of chromosomes
			last_tid = c->tid;
			last_bin = 0xffffffffu;
		} else if (last_coor > c->pos) { // coordinate order violated: bail out
			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
					bam1_qname(b), last_coor, c->pos, c->tid+1);
			exit(1);
		}
		// Linear index is fed only for records whose bin is below the 16kbp
		// leaf level (bin < 4681); presumably leaf-bin records are covered
		// via their bin alone -- NOTE(review): later samtools versions feed
		// all records here.
		if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
		if (c->bin != last_bin) { // then possibly write the binning index
			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
			save_off = last_off; // new chunk starts where the previous record ended
			save_bin = last_bin = c->bin;
			save_tid = c->tid;
			if (save_tid < 0) break; // record with no target: stop indexing
		}
		if (bam_tell(fp) <= last_off) { // file offset must strictly advance
			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
					(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
			exit(1);
		}
		last_off = bam_tell(fp);
		last_coor = b->core.pos;
	}
	// flush the pending chunk of the final bin
	if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
	merge_chunks(idx);
	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
	free(b->data); free(b);
	return idx;
}
189
190 void bam_index_destroy(bam_index_t *idx)
191 {
192 khint_t k;
193 int i;
194 if (idx == 0) return;
195 for (i = 0; i < idx->n; ++i) {
196 khash_t(i) *index = idx->index[i];
197 bam_lidx_t *index2 = idx->index2 + i;
198 for (k = kh_begin(index); k != kh_end(index); ++k) {
199 if (kh_exist(index, k))
200 free(kh_value(index, k).list);
201 }
202 kh_destroy(i, index);
203 free(index2->offset);
204 }
205 free(idx->index); free(idx->index2);
206 free(idx);
207 }
208
/* Serialize `idx` to `fp` in BAI format: magic, n_ref, then per reference
 * the binning index (n_bins, and per bin: bin number, n_chunks, chunk
 * pairs) followed by the linear index (n_offsets, offsets).  On big-endian
 * hosts every integer is byte-swapped on the way out; arrays are swapped
 * in place and swapped back so the in-memory index stays usable. */
void bam_index_save(const bam_index_t *idx, FILE *fp)
{
	int32_t i, size;
	khint_t k;
	fwrite("BAI\1", 1, 4, fp); // magic
	if (bam_is_be) {
		uint32_t x = idx->n;
		fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
	} else fwrite(&idx->n, 4, 1, fp);
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index = idx->index[i];
		bam_lidx_t *index2 = idx->index2 + i;
		// write binning index
		size = kh_size(index); // number of distinct bins for this reference
		if (bam_is_be) { // big endian
			uint32_t x = size;
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&size, 4, 1, fp);
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (kh_exist(index, k)) {
				bam_binlist_t *p = &kh_value(index, k);
				if (bam_is_be) { // big endian
					uint32_t x;
					x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
					x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
					// swap chunk offsets in place, write, then restore
					for (x = 0; (int)x < p->n; ++x) {
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					}
					fwrite(p->list, 16, p->n, fp);
					for (x = 0; (int)x < p->n; ++x) {
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					}
				} else {
					fwrite(&kh_key(index, k), 4, 1, fp);
					fwrite(&p->n, 4, 1, fp);
					fwrite(p->list, 16, p->n, fp);
				}
			}
		}
		// write linear index (index2)
		if (bam_is_be) {
			int x = index2->n;
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&index2->n, 4, 1, fp);
		if (bam_is_be) { // big endian: swap in place, write, restore
			int x;
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
			fwrite(index2->offset, 8, index2->n, fp);
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
		} else fwrite(index2->offset, 8, index2->n, fp);
	}
	fflush(fp);
}
266
267 static bam_index_t *bam_index_load_core(FILE *fp)
268 {
269 int i;
270 char magic[4];
271 bam_index_t *idx;
272 if (fp == 0) {
273 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
274 return 0;
275 }
276 fread(magic, 1, 4, fp);
277 if (strncmp(magic, "BAI\1", 4)) {
278 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
279 fclose(fp);
280 return 0;
281 }
282 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
283 fread(&idx->n, 4, 1, fp);
284 if (bam_is_be) bam_swap_endian_4p(&idx->n);
285 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
286 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
287 for (i = 0; i < idx->n; ++i) {
288 khash_t(i) *index;
289 bam_lidx_t *index2 = idx->index2 + i;
290 uint32_t key, size;
291 khint_t k;
292 int j, ret;
293 bam_binlist_t *p;
294 index = idx->index[i] = kh_init(i);
295 // load binning index
296 fread(&size, 4, 1, fp);
297 if (bam_is_be) bam_swap_endian_4p(&size);
298 for (j = 0; j < (int)size; ++j) {
299 fread(&key, 4, 1, fp);
300 if (bam_is_be) bam_swap_endian_4p(&key);
301 k = kh_put(i, index, key, &ret);
302 p = &kh_value(index, k);
303 fread(&p->n, 4, 1, fp);
304 if (bam_is_be) bam_swap_endian_4p(&p->n);
305 p->m = p->n;
306 p->list = (pair64_t*)malloc(p->m * 16);
307 fread(p->list, 16, p->n, fp);
308 if (bam_is_be) {
309 int x;
310 for (x = 0; x < p->n; ++x) {
311 bam_swap_endian_8p(&p->list[x].u);
312 bam_swap_endian_8p(&p->list[x].v);
313 }
314 }
315 }
316 // load linear index
317 fread(&index2->n, 4, 1, fp);
318 if (bam_is_be) bam_swap_endian_4p(&index2->n);
319 index2->m = index2->n;
320 index2->offset = (uint64_t*)calloc(index2->m, 8);
321 fread(index2->offset, index2->n, 8, fp);
322 if (bam_is_be)
323 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
324 }
325 return idx;
326 }
327
328 bam_index_t *bam_index_load_local(const char *_fn)
329 {
330 FILE *fp;
331 char *fnidx, *fn;
332
333 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
334 const char *p;
335 int l = strlen(_fn);
336 for (p = _fn + l - 1; p >= _fn; --p)
337 if (*p == '/') break;
338 fn = strdup(p + 1);
339 } else fn = strdup(_fn);
340 fnidx = (char*)calloc(strlen(fn) + 5, 1);
341 strcpy(fnidx, fn); strcat(fnidx, ".bai");
342 fp = fopen(fnidx, "r");
343 if (fp == 0) { // try "{base}.bai"
344 char *s = strstr(fn, "bam");
345 if (s == fn + strlen(fn) - 3) {
346 strcpy(fnidx, fn);
347 fnidx[strlen(fn)-1] = 'i';
348 fp = fopen(fnidx, "r");
349 }
350 }
351 free(fnidx); free(fn);
352 if (fp) {
353 bam_index_t *idx = bam_index_load_core(fp);
354 fclose(fp);
355 return idx;
356 } else return 0;
357 }
358
#ifdef _USE_KNETFILE
/* Download `url` (ftp:// or http:// only) into a file of the same basename
 * in the current working directory.  Failures are reported to stderr and
 * otherwise ignored. */
static void download_from_remote(const char *url)
{
	const int buf_size = 1 * 1024 * 1024;
	char *fn;
	FILE *fp;
	uint8_t *buf;
	knetFile *fp_remote;
	int l;
	if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
	l = strlen(url);
	for (fn = (char*)url + l - 1; fn >= url; --fn)
		if (*fn == '/') break;
	++fn; // fn now points to the file name
	fp_remote = knet_open(url, "r");
	if (fp_remote == 0) {
		fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
		return;
	}
	if ((fp = fopen(fn, "w")) == 0) {
		fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
		knet_close(fp_remote);
		return;
	}
	buf = (uint8_t*)calloc(buf_size, 1);
	// Fix: stop on errors (negative return) as well as EOF; the original
	// `!= 0` condition passed a negative length on to fwrite.
	while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
		fwrite(buf, 1, l, fp);
	free(buf);
	fclose(fp);
	knet_close(fp_remote);
}
#else
static void download_from_remote(const char *url)
{
	// knetfile support compiled out: remote download is a no-op.
	return;
}
#endif
396
397 bam_index_t *bam_index_load(const char *fn)
398 {
399 bam_index_t *idx;
400 idx = bam_index_load_local(fn);
401 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
402 char *fnidx = calloc(strlen(fn) + 5, 1);
403 strcat(strcpy(fnidx, fn), ".bai");
404 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
405 download_from_remote(fnidx);
406 idx = bam_index_load_local(fn);
407 }
408 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
409 return idx;
410 }
411
412 int bam_index_build2(const char *fn, const char *_fnidx)
413 {
414 char *fnidx;
415 FILE *fpidx;
416 bamFile fp;
417 bam_index_t *idx;
418 if ((fp = bam_open(fn, "r")) == 0) {
419 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
420 return -1;
421 }
422 idx = bam_index_core(fp);
423 bam_close(fp);
424 if (_fnidx == 0) {
425 fnidx = (char*)calloc(strlen(fn) + 5, 1);
426 strcpy(fnidx, fn); strcat(fnidx, ".bai");
427 } else fnidx = strdup(_fnidx);
428 fpidx = fopen(fnidx, "w");
429 if (fpidx == 0) {
430 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
431 free(fnidx);
432 return -1;
433 }
434 bam_index_save(idx, fpidx);
435 bam_index_destroy(idx);
436 fclose(fpidx);
437 free(fnidx);
438 return 0;
439 }
440
// Convenience wrapper: build the index for `fn` and write the default
// "{fn}.bai" output name.
int bam_index_build(const char *fn)
{
	return bam_index_build2(fn, 0);
}
445
/* Command-line entry for `samtools index`: argv[1] is the input BAM and
 * the optional argv[2] overrides the output index file name.  Returns 1
 * on usage error, 0 otherwise. */
int bam_index(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
		return 1;
	}
	if (argc < 3) bam_index_build(argv[1]);
	else bam_index_build2(argv[1], argv[2]);
	return 0;
}
456
#define MAX_BIN 37450 // =(8^6-1)/7+1

/* Fill `list` with every bin number that could contain an alignment
 * overlapping [beg,end) and return how many bins were written: bin 0 plus
 * one run of bins per binning level (64Mbp down to 16kbp). */
static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
{
	// first bin number and position shift for each level below the root
	static const struct { int first, shift; } levels[] = {
		{ 1, 26 }, { 9, 23 }, { 73, 20 }, { 585, 17 }, { 4681, 14 }
	};
	int n = 0, lvl;
	--end; // [beg,end) -> inclusive end coordinate
	list[n++] = 0; // the root bin always qualifies
	for (lvl = 0; lvl < 5; ++lvl) {
		int b = levels[lvl].first + (beg >> levels[lvl].shift);
		int last = levels[lvl].first + (end >> levels[lvl].shift);
		for (; b <= last; ++b) list[n++] = b;
	}
	return n;
}
471
472 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
473 {
474 uint32_t rbeg = b->core.pos;
475 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
476 return (rend > beg && rbeg < end);
477 }
478
/* Fetch all alignments on reference `tid` overlapping [beg,end) and call
 * `func(b, data)` on each.  Collects candidate chunks from the binning
 * index, prunes them with the linear index, sorts/merges them, then
 * streams records chunk by chunk with bam_read1().  Returns 0. */
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
{
	uint16_t *bins;
	int i, n_bins, n_off;
	pair64_t *off;
	khint_t k;
	khash_t(i) *index;
	uint64_t min_off;

	bins = (uint16_t*)calloc(MAX_BIN, 2);
	n_bins = reg2bins(beg, end, bins); // every bin that may hold overlaps
	// linear-index lower bound: chunks that end at or before min_off
	// cannot contain an alignment reaching into [beg,end)
	min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
	index = idx->index[tid];
	for (i = n_off = 0; i < n_bins; ++i) { // first pass: count candidate chunks
		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
			n_off += kh_value(index, k).n;
	}
	if (n_off == 0) {
		free(bins); return 0;
	}
	off = (pair64_t*)calloc(n_off, 16);
	for (i = n_off = 0; i < n_bins; ++i) { // second pass: keep chunks past min_off
		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
			int j;
			bam_binlist_t *p = &kh_value(index, k);
			for (j = 0; j < p->n; ++j)
				if (p->list[j].v > min_off) off[n_off++] = p->list[j];
		}
	}
	free(bins);
	{
		bam1_t *b;
		int l, ret, n_seeks;
		uint64_t curr_off;
		b = (bam1_t*)calloc(1, sizeof(bam1_t));
		ks_introsort(off, n_off, off); // sort chunks by start offset
		// resolve completely contained adjacent blocks
		for (i = 1, l = 0; i < n_off; ++i)
			if (off[l].v < off[i].v)
				off[++l] = off[i];
		n_off = l + 1;
		// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
		for (i = 1; i < n_off; ++i)
			if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
		{ // merge adjacent blocks
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
		for (i = 1, l = 0; i < n_off; ++i) {
#ifdef BAM_TRUE_OFFSET
			if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
#else
			if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
#endif
			else off[++l] = off[i];
		}
		n_off = l + 1;
#endif
		}
		// retrive alignments, seeking only between non-adjacent chunks
		n_seeks = 0; i = -1; curr_off = 0;
		for (;;) {
			if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
				if (i == n_off - 1) break; // no more chunks
				if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
				if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
					bam_seek(fp, off[i+1].u, SEEK_SET);
					curr_off = bam_tell(fp);
					++n_seeks;
				}
				++i;
			}
			if ((ret = bam_read1(fp, b)) > 0) {
				curr_off = bam_tell(fp);
				if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
				else if (is_overlap(beg, end, b)) func(b, data);
			} else break; // end of file
		}
		// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
		bam_destroy1(b);
	}
	free(off);
	return 0;
}