Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_index.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (c) 2013-2014 Genome Research Ltd. | |
3 Author: James Bonfield <jkb@sanger.ac.uk> | |
4 | |
5 Redistribution and use in source and binary forms, with or without | |
6 modification, are permitted provided that the following conditions are met: | |
7 | |
8 1. Redistributions of source code must retain the above copyright notice, | |
9 this list of conditions and the following disclaimer. | |
10 | |
11 2. Redistributions in binary form must reproduce the above copyright notice, | |
12 this list of conditions and the following disclaimer in the documentation | |
13 and/or other materials provided with the distribution. | |
14 | |
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
16 Institute nor the names of its contributors may be used to endorse or promote | |
17 products derived from this software without specific prior written permission. | |
18 | |
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 /* | |
32 * The index is a gzipped tab-delimited text file with one line per slice. | |
33 * The columns are: | |
34 * 1: reference number (0 to N-1, as per BAM ref_id) | |
35 * 2: reference position of 1st read in slice (1..?) | |
36 * 3: number of reads in slice | |
37 * 4: offset of container start (relative to end of SAM header, so 1st | |
38 * container is offset 0). | |
39 * 5: slice number within container (ie which landmark). | |
40 * | |
41 * In memory, we hold this in a nested containment list. Each list element is | |
42 * a cram_index struct. Each element in turn can contain its own list of | |
43 * cram_index structs. | |
44 * | |
45 * Any start..end range which is entirely contained within another (and | |
46 * earlier as it is sorted) range will be held within it. This ensures that | |
47 * the outer list will never have containments and we can safely do a | |
48 * binary search to find the first range which overlaps any given coordinate. | |
49 */ | |
50 | |
51 #ifdef HAVE_CONFIG_H | |
52 #include "io_lib_config.h" | |
53 #endif | |
54 | |
55 #include <stdio.h> | |
56 #include <errno.h> | |
57 #include <assert.h> | |
58 #include <stdlib.h> | |
59 #include <string.h> | |
60 #include <zlib.h> | |
61 #include <sys/types.h> | |
62 #include <sys/stat.h> | |
63 #include <math.h> | |
64 #include <ctype.h> | |
65 | |
66 #include "htslib/hfile.h" | |
67 #include "cram/cram.h" | |
68 #include "cram/os.h" | |
69 #include "cram/zfio.h" | |
70 | |
71 #if 0 | |
72 static void dump_index_(cram_index *e, int level) { | |
73 int i, n; | |
74 n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end); | |
75 printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset); | |
76 for (i = 0; i < e->nslice; i++) { | |
77 dump_index_(&e->e[i], level+1); | |
78 } | |
79 } | |
80 | |
81 static void dump_index(cram_fd *fd) { | |
82 int i; | |
83 for (i = 0; i < fd->index_sz; i++) { | |
84 dump_index_(&fd->index[i], 0); | |
85 } | |
86 } | |
87 #endif | |
88 | |
89 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { | |
90 int sign = 1; | |
91 int32_t val = 0; | |
92 size_t p = *pos; | |
93 | |
94 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) | |
95 p++; | |
96 | |
97 if (p < k->l && k->s[p] == '-') | |
98 sign = -1, p++; | |
99 | |
100 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) | |
101 return -1; | |
102 | |
103 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') | |
104 val = val*10 + k->s[p++]-'0'; | |
105 | |
106 *pos = p; | |
107 *val_p = sign*val; | |
108 | |
109 return 0; | |
110 } | |
111 | |
112 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) { | |
113 int sign = 1; | |
114 int64_t val = 0; | |
115 size_t p = *pos; | |
116 | |
117 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) | |
118 p++; | |
119 | |
120 if (p < k->l && k->s[p] == '-') | |
121 sign = -1, p++; | |
122 | |
123 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) | |
124 return -1; | |
125 | |
126 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') | |
127 val = val*10 + k->s[p++]-'0'; | |
128 | |
129 *pos = p; | |
130 *val_p = sign*val; | |
131 | |
132 return 0; | |
133 } | |
134 | |
135 /* | |
136 * Loads a CRAM .crai index into memory. | |
137 * | |
138 * Returns 0 for success | |
139 * -1 for failure | |
140 */ | |
141 int cram_index_load(cram_fd *fd, const char *fn) { | |
142 char fn2[PATH_MAX]; | |
143 char buf[65536]; | |
144 ssize_t len; | |
145 kstring_t kstr = {0}; | |
146 hFILE *fp; | |
147 cram_index *idx; | |
148 cram_index **idx_stack = NULL, *ep, e; | |
149 int idx_stack_alloc = 0, idx_stack_ptr = 0; | |
150 size_t pos = 0; | |
151 | |
152 /* Check if already loaded */ | |
153 if (fd->index) | |
154 return 0; | |
155 | |
156 fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index)); | |
157 if (!fd->index) | |
158 return -1; | |
159 | |
160 idx = &fd->index[0]; | |
161 idx->refid = -1; | |
162 idx->start = INT_MIN; | |
163 idx->end = INT_MAX; | |
164 | |
165 idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack)); | |
166 idx_stack[idx_stack_ptr] = idx; | |
167 | |
168 sprintf(fn2, "%s.crai", fn); | |
169 if (!(fp = hopen(fn2, "r"))) { | |
170 perror(fn2); | |
171 free(idx_stack); | |
172 return -1; | |
173 } | |
174 | |
175 // Load the file into memory | |
176 while ((len = hread(fp, buf, 65536)) > 0) | |
177 kputsn(buf, len, &kstr); | |
178 if (len < 0 || kstr.l < 2) { | |
179 if (kstr.s) | |
180 free(kstr.s); | |
181 free(idx_stack); | |
182 return -1; | |
183 } | |
184 | |
185 if (hclose(fp)) { | |
186 if (kstr.s) | |
187 free(kstr.s); | |
188 free(idx_stack); | |
189 return -1; | |
190 } | |
191 | |
192 | |
193 // Uncompress if required | |
194 if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) { | |
195 size_t l; | |
196 char *s = zlib_mem_inflate(kstr.s, kstr.l, &l); | |
197 free(kstr.s); | |
198 if (!s) { | |
199 free(idx_stack); | |
200 return -1; | |
201 } | |
202 kstr.s = s; | |
203 kstr.l = l; | |
204 kstr.m = l; // conservative estimate of the size allocated | |
205 kputsn("", 0, &kstr); // ensure kstr.s is NUL-terminated | |
206 } | |
207 | |
208 | |
209 // Parse it line at a time | |
210 do { | |
211 /* 1.1 layout */ | |
212 if (kget_int32(&kstr, &pos, &e.refid) == -1) { | |
213 free(kstr.s); free(idx_stack); return -1; | |
214 } | |
215 if (kget_int32(&kstr, &pos, &e.start) == -1) { | |
216 free(kstr.s); free(idx_stack); return -1; | |
217 } | |
218 if (kget_int32(&kstr, &pos, &e.end) == -1) { | |
219 free(kstr.s); free(idx_stack); return -1; | |
220 } | |
221 if (kget_int64(&kstr, &pos, &e.offset) == -1) { | |
222 free(kstr.s); free(idx_stack); return -1; | |
223 } | |
224 if (kget_int32(&kstr, &pos, &e.slice) == -1) { | |
225 free(kstr.s); free(idx_stack); return -1; | |
226 } | |
227 if (kget_int32(&kstr, &pos, &e.len) == -1) { | |
228 free(kstr.s); free(idx_stack); return -1; | |
229 } | |
230 | |
231 e.end += e.start-1; | |
232 //printf("%d/%d..%d\n", e.refid, e.start, e.end); | |
233 | |
234 if (e.refid < -1) { | |
235 free(kstr.s); | |
236 free(idx_stack); | |
237 fprintf(stderr, "Malformed index file, refid %d\n", e.refid); | |
238 return -1; | |
239 } | |
240 | |
241 if (e.refid != idx->refid) { | |
242 if (fd->index_sz < e.refid+2) { | |
243 size_t index_end = fd->index_sz * sizeof(*fd->index); | |
244 fd->index_sz = e.refid+2; | |
245 fd->index = realloc(fd->index, | |
246 fd->index_sz * sizeof(*fd->index)); | |
247 memset(((char *)fd->index) + index_end, 0, | |
248 fd->index_sz * sizeof(*fd->index) - index_end); | |
249 } | |
250 idx = &fd->index[e.refid+1]; | |
251 idx->refid = e.refid; | |
252 idx->start = INT_MIN; | |
253 idx->end = INT_MAX; | |
254 idx->nslice = idx->nalloc = 0; | |
255 idx->e = NULL; | |
256 idx_stack[(idx_stack_ptr = 0)] = idx; | |
257 } | |
258 | |
259 while (!(e.start >= idx->start && e.end <= idx->end)) { | |
260 idx = idx_stack[--idx_stack_ptr]; | |
261 } | |
262 | |
263 // Now contains, so append | |
264 if (idx->nslice+1 >= idx->nalloc) { | |
265 idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16; | |
266 idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e)); | |
267 } | |
268 | |
269 e.nalloc = e.nslice = 0; e.e = NULL; | |
270 *(ep = &idx->e[idx->nslice++]) = e; | |
271 idx = ep; | |
272 | |
273 if (++idx_stack_ptr >= idx_stack_alloc) { | |
274 idx_stack_alloc *= 2; | |
275 idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack)); | |
276 } | |
277 idx_stack[idx_stack_ptr] = idx; | |
278 | |
279 while (pos < kstr.l && kstr.s[pos] != '\n') | |
280 pos++; | |
281 pos++; | |
282 } while (pos < kstr.l); | |
283 | |
284 free(idx_stack); | |
285 free(kstr.s); | |
286 | |
287 // dump_index(fd); | |
288 | |
289 return 0; | |
290 } | |
291 | |
292 static void cram_index_free_recurse(cram_index *e) { | |
293 if (e->e) { | |
294 int i; | |
295 for (i = 0; i < e->nslice; i++) { | |
296 cram_index_free_recurse(&e->e[i]); | |
297 } | |
298 free(e->e); | |
299 } | |
300 } | |
301 | |
302 void cram_index_free(cram_fd *fd) { | |
303 int i; | |
304 | |
305 if (!fd->index) | |
306 return; | |
307 | |
308 for (i = 0; i < fd->index_sz; i++) { | |
309 cram_index_free_recurse(&fd->index[i]); | |
310 } | |
311 free(fd->index); | |
312 | |
313 fd->index = NULL; | |
314 } | |
315 | |
316 /* | |
317 * Searches the index for the first slice overlapping a reference ID | |
318 * and position, or one immediately preceding it if none is found in | |
319 * the index to overlap this position. (Our index may have missing | |
320 * entries, but we require at least one per reference.) | |
321 * | |
322 * If the index finds multiple slices overlapping this position we | |
323 * return the first one only. Subsequent calls should specifying | |
324 * "from" as the last slice we checked to find the next one. Otherwise | |
325 * set "from" to be NULL to find the first one. | |
326 * | |
327 * Returns the cram_index pointer on sucess | |
328 * NULL on failure | |
329 */ | |
330 cram_index *cram_index_query(cram_fd *fd, int refid, int pos, | |
331 cram_index *from) { | |
332 int i, j, k; | |
333 cram_index *e; | |
334 | |
335 if (refid+1 < 0 || refid+1 >= fd->index_sz) | |
336 return NULL; | |
337 | |
338 i = 0, j = fd->index[refid+1].nslice-1; | |
339 | |
340 if (!from) | |
341 from = &fd->index[refid+1]; | |
342 | |
343 for (k = j/2; k != i; k = (j-i)/2 + i) { | |
344 if (from->e[k].refid > refid) { | |
345 j = k; | |
346 continue; | |
347 } | |
348 | |
349 if (from->e[k].refid < refid) { | |
350 i = k; | |
351 continue; | |
352 } | |
353 | |
354 if (from->e[k].start >= pos) { | |
355 j = k; | |
356 continue; | |
357 } | |
358 | |
359 if (from->e[k].start < pos) { | |
360 i = k; | |
361 continue; | |
362 } | |
363 } | |
364 // i==j or i==j-1. Check if j is better. | |
365 if (from->e[j].start < pos && from->e[j].refid == refid) | |
366 i = j; | |
367 | |
368 /* The above found *a* bin overlapping, but not necessarily the first */ | |
369 while (i > 0 && from->e[i-1].end >= pos) | |
370 i--; | |
371 | |
372 /* Special case for matching a start pos */ | |
373 if (i+1 < from->nslice && | |
374 from->e[i+1].start == pos && | |
375 from->e[i+1].refid == refid) | |
376 i++; | |
377 | |
378 e = &from->e[i]; | |
379 | |
380 return e; | |
381 } | |
382 | |
383 | |
384 /* | |
385 * Skips to a container overlapping the start coordinate listed in | |
386 * cram_range. | |
387 * | |
388 * In theory we call cram_index_query multiple times, once per slice | |
389 * overlapping the range. However slices may be absent from the index | |
390 * which makes this problematic. Instead we find the left-most slice | |
391 * and then read from then on, skipping decoding of slices and/or | |
392 * whole containers when they don't overlap the specified cram_range. | |
393 * | |
394 * Returns 0 on success | |
395 * -1 on failure | |
396 */ | |
397 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { | |
398 cram_index *e; | |
399 | |
400 // Ideally use an index, so see if we have one. | |
401 if ((e = cram_index_query(fd, r->refid, r->start, NULL))) { | |
402 if (0 != cram_seek(fd, e->offset, SEEK_SET)) | |
403 if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) | |
404 return -1; | |
405 } else { | |
406 fprintf(stderr, "Unknown reference ID. Missing from index?\n"); | |
407 return -1; | |
408 } | |
409 | |
410 if (fd->ctr) { | |
411 cram_free_container(fd->ctr); | |
412 fd->ctr = NULL; | |
413 fd->ooc = 0; | |
414 } | |
415 | |
416 return 0; | |
417 } | |
418 | |
419 | |
420 /* | |
421 * A specialised form of cram_index_build (below) that deals with slices | |
422 * having multiple references in this (ref_id -2). In this scenario we | |
423 * decode the slice to look at the RI data series instead. | |
424 * | |
425 * Returns 0 on success | |
426 * -1 on failure | |
427 */ | |
428 static int cram_index_build_multiref(cram_fd *fd, | |
429 cram_container *c, | |
430 cram_slice *s, | |
431 zfp *fp, | |
432 off_t cpos, | |
433 int32_t landmark, | |
434 int sz) { | |
435 int i, ref = -2, ref_start = 0, ref_end; | |
436 char buf[1024]; | |
437 | |
438 if (0 != cram_decode_slice(fd, c, s, fd->header)) | |
439 return -1; | |
440 | |
441 ref_end = INT_MIN; | |
442 for (i = 0; i < s->hdr->num_records; i++) { | |
443 if (s->crecs[i].ref_id == ref) { | |
444 if (ref_end < s->crecs[i].aend) | |
445 ref_end = s->crecs[i].aend; | |
446 continue; | |
447 } | |
448 | |
449 if (ref != -2) { | |
450 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", | |
451 ref, ref_start, ref_end - ref_start + 1, | |
452 (int64_t)cpos, landmark, sz); | |
453 zfputs(buf, fp); | |
454 } | |
455 | |
456 ref = s->crecs[i].ref_id; | |
457 ref_start = s->crecs[i].apos; | |
458 ref_end = INT_MIN; | |
459 } | |
460 | |
461 if (ref != -2) { | |
462 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", | |
463 ref, ref_start, ref_end - ref_start + 1, | |
464 (int64_t)cpos, landmark, sz); | |
465 zfputs(buf, fp); | |
466 } | |
467 | |
468 return 0; | |
469 } | |
470 | |
471 /* | |
472 * Builds an index file. | |
473 * | |
474 * fd is a newly opened cram file that we wish to index. | |
475 * fn_base is the filename of the associated CRAM file. Internally we | |
476 * add ".crai" to this to get the index filename. | |
477 * | |
478 * Returns 0 on success | |
479 * -1 on failure | |
480 */ | |
481 int cram_index_build(cram_fd *fd, const char *fn_base) { | |
482 cram_container *c; | |
483 off_t cpos, spos, hpos; | |
484 zfp *fp; | |
485 char fn_idx[PATH_MAX]; | |
486 | |
487 if (strlen(fn_base) > PATH_MAX-6) | |
488 return -1; | |
489 | |
490 sprintf(fn_idx, "%s.crai", fn_base); | |
491 if (!(fp = zfopen(fn_idx, "wz"))) { | |
492 perror(fn_idx); | |
493 return -1; | |
494 } | |
495 | |
496 cpos = htell(fd->fp); | |
497 while ((c = cram_read_container(fd))) { | |
498 int j; | |
499 | |
500 if (fd->err) { | |
501 perror("Cram container read"); | |
502 return 1; | |
503 } | |
504 | |
505 hpos = htell(fd->fp); | |
506 | |
507 if (!(c->comp_hdr_block = cram_read_block(fd))) | |
508 return 1; | |
509 assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER); | |
510 | |
511 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); | |
512 if (!c->comp_hdr) | |
513 return -1; | |
514 | |
515 // 2.0 format | |
516 for (j = 0; j < c->num_landmarks; j++) { | |
517 char buf[1024]; | |
518 cram_slice *s; | |
519 int sz; | |
520 | |
521 spos = htell(fd->fp); | |
522 assert(spos - cpos - c->offset == c->landmark[j]); | |
523 | |
524 if (!(s = cram_read_slice(fd))) { | |
525 zfclose(fp); | |
526 return -1; | |
527 } | |
528 | |
529 sz = (int)(htell(fd->fp) - spos); | |
530 | |
531 if (s->hdr->ref_seq_id == -2) { | |
532 cram_index_build_multiref(fd, c, s, fp, | |
533 cpos, c->landmark[j], sz); | |
534 } else { | |
535 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", | |
536 s->hdr->ref_seq_id, s->hdr->ref_seq_start, | |
537 s->hdr->ref_seq_span, (int64_t)cpos, | |
538 c->landmark[j], sz); | |
539 zfputs(buf, fp); | |
540 } | |
541 | |
542 cram_free_slice(s); | |
543 } | |
544 | |
545 cpos = htell(fd->fp); | |
546 assert(cpos == hpos + c->length); | |
547 | |
548 cram_free_container(c); | |
549 } | |
550 if (fd->err) { | |
551 zfclose(fp); | |
552 return -1; | |
553 } | |
554 | |
555 | |
556 return zfclose(fp); | |
557 } |