Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_index.c @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* | |
| 2 Copyright (c) 2013-2014 Genome Research Ltd. | |
| 3 Author: James Bonfield <jkb@sanger.ac.uk> | |
| 4 | |
| 5 Redistribution and use in source and binary forms, with or without | |
| 6 modification, are permitted provided that the following conditions are met: | |
| 7 | |
| 8 1. Redistributions of source code must retain the above copyright notice, | |
| 9 this list of conditions and the following disclaimer. | |
| 10 | |
| 11 2. Redistributions in binary form must reproduce the above copyright notice, | |
| 12 this list of conditions and the following disclaimer in the documentation | |
| 13 and/or other materials provided with the distribution. | |
| 14 | |
| 15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
| 16 Institute nor the names of its contributors may be used to endorse or promote | |
| 17 products derived from this software without specific prior written permission. | |
| 18 | |
| 19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
| 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
| 23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 */ | |
| 30 | |
| 31 /* | |
| 32 * The index is a gzipped tab-delimited text file with one line per slice. | |
| 33 * The columns are: | |
| 34 * 1: reference number (0 to N-1, as per BAM ref_id) | |
| 35 * 2: reference position of 1st read in slice (1..?) | |
| 36 * 3: number of reads in slice | |
| 37 * 4: offset of container start (relative to end of SAM header, so 1st | |
| 38 * container is offset 0). | |
| 39 * 5: slice number within container (ie which landmark). | |
| 40 * | |
| 41 * In memory, we hold this in a nested containment list. Each list element is | |
| 42 * a cram_index struct. Each element in turn can contain its own list of | |
| 43 * cram_index structs. | |
| 44 * | |
| 45 * Any start..end range which is entirely contained within another (and | |
| 46 * earlier as it is sorted) range will be held within it. This ensures that | |
| 47 * the outer list will never have containments and we can safely do a | |
| 48 * binary search to find the first range which overlaps any given coordinate. | |
| 49 */ | |
| 50 | |
| 51 #ifdef HAVE_CONFIG_H | |
| 52 #include "io_lib_config.h" | |
| 53 #endif | |
| 54 | |
| 55 #include <stdio.h> | |
| 56 #include <errno.h> | |
| 57 #include <assert.h> | |
| 58 #include <stdlib.h> | |
| 59 #include <string.h> | |
| 60 #include <zlib.h> | |
| 61 #include <sys/types.h> | |
| 62 #include <sys/stat.h> | |
| 63 #include <math.h> | |
| 64 #include <ctype.h> | |
| 65 | |
| 66 #include "htslib/hfile.h" | |
| 67 #include "cram/cram.h" | |
| 68 #include "cram/os.h" | |
| 69 #include "cram/zfio.h" | |
| 70 | |
| 71 #if 0 | |
| 72 static void dump_index_(cram_index *e, int level) { | |
| 73 int i, n; | |
| 74 n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end); | |
| 75 printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset); | |
| 76 for (i = 0; i < e->nslice; i++) { | |
| 77 dump_index_(&e->e[i], level+1); | |
| 78 } | |
| 79 } | |
| 80 | |
| 81 static void dump_index(cram_fd *fd) { | |
| 82 int i; | |
| 83 for (i = 0; i < fd->index_sz; i++) { | |
| 84 dump_index_(&fd->index[i], 0); | |
| 85 } | |
| 86 } | |
| 87 #endif | |
| 88 | |
| 89 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { | |
| 90 int sign = 1; | |
| 91 int32_t val = 0; | |
| 92 size_t p = *pos; | |
| 93 | |
| 94 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) | |
| 95 p++; | |
| 96 | |
| 97 if (p < k->l && k->s[p] == '-') | |
| 98 sign = -1, p++; | |
| 99 | |
| 100 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) | |
| 101 return -1; | |
| 102 | |
| 103 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') | |
| 104 val = val*10 + k->s[p++]-'0'; | |
| 105 | |
| 106 *pos = p; | |
| 107 *val_p = sign*val; | |
| 108 | |
| 109 return 0; | |
| 110 } | |
| 111 | |
| 112 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) { | |
| 113 int sign = 1; | |
| 114 int64_t val = 0; | |
| 115 size_t p = *pos; | |
| 116 | |
| 117 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) | |
| 118 p++; | |
| 119 | |
| 120 if (p < k->l && k->s[p] == '-') | |
| 121 sign = -1, p++; | |
| 122 | |
| 123 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) | |
| 124 return -1; | |
| 125 | |
| 126 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') | |
| 127 val = val*10 + k->s[p++]-'0'; | |
| 128 | |
| 129 *pos = p; | |
| 130 *val_p = sign*val; | |
| 131 | |
| 132 return 0; | |
| 133 } | |
| 134 | |
| 135 /* | |
| 136 * Loads a CRAM .crai index into memory. | |
| 137 * | |
| 138 * Returns 0 for success | |
| 139 * -1 for failure | |
| 140 */ | |
| 141 int cram_index_load(cram_fd *fd, const char *fn) { | |
| 142 char fn2[PATH_MAX]; | |
| 143 char buf[65536]; | |
| 144 ssize_t len; | |
| 145 kstring_t kstr = {0}; | |
| 146 hFILE *fp; | |
| 147 cram_index *idx; | |
| 148 cram_index **idx_stack = NULL, *ep, e; | |
| 149 int idx_stack_alloc = 0, idx_stack_ptr = 0; | |
| 150 size_t pos = 0; | |
| 151 | |
| 152 /* Check if already loaded */ | |
| 153 if (fd->index) | |
| 154 return 0; | |
| 155 | |
| 156 fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index)); | |
| 157 if (!fd->index) | |
| 158 return -1; | |
| 159 | |
| 160 idx = &fd->index[0]; | |
| 161 idx->refid = -1; | |
| 162 idx->start = INT_MIN; | |
| 163 idx->end = INT_MAX; | |
| 164 | |
| 165 idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack)); | |
| 166 idx_stack[idx_stack_ptr] = idx; | |
| 167 | |
| 168 sprintf(fn2, "%s.crai", fn); | |
| 169 if (!(fp = hopen(fn2, "r"))) { | |
| 170 perror(fn2); | |
| 171 free(idx_stack); | |
| 172 return -1; | |
| 173 } | |
| 174 | |
| 175 // Load the file into memory | |
| 176 while ((len = hread(fp, buf, 65536)) > 0) | |
| 177 kputsn(buf, len, &kstr); | |
| 178 if (len < 0 || kstr.l < 2) { | |
| 179 if (kstr.s) | |
| 180 free(kstr.s); | |
| 181 free(idx_stack); | |
| 182 return -1; | |
| 183 } | |
| 184 | |
| 185 if (hclose(fp)) { | |
| 186 if (kstr.s) | |
| 187 free(kstr.s); | |
| 188 free(idx_stack); | |
| 189 return -1; | |
| 190 } | |
| 191 | |
| 192 | |
| 193 // Uncompress if required | |
| 194 if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) { | |
| 195 size_t l; | |
| 196 char *s = zlib_mem_inflate(kstr.s, kstr.l, &l); | |
| 197 free(kstr.s); | |
| 198 if (!s) { | |
| 199 free(idx_stack); | |
| 200 return -1; | |
| 201 } | |
| 202 kstr.s = s; | |
| 203 kstr.l = l; | |
| 204 kstr.m = l; // conservative estimate of the size allocated | |
| 205 kputsn("", 0, &kstr); // ensure kstr.s is NUL-terminated | |
| 206 } | |
| 207 | |
| 208 | |
| 209 // Parse it line at a time | |
| 210 do { | |
| 211 /* 1.1 layout */ | |
| 212 if (kget_int32(&kstr, &pos, &e.refid) == -1) { | |
| 213 free(kstr.s); free(idx_stack); return -1; | |
| 214 } | |
| 215 if (kget_int32(&kstr, &pos, &e.start) == -1) { | |
| 216 free(kstr.s); free(idx_stack); return -1; | |
| 217 } | |
| 218 if (kget_int32(&kstr, &pos, &e.end) == -1) { | |
| 219 free(kstr.s); free(idx_stack); return -1; | |
| 220 } | |
| 221 if (kget_int64(&kstr, &pos, &e.offset) == -1) { | |
| 222 free(kstr.s); free(idx_stack); return -1; | |
| 223 } | |
| 224 if (kget_int32(&kstr, &pos, &e.slice) == -1) { | |
| 225 free(kstr.s); free(idx_stack); return -1; | |
| 226 } | |
| 227 if (kget_int32(&kstr, &pos, &e.len) == -1) { | |
| 228 free(kstr.s); free(idx_stack); return -1; | |
| 229 } | |
| 230 | |
| 231 e.end += e.start-1; | |
| 232 //printf("%d/%d..%d\n", e.refid, e.start, e.end); | |
| 233 | |
| 234 if (e.refid < -1) { | |
| 235 free(kstr.s); | |
| 236 free(idx_stack); | |
| 237 fprintf(stderr, "Malformed index file, refid %d\n", e.refid); | |
| 238 return -1; | |
| 239 } | |
| 240 | |
| 241 if (e.refid != idx->refid) { | |
| 242 if (fd->index_sz < e.refid+2) { | |
| 243 size_t index_end = fd->index_sz * sizeof(*fd->index); | |
| 244 fd->index_sz = e.refid+2; | |
| 245 fd->index = realloc(fd->index, | |
| 246 fd->index_sz * sizeof(*fd->index)); | |
| 247 memset(((char *)fd->index) + index_end, 0, | |
| 248 fd->index_sz * sizeof(*fd->index) - index_end); | |
| 249 } | |
| 250 idx = &fd->index[e.refid+1]; | |
| 251 idx->refid = e.refid; | |
| 252 idx->start = INT_MIN; | |
| 253 idx->end = INT_MAX; | |
| 254 idx->nslice = idx->nalloc = 0; | |
| 255 idx->e = NULL; | |
| 256 idx_stack[(idx_stack_ptr = 0)] = idx; | |
| 257 } | |
| 258 | |
| 259 while (!(e.start >= idx->start && e.end <= idx->end)) { | |
| 260 idx = idx_stack[--idx_stack_ptr]; | |
| 261 } | |
| 262 | |
| 263 // Now contains, so append | |
| 264 if (idx->nslice+1 >= idx->nalloc) { | |
| 265 idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16; | |
| 266 idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e)); | |
| 267 } | |
| 268 | |
| 269 e.nalloc = e.nslice = 0; e.e = NULL; | |
| 270 *(ep = &idx->e[idx->nslice++]) = e; | |
| 271 idx = ep; | |
| 272 | |
| 273 if (++idx_stack_ptr >= idx_stack_alloc) { | |
| 274 idx_stack_alloc *= 2; | |
| 275 idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack)); | |
| 276 } | |
| 277 idx_stack[idx_stack_ptr] = idx; | |
| 278 | |
| 279 while (pos < kstr.l && kstr.s[pos] != '\n') | |
| 280 pos++; | |
| 281 pos++; | |
| 282 } while (pos < kstr.l); | |
| 283 | |
| 284 free(idx_stack); | |
| 285 free(kstr.s); | |
| 286 | |
| 287 // dump_index(fd); | |
| 288 | |
| 289 return 0; | |
| 290 } | |
| 291 | |
| 292 static void cram_index_free_recurse(cram_index *e) { | |
| 293 if (e->e) { | |
| 294 int i; | |
| 295 for (i = 0; i < e->nslice; i++) { | |
| 296 cram_index_free_recurse(&e->e[i]); | |
| 297 } | |
| 298 free(e->e); | |
| 299 } | |
| 300 } | |
| 301 | |
| 302 void cram_index_free(cram_fd *fd) { | |
| 303 int i; | |
| 304 | |
| 305 if (!fd->index) | |
| 306 return; | |
| 307 | |
| 308 for (i = 0; i < fd->index_sz; i++) { | |
| 309 cram_index_free_recurse(&fd->index[i]); | |
| 310 } | |
| 311 free(fd->index); | |
| 312 | |
| 313 fd->index = NULL; | |
| 314 } | |
| 315 | |
| 316 /* | |
| 317 * Searches the index for the first slice overlapping a reference ID | |
| 318 * and position, or one immediately preceding it if none is found in | |
| 319 * the index to overlap this position. (Our index may have missing | |
| 320 * entries, but we require at least one per reference.) | |
| 321 * | |
| 322 * If the index finds multiple slices overlapping this position we | |
| 323 * return the first one only. Subsequent calls should specifying | |
| 324 * "from" as the last slice we checked to find the next one. Otherwise | |
| 325 * set "from" to be NULL to find the first one. | |
| 326 * | |
| 327 * Returns the cram_index pointer on sucess | |
| 328 * NULL on failure | |
| 329 */ | |
| 330 cram_index *cram_index_query(cram_fd *fd, int refid, int pos, | |
| 331 cram_index *from) { | |
| 332 int i, j, k; | |
| 333 cram_index *e; | |
| 334 | |
| 335 if (refid+1 < 0 || refid+1 >= fd->index_sz) | |
| 336 return NULL; | |
| 337 | |
| 338 i = 0, j = fd->index[refid+1].nslice-1; | |
| 339 | |
| 340 if (!from) | |
| 341 from = &fd->index[refid+1]; | |
| 342 | |
| 343 for (k = j/2; k != i; k = (j-i)/2 + i) { | |
| 344 if (from->e[k].refid > refid) { | |
| 345 j = k; | |
| 346 continue; | |
| 347 } | |
| 348 | |
| 349 if (from->e[k].refid < refid) { | |
| 350 i = k; | |
| 351 continue; | |
| 352 } | |
| 353 | |
| 354 if (from->e[k].start >= pos) { | |
| 355 j = k; | |
| 356 continue; | |
| 357 } | |
| 358 | |
| 359 if (from->e[k].start < pos) { | |
| 360 i = k; | |
| 361 continue; | |
| 362 } | |
| 363 } | |
| 364 // i==j or i==j-1. Check if j is better. | |
| 365 if (from->e[j].start < pos && from->e[j].refid == refid) | |
| 366 i = j; | |
| 367 | |
| 368 /* The above found *a* bin overlapping, but not necessarily the first */ | |
| 369 while (i > 0 && from->e[i-1].end >= pos) | |
| 370 i--; | |
| 371 | |
| 372 /* Special case for matching a start pos */ | |
| 373 if (i+1 < from->nslice && | |
| 374 from->e[i+1].start == pos && | |
| 375 from->e[i+1].refid == refid) | |
| 376 i++; | |
| 377 | |
| 378 e = &from->e[i]; | |
| 379 | |
| 380 return e; | |
| 381 } | |
| 382 | |
| 383 | |
| 384 /* | |
| 385 * Skips to a container overlapping the start coordinate listed in | |
| 386 * cram_range. | |
| 387 * | |
| 388 * In theory we call cram_index_query multiple times, once per slice | |
| 389 * overlapping the range. However slices may be absent from the index | |
| 390 * which makes this problematic. Instead we find the left-most slice | |
| 391 * and then read from then on, skipping decoding of slices and/or | |
| 392 * whole containers when they don't overlap the specified cram_range. | |
| 393 * | |
| 394 * Returns 0 on success | |
| 395 * -1 on failure | |
| 396 */ | |
| 397 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { | |
| 398 cram_index *e; | |
| 399 | |
| 400 // Ideally use an index, so see if we have one. | |
| 401 if ((e = cram_index_query(fd, r->refid, r->start, NULL))) { | |
| 402 if (0 != cram_seek(fd, e->offset, SEEK_SET)) | |
| 403 if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) | |
| 404 return -1; | |
| 405 } else { | |
| 406 fprintf(stderr, "Unknown reference ID. Missing from index?\n"); | |
| 407 return -1; | |
| 408 } | |
| 409 | |
| 410 if (fd->ctr) { | |
| 411 cram_free_container(fd->ctr); | |
| 412 fd->ctr = NULL; | |
| 413 fd->ooc = 0; | |
| 414 } | |
| 415 | |
| 416 return 0; | |
| 417 } | |
| 418 | |
| 419 | |
| 420 /* | |
| 421 * A specialised form of cram_index_build (below) that deals with slices | |
| 422 * having multiple references in this (ref_id -2). In this scenario we | |
| 423 * decode the slice to look at the RI data series instead. | |
| 424 * | |
| 425 * Returns 0 on success | |
| 426 * -1 on failure | |
| 427 */ | |
| 428 static int cram_index_build_multiref(cram_fd *fd, | |
| 429 cram_container *c, | |
| 430 cram_slice *s, | |
| 431 zfp *fp, | |
| 432 off_t cpos, | |
| 433 int32_t landmark, | |
| 434 int sz) { | |
| 435 int i, ref = -2, ref_start = 0, ref_end; | |
| 436 char buf[1024]; | |
| 437 | |
| 438 if (0 != cram_decode_slice(fd, c, s, fd->header)) | |
| 439 return -1; | |
| 440 | |
| 441 ref_end = INT_MIN; | |
| 442 for (i = 0; i < s->hdr->num_records; i++) { | |
| 443 if (s->crecs[i].ref_id == ref) { | |
| 444 if (ref_end < s->crecs[i].aend) | |
| 445 ref_end = s->crecs[i].aend; | |
| 446 continue; | |
| 447 } | |
| 448 | |
| 449 if (ref != -2) { | |
| 450 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", | |
| 451 ref, ref_start, ref_end - ref_start + 1, | |
| 452 (int64_t)cpos, landmark, sz); | |
| 453 zfputs(buf, fp); | |
| 454 } | |
| 455 | |
| 456 ref = s->crecs[i].ref_id; | |
| 457 ref_start = s->crecs[i].apos; | |
| 458 ref_end = INT_MIN; | |
| 459 } | |
| 460 | |
| 461 if (ref != -2) { | |
| 462 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", | |
| 463 ref, ref_start, ref_end - ref_start + 1, | |
| 464 (int64_t)cpos, landmark, sz); | |
| 465 zfputs(buf, fp); | |
| 466 } | |
| 467 | |
| 468 return 0; | |
| 469 } | |
| 470 | |
| 471 /* | |
| 472 * Builds an index file. | |
| 473 * | |
| 474 * fd is a newly opened cram file that we wish to index. | |
| 475 * fn_base is the filename of the associated CRAM file. Internally we | |
| 476 * add ".crai" to this to get the index filename. | |
| 477 * | |
| 478 * Returns 0 on success | |
| 479 * -1 on failure | |
| 480 */ | |
| 481 int cram_index_build(cram_fd *fd, const char *fn_base) { | |
| 482 cram_container *c; | |
| 483 off_t cpos, spos, hpos; | |
| 484 zfp *fp; | |
| 485 char fn_idx[PATH_MAX]; | |
| 486 | |
| 487 if (strlen(fn_base) > PATH_MAX-6) | |
| 488 return -1; | |
| 489 | |
| 490 sprintf(fn_idx, "%s.crai", fn_base); | |
| 491 if (!(fp = zfopen(fn_idx, "wz"))) { | |
| 492 perror(fn_idx); | |
| 493 return -1; | |
| 494 } | |
| 495 | |
| 496 cpos = htell(fd->fp); | |
| 497 while ((c = cram_read_container(fd))) { | |
| 498 int j; | |
| 499 | |
| 500 if (fd->err) { | |
| 501 perror("Cram container read"); | |
| 502 return 1; | |
| 503 } | |
| 504 | |
| 505 hpos = htell(fd->fp); | |
| 506 | |
| 507 if (!(c->comp_hdr_block = cram_read_block(fd))) | |
| 508 return 1; | |
| 509 assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER); | |
| 510 | |
| 511 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); | |
| 512 if (!c->comp_hdr) | |
| 513 return -1; | |
| 514 | |
| 515 // 2.0 format | |
| 516 for (j = 0; j < c->num_landmarks; j++) { | |
| 517 char buf[1024]; | |
| 518 cram_slice *s; | |
| 519 int sz; | |
| 520 | |
| 521 spos = htell(fd->fp); | |
| 522 assert(spos - cpos - c->offset == c->landmark[j]); | |
| 523 | |
| 524 if (!(s = cram_read_slice(fd))) { | |
| 525 zfclose(fp); | |
| 526 return -1; | |
| 527 } | |
| 528 | |
| 529 sz = (int)(htell(fd->fp) - spos); | |
| 530 | |
| 531 if (s->hdr->ref_seq_id == -2) { | |
| 532 cram_index_build_multiref(fd, c, s, fp, | |
| 533 cpos, c->landmark[j], sz); | |
| 534 } else { | |
| 535 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", | |
| 536 s->hdr->ref_seq_id, s->hdr->ref_seq_start, | |
| 537 s->hdr->ref_seq_span, (int64_t)cpos, | |
| 538 c->landmark[j], sz); | |
| 539 zfputs(buf, fp); | |
| 540 } | |
| 541 | |
| 542 cram_free_slice(s); | |
| 543 } | |
| 544 | |
| 545 cpos = htell(fd->fp); | |
| 546 assert(cpos == hpos + c->length); | |
| 547 | |
| 548 cram_free_container(c); | |
| 549 } | |
| 550 if (fd->err) { | |
| 551 zfclose(fp); | |
| 552 return -1; | |
| 553 } | |
| 554 | |
| 555 | |
| 556 return zfclose(fp); | |
| 557 } |
