Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/hts.c @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions. | |
| 2 | |
| 3 Copyright (C) 2008, 2009, 2012-2015 Genome Research Ltd. | |
| 4 Copyright (C) 2012, 2013 Broad Institute. | |
| 5 | |
| 6 Author: Heng Li <lh3@sanger.ac.uk> | |
| 7 | |
| 8 Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 9 of this software and associated documentation files (the "Software"), to deal | |
| 10 in the Software without restriction, including without limitation the rights | |
| 11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 12 copies of the Software, and to permit persons to whom the Software is | |
| 13 furnished to do so, subject to the following conditions: | |
| 14 | |
| 15 The above copyright notice and this permission notice shall be included in | |
| 16 all copies or substantial portions of the Software. | |
| 17 | |
| 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
| 21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
| 23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| 24 DEALINGS IN THE SOFTWARE. */ | |
| 25 | |
| 26 #include <zlib.h> | |
| 27 #include <ctype.h> | |
| 28 #include <stdio.h> | |
| 29 #include <string.h> | |
| 30 #include <stdlib.h> | |
| 31 #include <limits.h> | |
| 32 #include <fcntl.h> | |
| 33 #include <errno.h> | |
| 34 #include <sys/stat.h> | |
| 35 #include "htslib/bgzf.h" | |
| 36 #include "htslib/hts.h" | |
| 37 #include "cram/cram.h" | |
| 38 #include "htslib/hfile.h" | |
| 39 #include "version.h" | |
| 40 | |
| 41 #include "htslib/kseq.h" | |
| 42 #define KS_BGZF 1 | |
| 43 #if KS_BGZF | |
| 44 // bgzf now supports gzip-compressed files, the gzFile branch can be removed | |
| 45 KSTREAM_INIT2(, BGZF*, bgzf_read, 65536) | |
| 46 #else | |
| 47 KSTREAM_INIT2(, gzFile, gzread, 16384) | |
| 48 #endif | |
| 49 | |
| 50 #include "htslib/khash.h" | |
| 51 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal) | |
| 52 | |
| 53 int hts_verbose = 3; | |
| 54 | |
| 55 const char *hts_version() | |
| 56 { | |
| 57 return HTS_VERSION; | |
| 58 } | |
| 59 | |
| 60 const unsigned char seq_nt16_table[256] = { | |
| 61 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 62 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 63 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 64 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, | |
| 65 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, | |
| 66 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, | |
| 67 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, | |
| 68 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, | |
| 69 | |
| 70 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 71 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 72 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 73 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 74 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 75 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
| 77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 | |
| 78 }; | |
| 79 | |
| 80 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN"; | |
| 81 | |
| 82 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; | |
| 83 | |
| 84 /********************** | |
| 85 *** Basic file I/O *** | |
| 86 **********************/ | |
| 87 | |
| 88 static enum htsFormatCategory format_category(enum htsExactFormat fmt) | |
| 89 { | |
| 90 switch (fmt) { | |
| 91 case bam: | |
| 92 case sam: | |
| 93 case cram: | |
| 94 return sequence_data; | |
| 95 | |
| 96 case vcf: | |
| 97 case bcf: | |
| 98 return variant_data; | |
| 99 | |
| 100 case bai: | |
| 101 case crai: | |
| 102 case csi: | |
| 103 case gzi: | |
| 104 case tbi: | |
| 105 return index_file; | |
| 106 | |
| 107 case bed: | |
| 108 return region_list; | |
| 109 | |
| 110 case unknown_format: | |
| 111 case binary_format: | |
| 112 case text_format: | |
| 113 case format_maximum: | |
| 114 break; | |
| 115 } | |
| 116 | |
| 117 return unknown_category; | |
| 118 } | |
| 119 | |
| 120 // Decompress up to ten or so bytes by peeking at the file, which must be | |
| 121 // positioned at the start of a GZIP block. | |
| 122 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) | |
| 123 { | |
| 124 // Typically at most a couple of hundred bytes of input are required | |
| 125 // to get a few bytes of output from inflate(), so hopefully this buffer | |
| 126 // size suffices in general. | |
| 127 unsigned char buffer[512]; | |
| 128 z_stream zs; | |
| 129 ssize_t npeek = hpeek(fp, buffer, sizeof buffer); | |
| 130 | |
| 131 if (npeek < 0) return 0; | |
| 132 | |
| 133 zs.zalloc = NULL; | |
| 134 zs.zfree = NULL; | |
| 135 zs.next_in = buffer; | |
| 136 zs.avail_in = npeek; | |
| 137 zs.next_out = dest; | |
| 138 zs.avail_out = destsize; | |
| 139 if (inflateInit2(&zs, 31) != Z_OK) return 0; | |
| 140 | |
| 141 while (zs.total_out < destsize) | |
| 142 if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break; | |
| 143 | |
| 144 destsize = zs.total_out; | |
| 145 inflateEnd(&zs); | |
| 146 | |
| 147 return destsize; | |
| 148 } | |
| 149 | |
| 150 // Parse "x.y" text, taking care because the string is not NUL-terminated | |
| 151 // and filling in major/minor only when the digits are followed by a delimiter, | |
| 152 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer. | |
| 153 static void | |
| 154 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim) | |
| 155 { | |
| 156 const char *str = (const char *) u; | |
| 157 const char *slim = (const char *) ulim; | |
| 158 const char *s; | |
| 159 | |
| 160 fmt->version.major = fmt->version.minor = -1; | |
| 161 | |
| 162 for (s = str; s < slim; s++) if (!isdigit(*s)) break; | |
| 163 if (s < slim) { | |
| 164 fmt->version.major = atoi(str); | |
| 165 if (*s == '.') { | |
| 166 str = &s[1]; | |
| 167 for (s = str; s < slim; s++) if (!isdigit(*s)) break; | |
| 168 if (s < slim) | |
| 169 fmt->version.minor = atoi(str); | |
| 170 } | |
| 171 else | |
| 172 fmt->version.minor = 0; | |
| 173 } | |
| 174 } | |
| 175 | |
| 176 int hts_detect_format(hFILE *hfile, htsFormat *fmt) | |
| 177 { | |
| 178 unsigned char s[21]; | |
| 179 ssize_t len = hpeek(hfile, s, 18); | |
| 180 if (len < 0) return -1; | |
| 181 | |
| 182 if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) { | |
| 183 // The stream is either gzip-compressed or BGZF-compressed. | |
| 184 // Determine which, and decompress the first few bytes. | |
| 185 fmt->compression = (len >= 18 && (s[3] & 4) && | |
| 186 memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip; | |
| 187 len = decompress_peek(hfile, s, sizeof s); | |
| 188 } | |
| 189 else { | |
| 190 fmt->compression = no_compression; | |
| 191 len = hpeek(hfile, s, sizeof s); | |
| 192 } | |
| 193 if (len < 0) return -1; | |
| 194 | |
| 195 fmt->compression_level = -1; | |
| 196 fmt->specific = NULL; | |
| 197 | |
| 198 if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) { | |
| 199 fmt->category = sequence_data; | |
| 200 fmt->format = cram; | |
| 201 fmt->version.major = s[4], fmt->version.minor = s[5]; | |
| 202 fmt->compression = custom; | |
| 203 return 0; | |
| 204 } | |
| 205 else if (len >= 4 && s[3] <= '\4') { | |
| 206 if (memcmp(s, "BAM\1", 4) == 0) { | |
| 207 fmt->category = sequence_data; | |
| 208 fmt->format = bam; | |
| 209 // TODO Decompress enough to pick version from @HD-VN header | |
| 210 fmt->version.major = 1, fmt->version.minor = -1; | |
| 211 return 0; | |
| 212 } | |
| 213 else if (memcmp(s, "BAI\1", 4) == 0) { | |
| 214 fmt->category = index_file; | |
| 215 fmt->format = bai; | |
| 216 fmt->version.major = -1, fmt->version.minor = -1; | |
| 217 return 0; | |
| 218 } | |
| 219 else if (memcmp(s, "BCF\4", 4) == 0) { | |
| 220 fmt->category = variant_data; | |
| 221 fmt->format = bcf; | |
| 222 fmt->version.major = 1, fmt->version.minor = -1; | |
| 223 return 0; | |
| 224 } | |
| 225 else if (memcmp(s, "BCF\2", 4) == 0) { | |
| 226 fmt->category = variant_data; | |
| 227 fmt->format = bcf; | |
| 228 fmt->version.major = s[3]; | |
| 229 fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0; | |
| 230 return 0; | |
| 231 } | |
| 232 else if (memcmp(s, "CSI\1", 4) == 0) { | |
| 233 fmt->category = index_file; | |
| 234 fmt->format = csi; | |
| 235 fmt->version.major = 1, fmt->version.minor = -1; | |
| 236 return 0; | |
| 237 } | |
| 238 else if (memcmp(s, "TBI\1", 4) == 0) { | |
| 239 fmt->category = index_file; | |
| 240 fmt->format = tbi; | |
| 241 fmt->version.major = -1, fmt->version.minor = -1; | |
| 242 return 0; | |
| 243 } | |
| 244 } | |
| 245 else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) { | |
| 246 fmt->category = variant_data; | |
| 247 fmt->format = vcf; | |
| 248 if (len >= 21 && s[16] == 'v') | |
| 249 parse_version(fmt, &s[17], &s[len]); | |
| 250 else | |
| 251 fmt->version.major = fmt->version.minor = -1; | |
| 252 return 0; | |
| 253 } | |
| 254 else if (len >= 4 && s[0] == '@' && | |
| 255 (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 || | |
| 256 memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) { | |
| 257 fmt->category = sequence_data; | |
| 258 fmt->format = sam; | |
| 259 // @HD-VN is not guaranteed to be the first tag, but then @HD is | |
| 260 // not guaranteed to be present at all... | |
| 261 if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0) | |
| 262 parse_version(fmt, &s[7], &s[len]); | |
| 263 else | |
| 264 fmt->version.major = 1, fmt->version.minor = -1; | |
| 265 return 0; | |
| 266 } | |
| 267 else { | |
| 268 // Various possibilities for tab-delimited text: | |
| 269 // .crai (gzipped tab-delimited six columns: seqid 5*number) | |
| 270 // .bed ([3..12] tab-delimited columns) | |
| 271 // .bedpe (>= 10 tab-delimited columns) | |
| 272 // .sam (tab-delimited >= 11 columns: seqid number seqid...) | |
| 273 // FIXME For now, assume it's SAM | |
| 274 fmt->category = sequence_data; | |
| 275 fmt->format = sam; | |
| 276 fmt->version.major = 1, fmt->version.minor = -1; | |
| 277 return 0; | |
| 278 } | |
| 279 | |
| 280 fmt->category = unknown_category; | |
| 281 fmt->format = unknown_format; | |
| 282 fmt->version.major = fmt->version.minor = -1; | |
| 283 fmt->compression = no_compression; | |
| 284 return 0; | |
| 285 } | |
| 286 | |
| 287 char *hts_format_description(const htsFormat *format) | |
| 288 { | |
| 289 kstring_t str = { 0, 0, NULL }; | |
| 290 | |
| 291 switch (format->format) { | |
| 292 case sam: kputs("SAM", &str); break; | |
| 293 case bam: kputs("BAM", &str); break; | |
| 294 case cram: kputs("CRAM", &str); break; | |
| 295 case vcf: kputs("VCF", &str); break; | |
| 296 case bcf: | |
| 297 if (format->version.major == 1) kputs("Legacy BCF", &str); | |
| 298 else kputs("BCF", &str); | |
| 299 break; | |
| 300 case bai: kputs("BAI", &str); break; | |
| 301 case crai: kputs("CRAI", &str); break; | |
| 302 case csi: kputs("CSI", &str); break; | |
| 303 case tbi: kputs("Tabix", &str); break; | |
| 304 default: kputs("unknown", &str); break; | |
| 305 } | |
| 306 | |
| 307 if (format->version.major >= 0) { | |
| 308 kputs(" version ", &str); | |
| 309 kputw(format->version.major, &str); | |
| 310 if (format->version.minor >= 0) { | |
| 311 kputc('.', &str); | |
| 312 kputw(format->version.minor, &str); | |
| 313 } | |
| 314 } | |
| 315 | |
| 316 switch (format->compression) { | |
| 317 case custom: kputs(" compressed", &str); break; | |
| 318 case gzip: kputs(" gzip-compressed", &str); break; | |
| 319 case bgzf: | |
| 320 switch (format->format) { | |
| 321 case bam: | |
| 322 case bcf: | |
| 323 case csi: | |
| 324 case tbi: | |
| 325 // These are by definition BGZF, so just use the generic term | |
| 326 kputs(" compressed", &str); | |
| 327 break; | |
| 328 default: | |
| 329 kputs(" BGZF-compressed", &str); | |
| 330 break; | |
| 331 } | |
| 332 break; | |
| 333 default: break; | |
| 334 } | |
| 335 | |
| 336 switch (format->category) { | |
| 337 case sequence_data: kputs(" sequence", &str); break; | |
| 338 case variant_data: kputs(" variant calling", &str); break; | |
| 339 case index_file: kputs(" index", &str); break; | |
| 340 case region_list: kputs(" genomic region", &str); break; | |
| 341 default: break; | |
| 342 } | |
| 343 | |
| 344 if (format->compression == no_compression) | |
| 345 switch (format->format) { | |
| 346 case sam: | |
| 347 case crai: | |
| 348 case vcf: | |
| 349 case bed: | |
| 350 kputs(" text", &str); | |
| 351 break; | |
| 352 | |
| 353 default: | |
| 354 kputs(" data", &str); | |
| 355 break; | |
| 356 } | |
| 357 else | |
| 358 kputs(" data", &str); | |
| 359 | |
| 360 return ks_release(&str); | |
| 361 } | |
| 362 | |
| 363 htsFile *hts_open(const char *fn, const char *mode) | |
| 364 { | |
| 365 htsFile *fp = NULL; | |
| 366 hFILE *hfile = hopen(fn, mode); | |
| 367 if (hfile == NULL) goto error; | |
| 368 | |
| 369 fp = hts_hopen(hfile, fn, mode); | |
| 370 if (fp == NULL) goto error; | |
| 371 | |
| 372 return fp; | |
| 373 | |
| 374 error: | |
| 375 if (hts_verbose >= 2) | |
| 376 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn); | |
| 377 | |
| 378 if (hfile) | |
| 379 hclose_abruptly(hfile); | |
| 380 | |
| 381 return NULL; | |
| 382 } | |
| 383 | |
| 384 htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode) | |
| 385 { | |
| 386 htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile)); | |
| 387 if (fp == NULL) goto error; | |
| 388 | |
| 389 fp->fn = strdup(fn); | |
| 390 fp->is_be = ed_is_big(); | |
| 391 | |
| 392 if (strchr(mode, 'r')) { | |
| 393 if (hts_detect_format(hfile, &fp->format) < 0) goto error; | |
| 394 } | |
| 395 else if (strchr(mode, 'w') || strchr(mode, 'a')) { | |
| 396 htsFormat *fmt = &fp->format; | |
| 397 fp->is_write = 1; | |
| 398 | |
| 399 if (strchr(mode, 'b')) fmt->format = binary_format; | |
| 400 else if (strchr(mode, 'c')) fmt->format = cram; | |
| 401 else fmt->format = text_format; | |
| 402 | |
| 403 if (strchr(mode, 'z')) fmt->compression = bgzf; | |
| 404 else if (strchr(mode, 'g')) fmt->compression = gzip; | |
| 405 else if (strchr(mode, 'u')) fmt->compression = no_compression; | |
| 406 else { | |
| 407 // No compression mode specified, set to the default for the format | |
| 408 switch (fmt->format) { | |
| 409 case binary_format: fmt->compression = bgzf; break; | |
| 410 case cram: fmt->compression = custom; break; | |
| 411 case text_format: fmt->compression = no_compression; break; | |
| 412 default: abort(); | |
| 413 } | |
| 414 } | |
| 415 | |
| 416 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF) | |
| 417 fmt->category = format_category(fmt->format); | |
| 418 | |
| 419 fmt->version.major = fmt->version.minor = -1; | |
| 420 fmt->compression_level = -1; | |
| 421 fmt->specific = NULL; | |
| 422 } | |
| 423 else goto error; | |
| 424 | |
| 425 switch (fp->format.format) { | |
| 426 case binary_format: | |
| 427 case bam: | |
| 428 case bcf: | |
| 429 fp->fp.bgzf = bgzf_hopen(hfile, mode); | |
| 430 if (fp->fp.bgzf == NULL) goto error; | |
| 431 fp->is_bin = 1; | |
| 432 break; | |
| 433 | |
| 434 case cram: | |
| 435 fp->fp.cram = cram_dopen(hfile, fn, mode); | |
| 436 if (fp->fp.cram == NULL) goto error; | |
| 437 if (!fp->is_write) | |
| 438 cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1); | |
| 439 fp->is_cram = 1; | |
| 440 break; | |
| 441 | |
| 442 case text_format: | |
| 443 case sam: | |
| 444 case vcf: | |
| 445 if (!fp->is_write) { | |
| 446 #if KS_BGZF | |
| 447 BGZF *gzfp = bgzf_hopen(hfile, mode); | |
| 448 #else | |
| 449 // TODO Implement gzip hFILE adaptor | |
| 450 hclose(hfile); // This won't work, especially for stdin | |
| 451 gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb"); | |
| 452 #endif | |
| 453 if (gzfp) fp->fp.voidp = ks_init(gzfp); | |
| 454 else goto error; | |
| 455 } | |
| 456 else if (fp->format.compression != no_compression) { | |
| 457 fp->fp.bgzf = bgzf_hopen(hfile, mode); | |
| 458 if (fp->fp.bgzf == NULL) goto error; | |
| 459 } | |
| 460 else | |
| 461 fp->fp.hfile = hfile; | |
| 462 break; | |
| 463 | |
| 464 default: | |
| 465 goto error; | |
| 466 } | |
| 467 | |
| 468 return fp; | |
| 469 | |
| 470 error: | |
| 471 if (hts_verbose >= 2) | |
| 472 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn); | |
| 473 | |
| 474 if (fp) { | |
| 475 free(fp->fn); | |
| 476 free(fp->fn_aux); | |
| 477 free(fp); | |
| 478 } | |
| 479 return NULL; | |
| 480 } | |
| 481 | |
| 482 int hts_close(htsFile *fp) | |
| 483 { | |
| 484 int ret, save; | |
| 485 | |
| 486 switch (fp->format.format) { | |
| 487 case binary_format: | |
| 488 case bam: | |
| 489 case bcf: | |
| 490 ret = bgzf_close(fp->fp.bgzf); | |
| 491 break; | |
| 492 | |
| 493 case cram: | |
| 494 if (!fp->is_write) { | |
| 495 switch (cram_eof(fp->fp.cram)) { | |
| 496 case 0: | |
| 497 fprintf(stderr, "[E::%s] Failed to decode sequence.\n", __func__); | |
| 498 return -1; | |
| 499 case 2: | |
| 500 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__); | |
| 501 break; | |
| 502 default: /* case 1, expected EOF */ | |
| 503 break; | |
| 504 } | |
| 505 } | |
| 506 ret = cram_close(fp->fp.cram); | |
| 507 break; | |
| 508 | |
| 509 case text_format: | |
| 510 case sam: | |
| 511 case vcf: | |
| 512 if (!fp->is_write) { | |
| 513 #if KS_BGZF | |
| 514 BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f; | |
| 515 ret = bgzf_close(gzfp); | |
| 516 #else | |
| 517 gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f; | |
| 518 ret = gzclose(gzfp); | |
| 519 #endif | |
| 520 ks_destroy((kstream_t*)fp->fp.voidp); | |
| 521 } | |
| 522 else if (fp->format.compression != no_compression) | |
| 523 ret = bgzf_close(fp->fp.bgzf); | |
| 524 else | |
| 525 ret = hclose(fp->fp.hfile); | |
| 526 break; | |
| 527 | |
| 528 default: | |
| 529 ret = -1; | |
| 530 break; | |
| 531 } | |
| 532 | |
| 533 save = errno; | |
| 534 free(fp->fn); | |
| 535 free(fp->fn_aux); | |
| 536 free(fp->line.s); | |
| 537 free(fp); | |
| 538 errno = save; | |
| 539 return ret; | |
| 540 } | |
| 541 | |
| 542 const htsFormat *hts_get_format(htsFile *fp) | |
| 543 { | |
| 544 return fp? &fp->format : NULL; | |
| 545 } | |
| 546 | |
| 547 int hts_set_opt(htsFile *fp, enum cram_option opt, ...) { | |
| 548 int r; | |
| 549 va_list args; | |
| 550 | |
| 551 if (fp->format.format != cram) | |
| 552 return 0; | |
| 553 | |
| 554 va_start(args, opt); | |
| 555 r = cram_set_voption(fp->fp.cram, opt, args); | |
| 556 va_end(args); | |
| 557 | |
| 558 return r; | |
| 559 } | |
| 560 | |
| 561 int hts_set_threads(htsFile *fp, int n) | |
| 562 { | |
| 563 if (fp->format.compression == bgzf) { | |
| 564 return bgzf_mt(fp->fp.bgzf, n, 256); | |
| 565 } else if (fp->format.format == cram) { | |
| 566 return hts_set_opt(fp, CRAM_OPT_NTHREADS, n); | |
| 567 } | |
| 568 else return 0; | |
| 569 } | |
| 570 | |
| 571 int hts_set_fai_filename(htsFile *fp, const char *fn_aux) | |
| 572 { | |
| 573 free(fp->fn_aux); | |
| 574 if (fn_aux) { | |
| 575 fp->fn_aux = strdup(fn_aux); | |
| 576 if (fp->fn_aux == NULL) return -1; | |
| 577 } | |
| 578 else fp->fn_aux = NULL; | |
| 579 | |
| 580 if (fp->format.format == cram) | |
| 581 cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux); | |
| 582 | |
| 583 return 0; | |
| 584 } | |
| 585 | |
| 586 // For VCF/BCF backward sweeper. Not exposing these functions because their | |
| 587 // future is uncertain. Things will probably have to change with hFILE... | |
| 588 BGZF *hts_get_bgzfp(htsFile *fp) | |
| 589 { | |
| 590 if ( fp->is_bin ) | |
| 591 return fp->fp.bgzf; | |
| 592 else | |
| 593 return ((kstream_t*)fp->fp.voidp)->f; | |
| 594 } | |
| 595 int hts_useek(htsFile *fp, long uoffset, int where) | |
| 596 { | |
| 597 if ( fp->is_bin ) | |
| 598 return bgzf_useek(fp->fp.bgzf, uoffset, where); | |
| 599 else | |
| 600 { | |
| 601 ks_rewind((kstream_t*)fp->fp.voidp); | |
| 602 ((kstream_t*)fp->fp.voidp)->seek_pos = uoffset; | |
| 603 return bgzf_useek(((kstream_t*)fp->fp.voidp)->f, uoffset, where); | |
| 604 } | |
| 605 } | |
| 606 long hts_utell(htsFile *fp) | |
| 607 { | |
| 608 if ( fp->is_bin ) | |
| 609 return bgzf_utell(fp->fp.bgzf); | |
| 610 else | |
| 611 return ((kstream_t*)fp->fp.voidp)->seek_pos; | |
| 612 } | |
| 613 | |
| 614 int hts_getline(htsFile *fp, int delimiter, kstring_t *str) | |
| 615 { | |
| 616 int ret, dret; | |
| 617 ret = ks_getuntil((kstream_t*)fp->fp.voidp, delimiter, str, &dret); | |
| 618 ++fp->lineno; | |
| 619 return ret; | |
| 620 } | |
| 621 | |
| 622 char **hts_readlist(const char *string, int is_file, int *_n) | |
| 623 { | |
| 624 int m = 0, n = 0, dret; | |
| 625 char **s = 0; | |
| 626 if ( is_file ) | |
| 627 { | |
| 628 #if KS_BGZF | |
| 629 BGZF *fp = bgzf_open(string, "r"); | |
| 630 #else | |
| 631 gzFile fp = gzopen(string, "r"); | |
| 632 #endif | |
| 633 if ( !fp ) return NULL; | |
| 634 | |
| 635 kstream_t *ks; | |
| 636 kstring_t str; | |
| 637 str.s = 0; str.l = str.m = 0; | |
| 638 ks = ks_init(fp); | |
| 639 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) | |
| 640 { | |
| 641 if (str.l == 0) continue; | |
| 642 n++; | |
| 643 hts_expand(char*,n,m,s); | |
| 644 s[n-1] = strdup(str.s); | |
| 645 } | |
| 646 ks_destroy(ks); | |
| 647 #if KS_BGZF | |
| 648 bgzf_close(fp); | |
| 649 #else | |
| 650 gzclose(fp); | |
| 651 #endif | |
| 652 free(str.s); | |
| 653 } | |
| 654 else | |
| 655 { | |
| 656 const char *q = string, *p = string; | |
| 657 while ( 1 ) | |
| 658 { | |
| 659 if (*p == ',' || *p == 0) | |
| 660 { | |
| 661 n++; | |
| 662 hts_expand(char*,n,m,s); | |
| 663 s[n-1] = (char*)calloc(p - q + 1, 1); | |
| 664 strncpy(s[n-1], q, p - q); | |
| 665 q = p + 1; | |
| 666 } | |
| 667 if ( !*p ) break; | |
| 668 p++; | |
| 669 } | |
| 670 } | |
| 671 s = (char**)realloc(s, n * sizeof(char*)); | |
| 672 *_n = n; | |
| 673 return s; | |
| 674 } | |
| 675 | |
| 676 char **hts_readlines(const char *fn, int *_n) | |
| 677 { | |
| 678 int m = 0, n = 0, dret; | |
| 679 char **s = 0; | |
| 680 #if KS_BGZF | |
| 681 BGZF *fp = bgzf_open(fn, "r"); | |
| 682 #else | |
| 683 gzFile fp = gzopen(fn, "r"); | |
| 684 #endif | |
| 685 if ( fp ) { // read from file | |
| 686 kstream_t *ks; | |
| 687 kstring_t str; | |
| 688 str.s = 0; str.l = str.m = 0; | |
| 689 ks = ks_init(fp); | |
| 690 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { | |
| 691 if (str.l == 0) continue; | |
| 692 if (m == n) { | |
| 693 m = m? m<<1 : 16; | |
| 694 s = (char**)realloc(s, m * sizeof(char*)); | |
| 695 } | |
| 696 s[n++] = strdup(str.s); | |
| 697 } | |
| 698 ks_destroy(ks); | |
| 699 #if KS_BGZF | |
| 700 bgzf_close(fp); | |
| 701 #else | |
| 702 gzclose(fp); | |
| 703 #endif | |
| 704 s = (char**)realloc(s, n * sizeof(char*)); | |
| 705 free(str.s); | |
| 706 } else if (*fn == ':') { // read from string | |
| 707 const char *q, *p; | |
| 708 for (q = p = fn + 1;; ++p) | |
| 709 if (*p == ',' || *p == 0) { | |
| 710 if (m == n) { | |
| 711 m = m? m<<1 : 16; | |
| 712 s = (char**)realloc(s, m * sizeof(char*)); | |
| 713 } | |
| 714 s[n] = (char*)calloc(p - q + 1, 1); | |
| 715 strncpy(s[n++], q, p - q); | |
| 716 q = p + 1; | |
| 717 if (*p == 0) break; | |
| 718 } | |
| 719 } else return 0; | |
| 720 s = (char**)realloc(s, n * sizeof(char*)); | |
| 721 *_n = n; | |
| 722 return s; | |
| 723 } | |
| 724 | |
| 725 // DEPRECATED: To be removed in a future HTSlib release | |
| 726 int hts_file_type(const char *fname) | |
| 727 { | |
| 728 int len = strlen(fname); | |
| 729 if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ; | |
| 730 if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF; | |
| 731 if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ; | |
| 732 if ( !strcmp("-",fname) ) return FT_STDIN; | |
| 733 | |
| 734 hFILE *f = hopen(fname, "r"); | |
| 735 if (f == NULL) return 0; | |
| 736 | |
| 737 htsFormat fmt; | |
| 738 if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; } | |
| 739 if (hclose(f) < 0) return 0; | |
| 740 | |
| 741 switch (fmt.format) { | |
| 742 case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ; | |
| 743 case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ; | |
| 744 default: return 0; | |
| 745 } | |
| 746 } | |
| 747 | |
| 748 /**************** | |
| 749 *** Indexing *** | |
| 750 ****************/ | |
| 751 | |
| 752 #define HTS_MIN_MARKER_DIST 0x10000 | |
| 753 | |
| 754 // Finds the special meta bin | |
| 755 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1 | |
| 756 #define META_BIN(idx) ((idx)->n_bins + 1) | |
| 757 | |
| 758 #define pair64_lt(a,b) ((a).u < (b).u) | |
| 759 | |
| 760 #include "htslib/ksort.h" | |
| 761 KSORT_INIT(_off, hts_pair64_t, pair64_lt) | |
| 762 | |
| 763 typedef struct { | |
| 764 int32_t m, n; | |
| 765 uint64_t loff; | |
| 766 hts_pair64_t *list; | |
| 767 } bins_t; | |
| 768 | |
| 769 #include "htslib/khash.h" | |
| 770 KHASH_MAP_INIT_INT(bin, bins_t) | |
| 771 typedef khash_t(bin) bidx_t; | |
| 772 | |
| 773 typedef struct { | |
| 774 int32_t n, m; | |
| 775 uint64_t *offset; | |
| 776 } lidx_t; | |
| 777 | |
| 778 struct __hts_idx_t { | |
| 779 int fmt, min_shift, n_lvls, n_bins; | |
| 780 uint32_t l_meta; | |
| 781 int32_t n, m; | |
| 782 uint64_t n_no_coor; | |
| 783 bidx_t **bidx; | |
| 784 lidx_t *lidx; | |
| 785 uint8_t *meta; | |
| 786 struct { | |
| 787 uint32_t last_bin, save_bin; | |
| 788 int last_coor, last_tid, save_tid, finished; | |
| 789 uint64_t last_off, save_off; | |
| 790 uint64_t off_beg, off_end; | |
| 791 uint64_t n_mapped, n_unmapped; | |
| 792 } z; // keep internal states | |
| 793 }; | |
| 794 | |
| 795 static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) | |
| 796 { | |
| 797 khint_t k; | |
| 798 bins_t *l; | |
| 799 int absent; | |
| 800 k = kh_put(bin, b, bin, &absent); | |
| 801 l = &kh_value(b, k); | |
| 802 if (absent) { | |
| 803 l->m = 1; l->n = 0; | |
| 804 l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t)); | |
| 805 } | |
| 806 if (l->n == l->m) { | |
| 807 l->m <<= 1; | |
| 808 l->list = (hts_pair64_t*)realloc(l->list, l->m * sizeof(hts_pair64_t)); | |
| 809 } | |
| 810 l->list[l->n].u = beg; | |
| 811 l->list[l->n++].v = end; | |
| 812 } | |
| 813 | |
| 814 static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) | |
| 815 { | |
| 816 int i, beg, end; | |
| 817 beg = _beg >> min_shift; | |
| 818 end = (_end - 1) >> min_shift; | |
| 819 if (l->m < end + 1) { | |
| 820 int old_m = l->m; | |
| 821 l->m = end + 1; | |
| 822 kroundup32(l->m); | |
| 823 l->offset = (uint64_t*)realloc(l->offset, l->m * sizeof(uint64_t)); | |
| 824 memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1 | |
| 825 } | |
| 826 if (beg == end) { // to save a loop in this case | |
| 827 if (l->offset[beg] == (uint64_t)-1) l->offset[beg] = offset; | |
| 828 } else { | |
| 829 for (i = beg; i <= end; ++i) | |
| 830 if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset; | |
| 831 } | |
| 832 if (l->n < end + 1) l->n = end + 1; | |
| 833 } | |
| 834 | |
| 835 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) | |
| 836 { | |
| 837 hts_idx_t *idx; | |
| 838 idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t)); | |
| 839 if (idx == NULL) return NULL; | |
| 840 idx->fmt = fmt; | |
| 841 idx->min_shift = min_shift; | |
| 842 idx->n_lvls = n_lvls; | |
| 843 idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7; | |
| 844 idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu; | |
| 845 idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0; | |
| 846 idx->z.last_coor = 0xffffffffu; | |
| 847 if (n) { | |
| 848 idx->n = idx->m = n; | |
| 849 idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*)); | |
| 850 if (idx->bidx == NULL) { free(idx); return NULL; } | |
| 851 idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t)); | |
| 852 if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; } | |
| 853 } | |
| 854 return idx; | |
| 855 } | |
| 856 | |
| 857 static void update_loff(hts_idx_t *idx, int i, int free_lidx) | |
| 858 { | |
| 859 bidx_t *bidx = idx->bidx[i]; | |
| 860 lidx_t *lidx = &idx->lidx[i]; | |
| 861 khint_t k; | |
| 862 int l; | |
| 863 uint64_t offset0 = 0; | |
| 864 if (bidx) { | |
| 865 k = kh_get(bin, bidx, META_BIN(idx)); | |
| 866 if (k != kh_end(bidx)) | |
| 867 offset0 = kh_val(bidx, k).list[0].u; | |
| 868 for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l) | |
| 869 lidx->offset[l] = offset0; | |
| 870 } else l = 1; | |
| 871 for (; l < lidx->n; ++l) // fill missing values | |
| 872 if (lidx->offset[l] == (uint64_t)-1) | |
| 873 lidx->offset[l] = lidx->offset[l-1]; | |
| 874 if (bidx == 0) return; | |
| 875 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff | |
| 876 if (kh_exist(bidx, k)) | |
| 877 { | |
| 878 if ( kh_key(bidx, k) < idx->n_bins ) | |
| 879 { | |
| 880 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls); | |
| 881 // disable linear index if bot_bin out of bounds | |
| 882 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0; | |
| 883 } | |
| 884 else | |
| 885 kh_val(bidx, k).loff = 0; | |
| 886 } | |
| 887 if (free_lidx) { | |
| 888 free(lidx->offset); | |
| 889 lidx->m = lidx->n = 0; | |
| 890 lidx->offset = 0; | |
| 891 } | |
| 892 } | |
| 893 | |
| 894 static void compress_binning(hts_idx_t *idx, int i) | |
| 895 { | |
| 896 bidx_t *bidx = idx->bidx[i]; | |
| 897 khint_t k; | |
| 898 int l, m; | |
| 899 if (bidx == 0) return; | |
| 900 // merge a bin to its parent if the bin is too small | |
| 901 for (l = idx->n_lvls; l > 0; --l) { | |
| 902 unsigned start = hts_bin_first(l); | |
| 903 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { | |
| 904 bins_t *p, *q; | |
| 905 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue; | |
| 906 p = &kh_value(bidx, k); | |
| 907 if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list); | |
| 908 if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) { | |
| 909 khint_t kp; | |
| 910 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k))); | |
| 911 if (kp == kh_end(bidx)) continue; | |
| 912 q = &kh_val(bidx, kp); | |
| 913 if (q->n + p->n > q->m) { | |
| 914 q->m = q->n + p->n; | |
| 915 kroundup32(q->m); | |
| 916 q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t)); | |
| 917 } | |
| 918 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t)); | |
| 919 q->n += p->n; | |
| 920 free(p->list); | |
| 921 kh_del(bin, bidx, k); | |
| 922 } | |
| 923 } | |
| 924 } | |
| 925 k = kh_get(bin, bidx, 0); | |
| 926 if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list); | |
| 927 // merge adjacent chunks that start from the same BGZF block | |
| 928 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { | |
| 929 bins_t *p; | |
| 930 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue; | |
| 931 p = &kh_value(bidx, k); | |
| 932 for (l = 1, m = 0; l < p->n; ++l) { | |
| 933 if (p->list[m].v>>16 >= p->list[l].u>>16) { | |
| 934 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v; | |
| 935 } else p->list[++m] = p->list[l]; | |
| 936 } | |
| 937 p->n = m + 1; | |
| 938 } | |
| 939 } | |
| 940 | |
| 941 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) | |
| 942 { | |
| 943 int i; | |
| 944 if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times | |
| 945 if (idx->z.save_tid >= 0) { | |
| 946 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset); | |
| 947 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset); | |
| 948 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); | |
| 949 } | |
| 950 for (i = 0; i < idx->n; ++i) { | |
| 951 update_loff(idx, i, (idx->fmt == HTS_FMT_CSI)); | |
| 952 compress_binning(idx, i); | |
| 953 } | |
| 954 idx->z.finished = 1; | |
| 955 } | |
| 956 | |
| 957 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) | |
| 958 { | |
| 959 int bin; | |
| 960 if (tid<0) beg = -1, end = 0; | |
| 961 if (tid >= idx->m) { // enlarge the index | |
| 962 int32_t oldm = idx->m; | |
| 963 idx->m = idx->m? idx->m<<1 : 2; | |
| 964 idx->bidx = (bidx_t**)realloc(idx->bidx, idx->m * sizeof(bidx_t*)); | |
| 965 idx->lidx = (lidx_t*) realloc(idx->lidx, idx->m * sizeof(lidx_t)); | |
| 966 memset(&idx->bidx[oldm], 0, (idx->m - oldm) * sizeof(bidx_t*)); | |
| 967 memset(&idx->lidx[oldm], 0, (idx->m - oldm) * sizeof(lidx_t)); | |
| 968 } | |
| 969 if (idx->n < tid + 1) idx->n = tid + 1; | |
| 970 if (idx->z.finished) return 0; | |
| 971 if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome | |
| 972 if ( tid>=0 && idx->n_no_coor ) | |
| 973 { | |
| 974 if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid); | |
| 975 return -1; | |
| 976 } | |
| 977 if (tid>=0 && idx->bidx[tid] != 0) | |
| 978 { | |
| 979 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__); | |
| 980 return -1; | |
| 981 } | |
| 982 idx->z.last_tid = tid; | |
| 983 idx->z.last_bin = 0xffffffffu; | |
| 984 } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order | |
| 985 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions\n", __func__); | |
| 986 return -1; | |
| 987 } | |
| 988 if ( tid>=0 ) | |
| 989 { | |
| 990 if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin); | |
| 991 if ( is_mapped) | |
| 992 insert_to_l(&idx->lidx[tid], beg, end, idx->z.last_off, idx->min_shift); // last_off points to the start of the current record | |
| 993 } | |
| 994 else idx->n_no_coor++; | |
| 995 bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls); | |
| 996 if ((int)idx->z.last_bin != bin) { // then possibly write the binning index | |
| 997 if (idx->z.save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record | |
| 998 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, idx->z.last_off); | |
| 999 if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information | |
| 1000 idx->z.off_end = idx->z.last_off; | |
| 1001 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, idx->z.off_end); | |
| 1002 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); | |
| 1003 idx->z.n_mapped = idx->z.n_unmapped = 0; | |
| 1004 idx->z.off_beg = idx->z.off_end; | |
| 1005 } | |
| 1006 idx->z.save_off = idx->z.last_off; | |
| 1007 idx->z.save_bin = idx->z.last_bin = bin; | |
| 1008 idx->z.save_tid = tid; | |
| 1009 } | |
| 1010 if (is_mapped) ++idx->z.n_mapped; | |
| 1011 else ++idx->z.n_unmapped; | |
| 1012 idx->z.last_off = offset; | |
| 1013 idx->z.last_coor = beg; | |
| 1014 return 0; | |
| 1015 } | |
| 1016 | |
| 1017 void hts_idx_destroy(hts_idx_t *idx) | |
| 1018 { | |
| 1019 khint_t k; | |
| 1020 int i; | |
| 1021 if (idx == 0) return; | |
| 1022 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c | |
| 1023 if (idx->fmt == HTS_FMT_CRAI) { free(idx); return; } | |
| 1024 | |
| 1025 for (i = 0; i < idx->m; ++i) { | |
| 1026 bidx_t *bidx = idx->bidx[i]; | |
| 1027 free(idx->lidx[i].offset); | |
| 1028 if (bidx == 0) continue; | |
| 1029 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) | |
| 1030 if (kh_exist(bidx, k)) | |
| 1031 free(kh_value(bidx, k).list); | |
| 1032 kh_destroy(bin, bidx); | |
| 1033 } | |
| 1034 free(idx->bidx); free(idx->lidx); free(idx->meta); | |
| 1035 free(idx); | |
| 1036 } | |
| 1037 | |
| 1038 static inline long idx_read(int is_bgzf, void *fp, void *buf, long l) | |
| 1039 { | |
| 1040 if (is_bgzf) return bgzf_read((BGZF*)fp, buf, l); | |
| 1041 else return (long)fread(buf, 1, l, (FILE*)fp); | |
| 1042 } | |
| 1043 | |
| 1044 static inline long idx_write(int is_bgzf, void *fp, const void *buf, long l) | |
| 1045 { | |
| 1046 if (is_bgzf) return bgzf_write((BGZF*)fp, buf, l); | |
| 1047 else return (long)fwrite(buf, 1, l, (FILE*)fp); | |
| 1048 } | |
| 1049 | |
| 1050 static inline void swap_bins(bins_t *p) | |
| 1051 { | |
| 1052 int i; | |
| 1053 for (i = 0; i < p->n; ++i) { | |
| 1054 ed_swap_8p(&p->list[i].u); | |
| 1055 ed_swap_8p(&p->list[i].v); | |
| 1056 } | |
| 1057 } | |
| 1058 | |
| 1059 static void hts_idx_save_core(const hts_idx_t *idx, void *fp, int fmt) | |
| 1060 { | |
| 1061 int32_t i, size, is_be; | |
| 1062 int is_bgzf = (fmt != HTS_FMT_BAI); | |
| 1063 is_be = ed_is_big(); | |
| 1064 if (is_be) { | |
| 1065 uint32_t x = idx->n; | |
| 1066 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
| 1067 } else idx_write(is_bgzf, fp, &idx->n, 4); | |
| 1068 if (fmt == HTS_FMT_TBI && idx->l_meta) idx_write(is_bgzf, fp, idx->meta, idx->l_meta); | |
| 1069 for (i = 0; i < idx->n; ++i) { | |
| 1070 khint_t k; | |
| 1071 bidx_t *bidx = idx->bidx[i]; | |
| 1072 lidx_t *lidx = &idx->lidx[i]; | |
| 1073 // write binning index | |
| 1074 size = bidx? kh_size(bidx) : 0; | |
| 1075 if (is_be) { // big endian | |
| 1076 uint32_t x = size; | |
| 1077 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
| 1078 } else idx_write(is_bgzf, fp, &size, 4); | |
| 1079 if (bidx == 0) goto write_lidx; | |
| 1080 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { | |
| 1081 bins_t *p; | |
| 1082 if (!kh_exist(bidx, k)) continue; | |
| 1083 p = &kh_value(bidx, k); | |
| 1084 if (is_be) { // big endian | |
| 1085 uint32_t x; | |
| 1086 x = kh_key(bidx, k); idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
| 1087 if (fmt == HTS_FMT_CSI) { | |
| 1088 uint64_t y = kh_val(bidx, k).loff; | |
| 1089 idx_write(is_bgzf, fp, ed_swap_4p(&y), 8); | |
| 1090 } | |
| 1091 x = p->n; idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
| 1092 swap_bins(p); | |
| 1093 idx_write(is_bgzf, fp, p->list, 16 * p->n); | |
| 1094 swap_bins(p); | |
| 1095 } else { | |
| 1096 idx_write(is_bgzf, fp, &kh_key(bidx, k), 4); | |
| 1097 if (fmt == HTS_FMT_CSI) idx_write(is_bgzf, fp, &kh_val(bidx, k).loff, 8); | |
| 1098 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v); | |
| 1099 idx_write(is_bgzf, fp, &p->n, 4); | |
| 1100 idx_write(is_bgzf, fp, p->list, p->n << 4); | |
| 1101 } | |
| 1102 } | |
| 1103 write_lidx: | |
| 1104 if (fmt != HTS_FMT_CSI) { | |
| 1105 if (is_be) { | |
| 1106 int32_t x = lidx->n; | |
| 1107 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
| 1108 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]); | |
| 1109 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3); | |
| 1110 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]); | |
| 1111 } else { | |
| 1112 idx_write(is_bgzf, fp, &lidx->n, 4); | |
| 1113 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3); | |
| 1114 } | |
| 1115 } | |
| 1116 } | |
| 1117 if (is_be) { // write the number of reads without coordinates | |
| 1118 uint64_t x = idx->n_no_coor; | |
| 1119 idx_write(is_bgzf, fp, &x, 8); | |
| 1120 } else idx_write(is_bgzf, fp, &idx->n_no_coor, 8); | |
| 1121 } | |
| 1122 | |
| 1123 void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) | |
| 1124 { | |
| 1125 char *fnidx; | |
| 1126 fnidx = (char*)calloc(1, strlen(fn) + 5); | |
| 1127 strcpy(fnidx, fn); | |
| 1128 if (fmt == HTS_FMT_CSI) { | |
| 1129 BGZF *fp; | |
| 1130 uint32_t x[3]; | |
| 1131 int is_be, i; | |
| 1132 is_be = ed_is_big(); | |
| 1133 fp = bgzf_open(strcat(fnidx, ".csi"), "w"); | |
| 1134 bgzf_write(fp, "CSI\1", 4); | |
| 1135 x[0] = idx->min_shift; x[1] = idx->n_lvls; x[2] = idx->l_meta; | |
| 1136 if (is_be) { | |
| 1137 for (i = 0; i < 3; ++i) | |
| 1138 bgzf_write(fp, ed_swap_4p(&x[i]), 4); | |
| 1139 } else bgzf_write(fp, &x, 12); | |
| 1140 if (idx->l_meta) bgzf_write(fp, idx->meta, idx->l_meta); | |
| 1141 hts_idx_save_core(idx, fp, HTS_FMT_CSI); | |
| 1142 bgzf_close(fp); | |
| 1143 } else if (fmt == HTS_FMT_TBI) { | |
| 1144 BGZF *fp; | |
| 1145 fp = bgzf_open(strcat(fnidx, ".tbi"), "w"); | |
| 1146 bgzf_write(fp, "TBI\1", 4); | |
| 1147 hts_idx_save_core(idx, fp, HTS_FMT_TBI); | |
| 1148 bgzf_close(fp); | |
| 1149 } else if (fmt == HTS_FMT_BAI) { | |
| 1150 FILE *fp; | |
| 1151 fp = fopen(strcat(fnidx, ".bai"), "w"); | |
| 1152 fwrite("BAI\1", 1, 4, fp); | |
| 1153 hts_idx_save_core(idx, fp, HTS_FMT_BAI); | |
| 1154 fclose(fp); | |
| 1155 } else abort(); | |
| 1156 free(fnidx); | |
| 1157 } | |
| 1158 | |
| 1159 static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt) | |
| 1160 { | |
| 1161 int32_t i, n, is_be; | |
| 1162 int is_bgzf = (fmt != HTS_FMT_BAI); | |
| 1163 is_be = ed_is_big(); | |
| 1164 if (idx == NULL) return -4; | |
| 1165 for (i = 0; i < idx->n; ++i) { | |
| 1166 bidx_t *h; | |
| 1167 lidx_t *l = &idx->lidx[i]; | |
| 1168 uint32_t key; | |
| 1169 int j, absent; | |
| 1170 bins_t *p; | |
| 1171 h = idx->bidx[i] = kh_init(bin); | |
| 1172 if (idx_read(is_bgzf, fp, &n, 4) != 4) return -1; | |
| 1173 if (is_be) ed_swap_4p(&n); | |
| 1174 for (j = 0; j < n; ++j) { | |
| 1175 khint_t k; | |
| 1176 if (idx_read(is_bgzf, fp, &key, 4) != 4) return -1; | |
| 1177 if (is_be) ed_swap_4p(&key); | |
| 1178 k = kh_put(bin, h, key, &absent); | |
| 1179 if (absent <= 0) return -3; // Duplicate bin number | |
| 1180 p = &kh_val(h, k); | |
| 1181 if (fmt == HTS_FMT_CSI) { | |
| 1182 if (idx_read(is_bgzf, fp, &p->loff, 8) != 8) return -1; | |
| 1183 if (is_be) ed_swap_8p(&p->loff); | |
| 1184 } else p->loff = 0; | |
| 1185 if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1; | |
| 1186 if (is_be) ed_swap_4p(&p->n); | |
| 1187 p->m = p->n; | |
| 1188 p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t)); | |
| 1189 if (p->list == NULL) return -2; | |
| 1190 if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1; | |
| 1191 if (is_be) swap_bins(p); | |
| 1192 } | |
| 1193 if (fmt != HTS_FMT_CSI) { // load linear index | |
| 1194 int j; | |
| 1195 if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1; | |
| 1196 if (is_be) ed_swap_4p(&l->n); | |
| 1197 l->m = l->n; | |
| 1198 l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t)); | |
| 1199 if (l->offset == NULL) return -2; | |
| 1200 if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1; | |
| 1201 if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]); | |
| 1202 for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix | |
| 1203 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1]; | |
| 1204 update_loff(idx, i, 1); | |
| 1205 } | |
| 1206 } | |
| 1207 if (idx_read(is_bgzf, fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0; | |
| 1208 if (is_be) ed_swap_8p(&idx->n_no_coor); | |
| 1209 return 0; | |
| 1210 } | |
| 1211 | |
| 1212 hts_idx_t *hts_idx_load_local(const char *fn, int fmt) | |
| 1213 { | |
| 1214 uint8_t magic[4]; | |
| 1215 int i, is_be; | |
| 1216 hts_idx_t *idx = NULL; | |
| 1217 is_be = ed_is_big(); | |
| 1218 if (fmt == HTS_FMT_CSI) { | |
| 1219 BGZF *fp; | |
| 1220 uint32_t x[3], n; | |
| 1221 uint8_t *meta = 0; | |
| 1222 if ((fp = bgzf_open(fn, "r")) == 0) return NULL; | |
| 1223 if (bgzf_read(fp, magic, 4) != 4) goto csi_fail; | |
| 1224 if (memcmp(magic, "CSI\1", 4) != 0) goto csi_fail; | |
| 1225 if (bgzf_read(fp, x, 12) != 12) goto csi_fail; | |
| 1226 if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]); | |
| 1227 if (x[2]) { | |
| 1228 if ((meta = (uint8_t*)malloc(x[2])) == NULL) goto csi_fail; | |
| 1229 if (bgzf_read(fp, meta, x[2]) != x[2]) goto csi_fail; | |
| 1230 } | |
| 1231 if (bgzf_read(fp, &n, 4) != 4) goto csi_fail; | |
| 1232 if (is_be) ed_swap_4p(&n); | |
| 1233 if ((idx = hts_idx_init(n, fmt, 0, x[0], x[1])) == NULL) goto csi_fail; | |
| 1234 idx->l_meta = x[2]; | |
| 1235 idx->meta = meta; | |
| 1236 meta = NULL; | |
| 1237 if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto csi_fail; | |
| 1238 bgzf_close(fp); | |
| 1239 return idx; | |
| 1240 | |
| 1241 csi_fail: | |
| 1242 bgzf_close(fp); | |
| 1243 hts_idx_destroy(idx); | |
| 1244 free(meta); | |
| 1245 return NULL; | |
| 1246 | |
| 1247 } else if (fmt == HTS_FMT_TBI) { | |
| 1248 BGZF *fp; | |
| 1249 uint32_t x[8]; | |
| 1250 if ((fp = bgzf_open(fn, "r")) == 0) return NULL; | |
| 1251 if (bgzf_read(fp, magic, 4) != 4) goto tbi_fail; | |
| 1252 if (memcmp(magic, "TBI\1", 4) != 0) goto tbi_fail; | |
| 1253 if (bgzf_read(fp, x, 32) != 32) goto tbi_fail; | |
| 1254 if (is_be) for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]); | |
| 1255 if ((idx = hts_idx_init(x[0], fmt, 0, 14, 5)) == NULL) goto tbi_fail; | |
| 1256 idx->l_meta = 28 + x[7]; | |
| 1257 if ((idx->meta = (uint8_t*)malloc(idx->l_meta)) == NULL) goto tbi_fail; | |
| 1258 memcpy(idx->meta, &x[1], 28); | |
| 1259 if (bgzf_read(fp, idx->meta + 28, x[7]) != x[7]) goto tbi_fail; | |
| 1260 if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto tbi_fail; | |
| 1261 bgzf_close(fp); | |
| 1262 return idx; | |
| 1263 | |
| 1264 tbi_fail: | |
| 1265 bgzf_close(fp); | |
| 1266 hts_idx_destroy(idx); | |
| 1267 return NULL; | |
| 1268 | |
| 1269 } else if (fmt == HTS_FMT_BAI) { | |
| 1270 uint32_t n; | |
| 1271 FILE *fp; | |
| 1272 if ((fp = fopen(fn, "rb")) == 0) return NULL; | |
| 1273 if (fread(magic, 1, 4, fp) != 4) goto bai_fail; | |
| 1274 if (memcmp(magic, "BAI\1", 4) != 0) goto bai_fail; | |
| 1275 if (fread(&n, 4, 1, fp) != 1) goto bai_fail; | |
| 1276 if (is_be) ed_swap_4p(&n); | |
| 1277 idx = hts_idx_init(n, fmt, 0, 14, 5); | |
| 1278 if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto bai_fail; | |
| 1279 fclose(fp); | |
| 1280 return idx; | |
| 1281 | |
| 1282 bai_fail: | |
| 1283 fclose(fp); | |
| 1284 hts_idx_destroy(idx); | |
| 1285 return NULL; | |
| 1286 | |
| 1287 } else abort(); | |
| 1288 } | |
| 1289 | |
| 1290 void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) | |
| 1291 { | |
| 1292 if (idx->meta) free(idx->meta); | |
| 1293 idx->l_meta = l_meta; | |
| 1294 if (is_copy) { | |
| 1295 idx->meta = (uint8_t*)malloc(l_meta); | |
| 1296 memcpy(idx->meta, meta, l_meta); | |
| 1297 } else idx->meta = meta; | |
| 1298 } | |
| 1299 | |
| 1300 uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta) | |
| 1301 { | |
| 1302 *l_meta = idx->l_meta; | |
| 1303 return idx->meta; | |
| 1304 } | |
| 1305 | |
| 1306 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) | |
| 1307 { | |
| 1308 if ( !idx->n ) | |
| 1309 { | |
| 1310 *n = 0; | |
| 1311 return NULL; | |
| 1312 } | |
| 1313 | |
| 1314 int tid = 0, i; | |
| 1315 const char **names = (const char**) calloc(idx->n,sizeof(const char*)); | |
| 1316 for (i=0; i<idx->n; i++) | |
| 1317 { | |
| 1318 bidx_t *bidx = idx->bidx[i]; | |
| 1319 if ( !bidx ) continue; | |
| 1320 names[tid++] = getid(hdr,i); | |
| 1321 } | |
| 1322 *n = tid; | |
| 1323 return names; | |
| 1324 } | |
| 1325 | |
| 1326 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped) | |
| 1327 { | |
| 1328 if ( idx->fmt == HTS_FMT_CRAI ) { | |
| 1329 *mapped = 0; *unmapped = 0; | |
| 1330 return -1; | |
| 1331 } | |
| 1332 | |
| 1333 bidx_t *h = idx->bidx[tid]; | |
| 1334 khint_t k = kh_get(bin, h, META_BIN(idx)); | |
| 1335 if (k != kh_end(h)) { | |
| 1336 *mapped = kh_val(h, k).list[1].u; | |
| 1337 *unmapped = kh_val(h, k).list[1].v; | |
| 1338 return 0; | |
| 1339 } else { | |
| 1340 *mapped = 0; *unmapped = 0; | |
| 1341 return -1; | |
| 1342 } | |
| 1343 } | |
| 1344 | |
| 1345 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) | |
| 1346 { | |
| 1347 return idx->n_no_coor; | |
| 1348 } | |
| 1349 | |
| 1350 /**************** | |
| 1351 *** Iterator *** | |
| 1352 ****************/ | |
| 1353 | |
| 1354 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls) | |
| 1355 { | |
| 1356 int l, t, s = min_shift + (n_lvls<<1) + n_lvls; | |
| 1357 if (beg >= end) return 0; | |
| 1358 if (end >= 1LL<<s) end = 1LL<<s; | |
| 1359 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { | |
| 1360 int b, e, n, i; | |
| 1361 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1; | |
| 1362 if (itr->bins.n + n > itr->bins.m) { | |
| 1363 itr->bins.m = itr->bins.n + n; | |
| 1364 kroundup32(itr->bins.m); | |
| 1365 itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m); | |
| 1366 } | |
| 1367 for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i; | |
| 1368 } | |
| 1369 return itr->bins.n; | |
| 1370 } | |
| 1371 | |
| 1372 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) | |
| 1373 { | |
| 1374 int i, n_off, l, bin; | |
| 1375 hts_pair64_t *off; | |
| 1376 khint_t k; | |
| 1377 bidx_t *bidx; | |
| 1378 uint64_t min_off; | |
| 1379 hts_itr_t *iter = 0; | |
| 1380 if (tid < 0) { | |
| 1381 int finished0 = 0; | |
| 1382 uint64_t off0 = (uint64_t)-1; | |
| 1383 khint_t k; | |
| 1384 switch (tid) { | |
| 1385 case HTS_IDX_START: | |
| 1386 // Find the smallest offset, note that sequence ids may not be ordered sequentially | |
| 1387 for (i=0; i<idx->n; i++) | |
| 1388 { | |
| 1389 bidx = idx->bidx[i]; | |
| 1390 k = kh_get(bin, bidx, META_BIN(idx)); | |
| 1391 if (k == kh_end(bidx)) continue; | |
| 1392 if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u; | |
| 1393 } | |
| 1394 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam | |
| 1395 break; | |
| 1396 | |
| 1397 case HTS_IDX_NOCOOR: | |
| 1398 if ( idx->n>0 ) | |
| 1399 { | |
| 1400 bidx = idx->bidx[idx->n - 1]; | |
| 1401 k = kh_get(bin, bidx, META_BIN(idx)); | |
| 1402 if (k != kh_end(bidx)) off0 = kh_val(bidx, k).list[0].v; | |
| 1403 } | |
| 1404 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam | |
| 1405 break; | |
| 1406 | |
| 1407 case HTS_IDX_REST: | |
| 1408 off0 = 0; | |
| 1409 break; | |
| 1410 | |
| 1411 case HTS_IDX_NONE: | |
| 1412 finished0 = 1; | |
| 1413 off0 = 0; | |
| 1414 break; | |
| 1415 | |
| 1416 default: | |
| 1417 return 0; | |
| 1418 } | |
| 1419 if (off0 != (uint64_t)-1) { | |
| 1420 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); | |
| 1421 iter->read_rest = 1; | |
| 1422 iter->finished = finished0; | |
| 1423 iter->curr_off = off0; | |
| 1424 iter->readrec = readrec; | |
| 1425 return iter; | |
| 1426 } else return 0; | |
| 1427 } | |
| 1428 | |
| 1429 if (beg < 0) beg = 0; | |
| 1430 if (end < beg) return 0; | |
| 1431 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0; | |
| 1432 | |
| 1433 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); | |
| 1434 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; | |
| 1435 iter->readrec = readrec; | |
| 1436 | |
| 1437 // compute min_off | |
| 1438 bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); | |
| 1439 do { | |
| 1440 int first; | |
| 1441 k = kh_get(bin, bidx, bin); | |
| 1442 if (k != kh_end(bidx)) break; | |
| 1443 first = (hts_bin_parent(bin)<<3) + 1; | |
| 1444 if (bin > first) --bin; | |
| 1445 else bin = hts_bin_parent(bin); | |
| 1446 } while (bin); | |
| 1447 if (bin == 0) k = kh_get(bin, bidx, bin); | |
| 1448 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; | |
| 1449 // retrieve bins | |
| 1450 reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); | |
| 1451 for (i = n_off = 0; i < iter->bins.n; ++i) | |
| 1452 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) | |
| 1453 n_off += kh_value(bidx, k).n; | |
| 1454 if (n_off == 0) return iter; | |
| 1455 off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t)); | |
| 1456 for (i = n_off = 0; i < iter->bins.n; ++i) { | |
| 1457 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) { | |
| 1458 int j; | |
| 1459 bins_t *p = &kh_value(bidx, k); | |
| 1460 for (j = 0; j < p->n; ++j) | |
| 1461 if (p->list[j].v > min_off) off[n_off++] = p->list[j]; | |
| 1462 } | |
| 1463 } | |
| 1464 if (n_off == 0) { | |
| 1465 free(off); return iter; | |
| 1466 } | |
| 1467 ks_introsort(_off, n_off, off); | |
| 1468 // resolve completely contained adjacent blocks | |
| 1469 for (i = 1, l = 0; i < n_off; ++i) | |
| 1470 if (off[l].v < off[i].v) off[++l] = off[i]; | |
| 1471 n_off = l + 1; | |
| 1472 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing | |
| 1473 for (i = 1; i < n_off; ++i) | |
| 1474 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; | |
| 1475 // merge adjacent blocks | |
| 1476 for (i = 1, l = 0; i < n_off; ++i) { | |
| 1477 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; | |
| 1478 else off[++l] = off[i]; | |
| 1479 } | |
| 1480 n_off = l + 1; | |
| 1481 iter->n_off = n_off; iter->off = off; | |
| 1482 return iter; | |
| 1483 } | |
| 1484 | |
| 1485 void hts_itr_destroy(hts_itr_t *iter) | |
| 1486 { | |
| 1487 if (iter) { free(iter->off); free(iter->bins.a); free(iter); } | |
| 1488 } | |
| 1489 | |
| 1490 const char *hts_parse_reg(const char *s, int *beg, int *end) | |
| 1491 { | |
| 1492 int i, k, l, name_end; | |
| 1493 *beg = *end = -1; | |
| 1494 name_end = l = strlen(s); | |
| 1495 // determine the sequence name | |
| 1496 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end | |
| 1497 if (i >= 0) name_end = i; | |
| 1498 if (name_end < l) { // check if this is really the end | |
| 1499 int n_hyphen = 0; | |
| 1500 for (i = name_end + 1; i < l; ++i) { | |
| 1501 if (s[i] == '-') ++n_hyphen; | |
| 1502 else if (!isdigit(s[i]) && s[i] != ',') break; | |
| 1503 } | |
| 1504 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name | |
| 1505 } | |
| 1506 // parse the interval | |
| 1507 if (name_end < l) { | |
| 1508 char *tmp; | |
| 1509 tmp = (char*)alloca(l - name_end + 1); | |
| 1510 for (i = name_end + 1, k = 0; i < l; ++i) | |
| 1511 if (s[i] != ',') tmp[k++] = s[i]; | |
| 1512 tmp[k] = 0; | |
| 1513 if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0; | |
| 1514 *end = *tmp? strtol(tmp + 1, &tmp, 10) : INT_MAX; | |
| 1515 if (*beg > *end) name_end = l; | |
| 1516 } | |
| 1517 if (name_end == l) *beg = 0, *end = INT_MAX; | |
| 1518 return s + name_end; | |
| 1519 } | |
| 1520 | |
| 1521 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) | |
| 1522 { | |
| 1523 int tid, beg, end; | |
| 1524 char *q, *tmp; | |
| 1525 if (strcmp(reg, ".") == 0) | |
| 1526 return itr_query(idx, HTS_IDX_START, 0, 0, readrec); | |
| 1527 else if (strcmp(reg, "*") != 0) { | |
| 1528 q = (char*)hts_parse_reg(reg, &beg, &end); | |
| 1529 tmp = (char*)alloca(q - reg + 1); | |
| 1530 strncpy(tmp, reg, q - reg); | |
| 1531 tmp[q - reg] = 0; | |
| 1532 if ((tid = getid(hdr, tmp)) < 0) | |
| 1533 tid = getid(hdr, reg); | |
| 1534 if (tid < 0) return 0; | |
| 1535 return itr_query(idx, tid, beg, end, readrec); | |
| 1536 } else return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); | |
| 1537 } | |
| 1538 | |
| 1539 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) | |
| 1540 { | |
| 1541 int ret, tid, beg, end; | |
| 1542 if (iter == NULL || iter->finished) return -1; | |
| 1543 if (iter->read_rest) { | |
| 1544 if (iter->curr_off) { // seek to the start | |
| 1545 bgzf_seek(fp, iter->curr_off, SEEK_SET); | |
| 1546 iter->curr_off = 0; // only seek once | |
| 1547 } | |
| 1548 ret = iter->readrec(fp, data, r, &tid, &beg, &end); | |
| 1549 if (ret < 0) iter->finished = 1; | |
| 1550 iter->curr_tid = tid; | |
| 1551 iter->curr_beg = beg; | |
| 1552 iter->curr_end = end; | |
| 1553 return ret; | |
| 1554 } | |
| 1555 if (iter->off == 0) return -1; | |
| 1556 for (;;) { | |
| 1557 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk | |
| 1558 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks | |
| 1559 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek | |
| 1560 bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET); | |
| 1561 iter->curr_off = bgzf_tell(fp); | |
| 1562 } | |
| 1563 ++iter->i; | |
| 1564 } | |
| 1565 if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) { | |
| 1566 iter->curr_off = bgzf_tell(fp); | |
| 1567 if (tid != iter->tid || beg >= iter->end) { // no need to proceed | |
| 1568 ret = -1; break; | |
| 1569 } else if (end > iter->beg && iter->end > beg) { | |
| 1570 iter->curr_tid = tid; | |
| 1571 iter->curr_beg = beg; | |
| 1572 iter->curr_end = end; | |
| 1573 return ret; | |
| 1574 } | |
| 1575 } else break; // end of file or error | |
| 1576 } | |
| 1577 iter->finished = 1; | |
| 1578 return ret; | |
| 1579 } | |
| 1580 | |
| 1581 /********************** | |
| 1582 *** Retrieve index *** | |
| 1583 **********************/ | |
| 1584 | |
| 1585 static char *test_and_fetch(const char *fn) | |
| 1586 { | |
| 1587 FILE *fp; | |
| 1588 if (hisremote(fn)) { | |
| 1589 const int buf_size = 1 * 1024 * 1024; | |
| 1590 hFILE *fp_remote; | |
| 1591 uint8_t *buf; | |
| 1592 int l; | |
| 1593 const char *p; | |
| 1594 for (p = fn + strlen(fn) - 1; p >= fn; --p) | |
| 1595 if (*p == '/') break; | |
| 1596 ++p; // p now points to the local file name | |
| 1597 // Attempt to open local file first | |
| 1598 if ((fp = fopen((char*)p, "rb")) != 0) | |
| 1599 { | |
| 1600 fclose(fp); | |
| 1601 return (char*)p; | |
| 1602 } | |
| 1603 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index. | |
| 1604 if ((fp_remote = hopen(fn, "r")) == 0) return 0; | |
| 1605 if ((fp = fopen(p, "w")) == 0) { | |
| 1606 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p); | |
| 1607 hclose_abruptly(fp_remote); | |
| 1608 return 0; | |
| 1609 } | |
| 1610 if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn); | |
| 1611 buf = (uint8_t*)calloc(buf_size, 1); | |
| 1612 while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp); | |
| 1613 free(buf); | |
| 1614 fclose(fp); | |
| 1615 if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn); | |
| 1616 return (char*)p; | |
| 1617 } else { | |
| 1618 if ((fp = fopen(fn, "rb")) == 0) return 0; | |
| 1619 fclose(fp); | |
| 1620 return (char*)fn; | |
| 1621 } | |
| 1622 } | |
| 1623 | |
| 1624 char *hts_idx_getfn(const char *fn, const char *ext) | |
| 1625 { | |
| 1626 int i, l_fn, l_ext; | |
| 1627 char *fnidx, *ret; | |
| 1628 l_fn = strlen(fn); l_ext = strlen(ext); | |
| 1629 fnidx = (char*)calloc(l_fn + l_ext + 1, 1); | |
| 1630 strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext); | |
| 1631 if ((ret = test_and_fetch(fnidx)) == 0) { | |
| 1632 for (i = l_fn - 1; i > 0; --i) | |
| 1633 if (fnidx[i] == '.') break; | |
| 1634 strcpy(fnidx + i, ext); | |
| 1635 ret = test_and_fetch(fnidx); | |
| 1636 } | |
| 1637 if (ret == 0) { | |
| 1638 free(fnidx); | |
| 1639 return 0; | |
| 1640 } | |
| 1641 l_fn = strlen(ret); | |
| 1642 memmove(fnidx, ret, l_fn + 1); | |
| 1643 return fnidx; | |
| 1644 } | |
| 1645 | |
| 1646 hts_idx_t *hts_idx_load(const char *fn, int fmt) | |
| 1647 { | |
| 1648 char *fnidx; | |
| 1649 hts_idx_t *idx; | |
| 1650 fnidx = hts_idx_getfn(fn, ".csi"); | |
| 1651 if (fnidx) fmt = HTS_FMT_CSI; | |
| 1652 else fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi"); | |
| 1653 if (fnidx == 0) return 0; | |
| 1654 | |
| 1655 // Check that the index file is up to date, the main file might have changed | |
| 1656 struct stat stat_idx,stat_main; | |
| 1657 if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) ) | |
| 1658 { | |
| 1659 if ( stat_idx.st_mtime < stat_main.st_mtime ) | |
| 1660 fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx); | |
| 1661 } | |
| 1662 idx = hts_idx_load_local(fnidx, fmt); | |
| 1663 free(fnidx); | |
| 1664 return idx; | |
| 1665 } |
