Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/hts.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions. | |
2 | |
3 Copyright (C) 2008, 2009, 2012-2015 Genome Research Ltd. | |
4 Copyright (C) 2012, 2013 Broad Institute. | |
5 | |
6 Author: Heng Li <lh3@sanger.ac.uk> | |
7 | |
8 Permission is hereby granted, free of charge, to any person obtaining a copy | |
9 of this software and associated documentation files (the "Software"), to deal | |
10 in the Software without restriction, including without limitation the rights | |
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
12 copies of the Software, and to permit persons to whom the Software is | |
13 furnished to do so, subject to the following conditions: | |
14 | |
15 The above copyright notice and this permission notice shall be included in | |
16 all copies or substantial portions of the Software. | |
17 | |
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
24 DEALINGS IN THE SOFTWARE. */ | |
25 | |
26 #include <zlib.h> | |
27 #include <ctype.h> | |
28 #include <stdio.h> | |
29 #include <string.h> | |
30 #include <stdlib.h> | |
31 #include <limits.h> | |
32 #include <fcntl.h> | |
33 #include <errno.h> | |
34 #include <sys/stat.h> | |
35 #include "htslib/bgzf.h" | |
36 #include "htslib/hts.h" | |
37 #include "cram/cram.h" | |
38 #include "htslib/hfile.h" | |
39 #include "version.h" | |
40 | |
41 #include "htslib/kseq.h" | |
42 #define KS_BGZF 1 | |
43 #if KS_BGZF | |
44 // bgzf now supports gzip-compressed files, the gzFile branch can be removed | |
45 KSTREAM_INIT2(, BGZF*, bgzf_read, 65536) | |
46 #else | |
47 KSTREAM_INIT2(, gzFile, gzread, 16384) | |
48 #endif | |
49 | |
50 #include "htslib/khash.h" | |
51 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal) | |
52 | |
53 int hts_verbose = 3; | |
54 | |
55 const char *hts_version() | |
56 { | |
57 return HTS_VERSION; | |
58 } | |
59 | |
60 const unsigned char seq_nt16_table[256] = { | |
61 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
62 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
63 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
64 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, | |
65 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, | |
66 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, | |
67 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, | |
68 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, | |
69 | |
70 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
71 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
72 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
73 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
74 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
75 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, | |
77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 | |
78 }; | |
79 | |
80 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN"; | |
81 | |
82 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; | |
83 | |
84 /********************** | |
85 *** Basic file I/O *** | |
86 **********************/ | |
87 | |
88 static enum htsFormatCategory format_category(enum htsExactFormat fmt) | |
89 { | |
90 switch (fmt) { | |
91 case bam: | |
92 case sam: | |
93 case cram: | |
94 return sequence_data; | |
95 | |
96 case vcf: | |
97 case bcf: | |
98 return variant_data; | |
99 | |
100 case bai: | |
101 case crai: | |
102 case csi: | |
103 case gzi: | |
104 case tbi: | |
105 return index_file; | |
106 | |
107 case bed: | |
108 return region_list; | |
109 | |
110 case unknown_format: | |
111 case binary_format: | |
112 case text_format: | |
113 case format_maximum: | |
114 break; | |
115 } | |
116 | |
117 return unknown_category; | |
118 } | |
119 | |
120 // Decompress up to ten or so bytes by peeking at the file, which must be | |
121 // positioned at the start of a GZIP block. | |
122 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) | |
123 { | |
124 // Typically at most a couple of hundred bytes of input are required | |
125 // to get a few bytes of output from inflate(), so hopefully this buffer | |
126 // size suffices in general. | |
127 unsigned char buffer[512]; | |
128 z_stream zs; | |
129 ssize_t npeek = hpeek(fp, buffer, sizeof buffer); | |
130 | |
131 if (npeek < 0) return 0; | |
132 | |
133 zs.zalloc = NULL; | |
134 zs.zfree = NULL; | |
135 zs.next_in = buffer; | |
136 zs.avail_in = npeek; | |
137 zs.next_out = dest; | |
138 zs.avail_out = destsize; | |
139 if (inflateInit2(&zs, 31) != Z_OK) return 0; | |
140 | |
141 while (zs.total_out < destsize) | |
142 if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break; | |
143 | |
144 destsize = zs.total_out; | |
145 inflateEnd(&zs); | |
146 | |
147 return destsize; | |
148 } | |
149 | |
150 // Parse "x.y" text, taking care because the string is not NUL-terminated | |
151 // and filling in major/minor only when the digits are followed by a delimiter, | |
152 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer. | |
153 static void | |
154 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim) | |
155 { | |
156 const char *str = (const char *) u; | |
157 const char *slim = (const char *) ulim; | |
158 const char *s; | |
159 | |
160 fmt->version.major = fmt->version.minor = -1; | |
161 | |
162 for (s = str; s < slim; s++) if (!isdigit(*s)) break; | |
163 if (s < slim) { | |
164 fmt->version.major = atoi(str); | |
165 if (*s == '.') { | |
166 str = &s[1]; | |
167 for (s = str; s < slim; s++) if (!isdigit(*s)) break; | |
168 if (s < slim) | |
169 fmt->version.minor = atoi(str); | |
170 } | |
171 else | |
172 fmt->version.minor = 0; | |
173 } | |
174 } | |
175 | |
176 int hts_detect_format(hFILE *hfile, htsFormat *fmt) | |
177 { | |
178 unsigned char s[21]; | |
179 ssize_t len = hpeek(hfile, s, 18); | |
180 if (len < 0) return -1; | |
181 | |
182 if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) { | |
183 // The stream is either gzip-compressed or BGZF-compressed. | |
184 // Determine which, and decompress the first few bytes. | |
185 fmt->compression = (len >= 18 && (s[3] & 4) && | |
186 memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip; | |
187 len = decompress_peek(hfile, s, sizeof s); | |
188 } | |
189 else { | |
190 fmt->compression = no_compression; | |
191 len = hpeek(hfile, s, sizeof s); | |
192 } | |
193 if (len < 0) return -1; | |
194 | |
195 fmt->compression_level = -1; | |
196 fmt->specific = NULL; | |
197 | |
198 if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) { | |
199 fmt->category = sequence_data; | |
200 fmt->format = cram; | |
201 fmt->version.major = s[4], fmt->version.minor = s[5]; | |
202 fmt->compression = custom; | |
203 return 0; | |
204 } | |
205 else if (len >= 4 && s[3] <= '\4') { | |
206 if (memcmp(s, "BAM\1", 4) == 0) { | |
207 fmt->category = sequence_data; | |
208 fmt->format = bam; | |
209 // TODO Decompress enough to pick version from @HD-VN header | |
210 fmt->version.major = 1, fmt->version.minor = -1; | |
211 return 0; | |
212 } | |
213 else if (memcmp(s, "BAI\1", 4) == 0) { | |
214 fmt->category = index_file; | |
215 fmt->format = bai; | |
216 fmt->version.major = -1, fmt->version.minor = -1; | |
217 return 0; | |
218 } | |
219 else if (memcmp(s, "BCF\4", 4) == 0) { | |
220 fmt->category = variant_data; | |
221 fmt->format = bcf; | |
222 fmt->version.major = 1, fmt->version.minor = -1; | |
223 return 0; | |
224 } | |
225 else if (memcmp(s, "BCF\2", 4) == 0) { | |
226 fmt->category = variant_data; | |
227 fmt->format = bcf; | |
228 fmt->version.major = s[3]; | |
229 fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0; | |
230 return 0; | |
231 } | |
232 else if (memcmp(s, "CSI\1", 4) == 0) { | |
233 fmt->category = index_file; | |
234 fmt->format = csi; | |
235 fmt->version.major = 1, fmt->version.minor = -1; | |
236 return 0; | |
237 } | |
238 else if (memcmp(s, "TBI\1", 4) == 0) { | |
239 fmt->category = index_file; | |
240 fmt->format = tbi; | |
241 fmt->version.major = -1, fmt->version.minor = -1; | |
242 return 0; | |
243 } | |
244 } | |
245 else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) { | |
246 fmt->category = variant_data; | |
247 fmt->format = vcf; | |
248 if (len >= 21 && s[16] == 'v') | |
249 parse_version(fmt, &s[17], &s[len]); | |
250 else | |
251 fmt->version.major = fmt->version.minor = -1; | |
252 return 0; | |
253 } | |
254 else if (len >= 4 && s[0] == '@' && | |
255 (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 || | |
256 memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) { | |
257 fmt->category = sequence_data; | |
258 fmt->format = sam; | |
259 // @HD-VN is not guaranteed to be the first tag, but then @HD is | |
260 // not guaranteed to be present at all... | |
261 if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0) | |
262 parse_version(fmt, &s[7], &s[len]); | |
263 else | |
264 fmt->version.major = 1, fmt->version.minor = -1; | |
265 return 0; | |
266 } | |
267 else { | |
268 // Various possibilities for tab-delimited text: | |
269 // .crai (gzipped tab-delimited six columns: seqid 5*number) | |
270 // .bed ([3..12] tab-delimited columns) | |
271 // .bedpe (>= 10 tab-delimited columns) | |
272 // .sam (tab-delimited >= 11 columns: seqid number seqid...) | |
273 // FIXME For now, assume it's SAM | |
274 fmt->category = sequence_data; | |
275 fmt->format = sam; | |
276 fmt->version.major = 1, fmt->version.minor = -1; | |
277 return 0; | |
278 } | |
279 | |
280 fmt->category = unknown_category; | |
281 fmt->format = unknown_format; | |
282 fmt->version.major = fmt->version.minor = -1; | |
283 fmt->compression = no_compression; | |
284 return 0; | |
285 } | |
286 | |
287 char *hts_format_description(const htsFormat *format) | |
288 { | |
289 kstring_t str = { 0, 0, NULL }; | |
290 | |
291 switch (format->format) { | |
292 case sam: kputs("SAM", &str); break; | |
293 case bam: kputs("BAM", &str); break; | |
294 case cram: kputs("CRAM", &str); break; | |
295 case vcf: kputs("VCF", &str); break; | |
296 case bcf: | |
297 if (format->version.major == 1) kputs("Legacy BCF", &str); | |
298 else kputs("BCF", &str); | |
299 break; | |
300 case bai: kputs("BAI", &str); break; | |
301 case crai: kputs("CRAI", &str); break; | |
302 case csi: kputs("CSI", &str); break; | |
303 case tbi: kputs("Tabix", &str); break; | |
304 default: kputs("unknown", &str); break; | |
305 } | |
306 | |
307 if (format->version.major >= 0) { | |
308 kputs(" version ", &str); | |
309 kputw(format->version.major, &str); | |
310 if (format->version.minor >= 0) { | |
311 kputc('.', &str); | |
312 kputw(format->version.minor, &str); | |
313 } | |
314 } | |
315 | |
316 switch (format->compression) { | |
317 case custom: kputs(" compressed", &str); break; | |
318 case gzip: kputs(" gzip-compressed", &str); break; | |
319 case bgzf: | |
320 switch (format->format) { | |
321 case bam: | |
322 case bcf: | |
323 case csi: | |
324 case tbi: | |
325 // These are by definition BGZF, so just use the generic term | |
326 kputs(" compressed", &str); | |
327 break; | |
328 default: | |
329 kputs(" BGZF-compressed", &str); | |
330 break; | |
331 } | |
332 break; | |
333 default: break; | |
334 } | |
335 | |
336 switch (format->category) { | |
337 case sequence_data: kputs(" sequence", &str); break; | |
338 case variant_data: kputs(" variant calling", &str); break; | |
339 case index_file: kputs(" index", &str); break; | |
340 case region_list: kputs(" genomic region", &str); break; | |
341 default: break; | |
342 } | |
343 | |
344 if (format->compression == no_compression) | |
345 switch (format->format) { | |
346 case sam: | |
347 case crai: | |
348 case vcf: | |
349 case bed: | |
350 kputs(" text", &str); | |
351 break; | |
352 | |
353 default: | |
354 kputs(" data", &str); | |
355 break; | |
356 } | |
357 else | |
358 kputs(" data", &str); | |
359 | |
360 return ks_release(&str); | |
361 } | |
362 | |
363 htsFile *hts_open(const char *fn, const char *mode) | |
364 { | |
365 htsFile *fp = NULL; | |
366 hFILE *hfile = hopen(fn, mode); | |
367 if (hfile == NULL) goto error; | |
368 | |
369 fp = hts_hopen(hfile, fn, mode); | |
370 if (fp == NULL) goto error; | |
371 | |
372 return fp; | |
373 | |
374 error: | |
375 if (hts_verbose >= 2) | |
376 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn); | |
377 | |
378 if (hfile) | |
379 hclose_abruptly(hfile); | |
380 | |
381 return NULL; | |
382 } | |
383 | |
384 htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode) | |
385 { | |
386 htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile)); | |
387 if (fp == NULL) goto error; | |
388 | |
389 fp->fn = strdup(fn); | |
390 fp->is_be = ed_is_big(); | |
391 | |
392 if (strchr(mode, 'r')) { | |
393 if (hts_detect_format(hfile, &fp->format) < 0) goto error; | |
394 } | |
395 else if (strchr(mode, 'w') || strchr(mode, 'a')) { | |
396 htsFormat *fmt = &fp->format; | |
397 fp->is_write = 1; | |
398 | |
399 if (strchr(mode, 'b')) fmt->format = binary_format; | |
400 else if (strchr(mode, 'c')) fmt->format = cram; | |
401 else fmt->format = text_format; | |
402 | |
403 if (strchr(mode, 'z')) fmt->compression = bgzf; | |
404 else if (strchr(mode, 'g')) fmt->compression = gzip; | |
405 else if (strchr(mode, 'u')) fmt->compression = no_compression; | |
406 else { | |
407 // No compression mode specified, set to the default for the format | |
408 switch (fmt->format) { | |
409 case binary_format: fmt->compression = bgzf; break; | |
410 case cram: fmt->compression = custom; break; | |
411 case text_format: fmt->compression = no_compression; break; | |
412 default: abort(); | |
413 } | |
414 } | |
415 | |
416 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF) | |
417 fmt->category = format_category(fmt->format); | |
418 | |
419 fmt->version.major = fmt->version.minor = -1; | |
420 fmt->compression_level = -1; | |
421 fmt->specific = NULL; | |
422 } | |
423 else goto error; | |
424 | |
425 switch (fp->format.format) { | |
426 case binary_format: | |
427 case bam: | |
428 case bcf: | |
429 fp->fp.bgzf = bgzf_hopen(hfile, mode); | |
430 if (fp->fp.bgzf == NULL) goto error; | |
431 fp->is_bin = 1; | |
432 break; | |
433 | |
434 case cram: | |
435 fp->fp.cram = cram_dopen(hfile, fn, mode); | |
436 if (fp->fp.cram == NULL) goto error; | |
437 if (!fp->is_write) | |
438 cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1); | |
439 fp->is_cram = 1; | |
440 break; | |
441 | |
442 case text_format: | |
443 case sam: | |
444 case vcf: | |
445 if (!fp->is_write) { | |
446 #if KS_BGZF | |
447 BGZF *gzfp = bgzf_hopen(hfile, mode); | |
448 #else | |
449 // TODO Implement gzip hFILE adaptor | |
450 hclose(hfile); // This won't work, especially for stdin | |
451 gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb"); | |
452 #endif | |
453 if (gzfp) fp->fp.voidp = ks_init(gzfp); | |
454 else goto error; | |
455 } | |
456 else if (fp->format.compression != no_compression) { | |
457 fp->fp.bgzf = bgzf_hopen(hfile, mode); | |
458 if (fp->fp.bgzf == NULL) goto error; | |
459 } | |
460 else | |
461 fp->fp.hfile = hfile; | |
462 break; | |
463 | |
464 default: | |
465 goto error; | |
466 } | |
467 | |
468 return fp; | |
469 | |
470 error: | |
471 if (hts_verbose >= 2) | |
472 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn); | |
473 | |
474 if (fp) { | |
475 free(fp->fn); | |
476 free(fp->fn_aux); | |
477 free(fp); | |
478 } | |
479 return NULL; | |
480 } | |
481 | |
482 int hts_close(htsFile *fp) | |
483 { | |
484 int ret, save; | |
485 | |
486 switch (fp->format.format) { | |
487 case binary_format: | |
488 case bam: | |
489 case bcf: | |
490 ret = bgzf_close(fp->fp.bgzf); | |
491 break; | |
492 | |
493 case cram: | |
494 if (!fp->is_write) { | |
495 switch (cram_eof(fp->fp.cram)) { | |
496 case 0: | |
497 fprintf(stderr, "[E::%s] Failed to decode sequence.\n", __func__); | |
498 return -1; | |
499 case 2: | |
500 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__); | |
501 break; | |
502 default: /* case 1, expected EOF */ | |
503 break; | |
504 } | |
505 } | |
506 ret = cram_close(fp->fp.cram); | |
507 break; | |
508 | |
509 case text_format: | |
510 case sam: | |
511 case vcf: | |
512 if (!fp->is_write) { | |
513 #if KS_BGZF | |
514 BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f; | |
515 ret = bgzf_close(gzfp); | |
516 #else | |
517 gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f; | |
518 ret = gzclose(gzfp); | |
519 #endif | |
520 ks_destroy((kstream_t*)fp->fp.voidp); | |
521 } | |
522 else if (fp->format.compression != no_compression) | |
523 ret = bgzf_close(fp->fp.bgzf); | |
524 else | |
525 ret = hclose(fp->fp.hfile); | |
526 break; | |
527 | |
528 default: | |
529 ret = -1; | |
530 break; | |
531 } | |
532 | |
533 save = errno; | |
534 free(fp->fn); | |
535 free(fp->fn_aux); | |
536 free(fp->line.s); | |
537 free(fp); | |
538 errno = save; | |
539 return ret; | |
540 } | |
541 | |
542 const htsFormat *hts_get_format(htsFile *fp) | |
543 { | |
544 return fp? &fp->format : NULL; | |
545 } | |
546 | |
547 int hts_set_opt(htsFile *fp, enum cram_option opt, ...) { | |
548 int r; | |
549 va_list args; | |
550 | |
551 if (fp->format.format != cram) | |
552 return 0; | |
553 | |
554 va_start(args, opt); | |
555 r = cram_set_voption(fp->fp.cram, opt, args); | |
556 va_end(args); | |
557 | |
558 return r; | |
559 } | |
560 | |
561 int hts_set_threads(htsFile *fp, int n) | |
562 { | |
563 if (fp->format.compression == bgzf) { | |
564 return bgzf_mt(fp->fp.bgzf, n, 256); | |
565 } else if (fp->format.format == cram) { | |
566 return hts_set_opt(fp, CRAM_OPT_NTHREADS, n); | |
567 } | |
568 else return 0; | |
569 } | |
570 | |
571 int hts_set_fai_filename(htsFile *fp, const char *fn_aux) | |
572 { | |
573 free(fp->fn_aux); | |
574 if (fn_aux) { | |
575 fp->fn_aux = strdup(fn_aux); | |
576 if (fp->fn_aux == NULL) return -1; | |
577 } | |
578 else fp->fn_aux = NULL; | |
579 | |
580 if (fp->format.format == cram) | |
581 cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux); | |
582 | |
583 return 0; | |
584 } | |
585 | |
586 // For VCF/BCF backward sweeper. Not exposing these functions because their | |
587 // future is uncertain. Things will probably have to change with hFILE... | |
588 BGZF *hts_get_bgzfp(htsFile *fp) | |
589 { | |
590 if ( fp->is_bin ) | |
591 return fp->fp.bgzf; | |
592 else | |
593 return ((kstream_t*)fp->fp.voidp)->f; | |
594 } | |
595 int hts_useek(htsFile *fp, long uoffset, int where) | |
596 { | |
597 if ( fp->is_bin ) | |
598 return bgzf_useek(fp->fp.bgzf, uoffset, where); | |
599 else | |
600 { | |
601 ks_rewind((kstream_t*)fp->fp.voidp); | |
602 ((kstream_t*)fp->fp.voidp)->seek_pos = uoffset; | |
603 return bgzf_useek(((kstream_t*)fp->fp.voidp)->f, uoffset, where); | |
604 } | |
605 } | |
606 long hts_utell(htsFile *fp) | |
607 { | |
608 if ( fp->is_bin ) | |
609 return bgzf_utell(fp->fp.bgzf); | |
610 else | |
611 return ((kstream_t*)fp->fp.voidp)->seek_pos; | |
612 } | |
613 | |
614 int hts_getline(htsFile *fp, int delimiter, kstring_t *str) | |
615 { | |
616 int ret, dret; | |
617 ret = ks_getuntil((kstream_t*)fp->fp.voidp, delimiter, str, &dret); | |
618 ++fp->lineno; | |
619 return ret; | |
620 } | |
621 | |
622 char **hts_readlist(const char *string, int is_file, int *_n) | |
623 { | |
624 int m = 0, n = 0, dret; | |
625 char **s = 0; | |
626 if ( is_file ) | |
627 { | |
628 #if KS_BGZF | |
629 BGZF *fp = bgzf_open(string, "r"); | |
630 #else | |
631 gzFile fp = gzopen(string, "r"); | |
632 #endif | |
633 if ( !fp ) return NULL; | |
634 | |
635 kstream_t *ks; | |
636 kstring_t str; | |
637 str.s = 0; str.l = str.m = 0; | |
638 ks = ks_init(fp); | |
639 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) | |
640 { | |
641 if (str.l == 0) continue; | |
642 n++; | |
643 hts_expand(char*,n,m,s); | |
644 s[n-1] = strdup(str.s); | |
645 } | |
646 ks_destroy(ks); | |
647 #if KS_BGZF | |
648 bgzf_close(fp); | |
649 #else | |
650 gzclose(fp); | |
651 #endif | |
652 free(str.s); | |
653 } | |
654 else | |
655 { | |
656 const char *q = string, *p = string; | |
657 while ( 1 ) | |
658 { | |
659 if (*p == ',' || *p == 0) | |
660 { | |
661 n++; | |
662 hts_expand(char*,n,m,s); | |
663 s[n-1] = (char*)calloc(p - q + 1, 1); | |
664 strncpy(s[n-1], q, p - q); | |
665 q = p + 1; | |
666 } | |
667 if ( !*p ) break; | |
668 p++; | |
669 } | |
670 } | |
671 s = (char**)realloc(s, n * sizeof(char*)); | |
672 *_n = n; | |
673 return s; | |
674 } | |
675 | |
676 char **hts_readlines(const char *fn, int *_n) | |
677 { | |
678 int m = 0, n = 0, dret; | |
679 char **s = 0; | |
680 #if KS_BGZF | |
681 BGZF *fp = bgzf_open(fn, "r"); | |
682 #else | |
683 gzFile fp = gzopen(fn, "r"); | |
684 #endif | |
685 if ( fp ) { // read from file | |
686 kstream_t *ks; | |
687 kstring_t str; | |
688 str.s = 0; str.l = str.m = 0; | |
689 ks = ks_init(fp); | |
690 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { | |
691 if (str.l == 0) continue; | |
692 if (m == n) { | |
693 m = m? m<<1 : 16; | |
694 s = (char**)realloc(s, m * sizeof(char*)); | |
695 } | |
696 s[n++] = strdup(str.s); | |
697 } | |
698 ks_destroy(ks); | |
699 #if KS_BGZF | |
700 bgzf_close(fp); | |
701 #else | |
702 gzclose(fp); | |
703 #endif | |
704 s = (char**)realloc(s, n * sizeof(char*)); | |
705 free(str.s); | |
706 } else if (*fn == ':') { // read from string | |
707 const char *q, *p; | |
708 for (q = p = fn + 1;; ++p) | |
709 if (*p == ',' || *p == 0) { | |
710 if (m == n) { | |
711 m = m? m<<1 : 16; | |
712 s = (char**)realloc(s, m * sizeof(char*)); | |
713 } | |
714 s[n] = (char*)calloc(p - q + 1, 1); | |
715 strncpy(s[n++], q, p - q); | |
716 q = p + 1; | |
717 if (*p == 0) break; | |
718 } | |
719 } else return 0; | |
720 s = (char**)realloc(s, n * sizeof(char*)); | |
721 *_n = n; | |
722 return s; | |
723 } | |
724 | |
725 // DEPRECATED: To be removed in a future HTSlib release | |
726 int hts_file_type(const char *fname) | |
727 { | |
728 int len = strlen(fname); | |
729 if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ; | |
730 if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF; | |
731 if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ; | |
732 if ( !strcmp("-",fname) ) return FT_STDIN; | |
733 | |
734 hFILE *f = hopen(fname, "r"); | |
735 if (f == NULL) return 0; | |
736 | |
737 htsFormat fmt; | |
738 if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; } | |
739 if (hclose(f) < 0) return 0; | |
740 | |
741 switch (fmt.format) { | |
742 case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ; | |
743 case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ; | |
744 default: return 0; | |
745 } | |
746 } | |
747 | |
748 /**************** | |
749 *** Indexing *** | |
750 ****************/ | |
751 | |
752 #define HTS_MIN_MARKER_DIST 0x10000 | |
753 | |
754 // Finds the special meta bin | |
755 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1 | |
756 #define META_BIN(idx) ((idx)->n_bins + 1) | |
757 | |
758 #define pair64_lt(a,b) ((a).u < (b).u) | |
759 | |
760 #include "htslib/ksort.h" | |
761 KSORT_INIT(_off, hts_pair64_t, pair64_lt) | |
762 | |
763 typedef struct { | |
764 int32_t m, n; | |
765 uint64_t loff; | |
766 hts_pair64_t *list; | |
767 } bins_t; | |
768 | |
769 #include "htslib/khash.h" | |
770 KHASH_MAP_INIT_INT(bin, bins_t) | |
771 typedef khash_t(bin) bidx_t; | |
772 | |
773 typedef struct { | |
774 int32_t n, m; | |
775 uint64_t *offset; | |
776 } lidx_t; | |
777 | |
778 struct __hts_idx_t { | |
779 int fmt, min_shift, n_lvls, n_bins; | |
780 uint32_t l_meta; | |
781 int32_t n, m; | |
782 uint64_t n_no_coor; | |
783 bidx_t **bidx; | |
784 lidx_t *lidx; | |
785 uint8_t *meta; | |
786 struct { | |
787 uint32_t last_bin, save_bin; | |
788 int last_coor, last_tid, save_tid, finished; | |
789 uint64_t last_off, save_off; | |
790 uint64_t off_beg, off_end; | |
791 uint64_t n_mapped, n_unmapped; | |
792 } z; // keep internal states | |
793 }; | |
794 | |
795 static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) | |
796 { | |
797 khint_t k; | |
798 bins_t *l; | |
799 int absent; | |
800 k = kh_put(bin, b, bin, &absent); | |
801 l = &kh_value(b, k); | |
802 if (absent) { | |
803 l->m = 1; l->n = 0; | |
804 l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t)); | |
805 } | |
806 if (l->n == l->m) { | |
807 l->m <<= 1; | |
808 l->list = (hts_pair64_t*)realloc(l->list, l->m * sizeof(hts_pair64_t)); | |
809 } | |
810 l->list[l->n].u = beg; | |
811 l->list[l->n++].v = end; | |
812 } | |
813 | |
814 static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) | |
815 { | |
816 int i, beg, end; | |
817 beg = _beg >> min_shift; | |
818 end = (_end - 1) >> min_shift; | |
819 if (l->m < end + 1) { | |
820 int old_m = l->m; | |
821 l->m = end + 1; | |
822 kroundup32(l->m); | |
823 l->offset = (uint64_t*)realloc(l->offset, l->m * sizeof(uint64_t)); | |
824 memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1 | |
825 } | |
826 if (beg == end) { // to save a loop in this case | |
827 if (l->offset[beg] == (uint64_t)-1) l->offset[beg] = offset; | |
828 } else { | |
829 for (i = beg; i <= end; ++i) | |
830 if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset; | |
831 } | |
832 if (l->n < end + 1) l->n = end + 1; | |
833 } | |
834 | |
835 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) | |
836 { | |
837 hts_idx_t *idx; | |
838 idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t)); | |
839 if (idx == NULL) return NULL; | |
840 idx->fmt = fmt; | |
841 idx->min_shift = min_shift; | |
842 idx->n_lvls = n_lvls; | |
843 idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7; | |
844 idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu; | |
845 idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0; | |
846 idx->z.last_coor = 0xffffffffu; | |
847 if (n) { | |
848 idx->n = idx->m = n; | |
849 idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*)); | |
850 if (idx->bidx == NULL) { free(idx); return NULL; } | |
851 idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t)); | |
852 if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; } | |
853 } | |
854 return idx; | |
855 } | |
856 | |
857 static void update_loff(hts_idx_t *idx, int i, int free_lidx) | |
858 { | |
859 bidx_t *bidx = idx->bidx[i]; | |
860 lidx_t *lidx = &idx->lidx[i]; | |
861 khint_t k; | |
862 int l; | |
863 uint64_t offset0 = 0; | |
864 if (bidx) { | |
865 k = kh_get(bin, bidx, META_BIN(idx)); | |
866 if (k != kh_end(bidx)) | |
867 offset0 = kh_val(bidx, k).list[0].u; | |
868 for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l) | |
869 lidx->offset[l] = offset0; | |
870 } else l = 1; | |
871 for (; l < lidx->n; ++l) // fill missing values | |
872 if (lidx->offset[l] == (uint64_t)-1) | |
873 lidx->offset[l] = lidx->offset[l-1]; | |
874 if (bidx == 0) return; | |
875 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff | |
876 if (kh_exist(bidx, k)) | |
877 { | |
878 if ( kh_key(bidx, k) < idx->n_bins ) | |
879 { | |
880 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls); | |
881 // disable linear index if bot_bin out of bounds | |
882 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0; | |
883 } | |
884 else | |
885 kh_val(bidx, k).loff = 0; | |
886 } | |
887 if (free_lidx) { | |
888 free(lidx->offset); | |
889 lidx->m = lidx->n = 0; | |
890 lidx->offset = 0; | |
891 } | |
892 } | |
893 | |
894 static void compress_binning(hts_idx_t *idx, int i) | |
895 { | |
896 bidx_t *bidx = idx->bidx[i]; | |
897 khint_t k; | |
898 int l, m; | |
899 if (bidx == 0) return; | |
900 // merge a bin to its parent if the bin is too small | |
901 for (l = idx->n_lvls; l > 0; --l) { | |
902 unsigned start = hts_bin_first(l); | |
903 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { | |
904 bins_t *p, *q; | |
905 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue; | |
906 p = &kh_value(bidx, k); | |
907 if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list); | |
908 if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) { | |
909 khint_t kp; | |
910 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k))); | |
911 if (kp == kh_end(bidx)) continue; | |
912 q = &kh_val(bidx, kp); | |
913 if (q->n + p->n > q->m) { | |
914 q->m = q->n + p->n; | |
915 kroundup32(q->m); | |
916 q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t)); | |
917 } | |
918 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t)); | |
919 q->n += p->n; | |
920 free(p->list); | |
921 kh_del(bin, bidx, k); | |
922 } | |
923 } | |
924 } | |
925 k = kh_get(bin, bidx, 0); | |
926 if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list); | |
927 // merge adjacent chunks that start from the same BGZF block | |
928 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { | |
929 bins_t *p; | |
930 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue; | |
931 p = &kh_value(bidx, k); | |
932 for (l = 1, m = 0; l < p->n; ++l) { | |
933 if (p->list[m].v>>16 >= p->list[l].u>>16) { | |
934 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v; | |
935 } else p->list[++m] = p->list[l]; | |
936 } | |
937 p->n = m + 1; | |
938 } | |
939 } | |
940 | |
941 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) | |
942 { | |
943 int i; | |
944 if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times | |
945 if (idx->z.save_tid >= 0) { | |
946 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset); | |
947 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset); | |
948 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); | |
949 } | |
950 for (i = 0; i < idx->n; ++i) { | |
951 update_loff(idx, i, (idx->fmt == HTS_FMT_CSI)); | |
952 compress_binning(idx, i); | |
953 } | |
954 idx->z.finished = 1; | |
955 } | |
956 | |
957 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) | |
958 { | |
959 int bin; | |
960 if (tid<0) beg = -1, end = 0; | |
961 if (tid >= idx->m) { // enlarge the index | |
962 int32_t oldm = idx->m; | |
963 idx->m = idx->m? idx->m<<1 : 2; | |
964 idx->bidx = (bidx_t**)realloc(idx->bidx, idx->m * sizeof(bidx_t*)); | |
965 idx->lidx = (lidx_t*) realloc(idx->lidx, idx->m * sizeof(lidx_t)); | |
966 memset(&idx->bidx[oldm], 0, (idx->m - oldm) * sizeof(bidx_t*)); | |
967 memset(&idx->lidx[oldm], 0, (idx->m - oldm) * sizeof(lidx_t)); | |
968 } | |
969 if (idx->n < tid + 1) idx->n = tid + 1; | |
970 if (idx->z.finished) return 0; | |
971 if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome | |
972 if ( tid>=0 && idx->n_no_coor ) | |
973 { | |
974 if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid); | |
975 return -1; | |
976 } | |
977 if (tid>=0 && idx->bidx[tid] != 0) | |
978 { | |
979 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__); | |
980 return -1; | |
981 } | |
982 idx->z.last_tid = tid; | |
983 idx->z.last_bin = 0xffffffffu; | |
984 } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order | |
985 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions\n", __func__); | |
986 return -1; | |
987 } | |
988 if ( tid>=0 ) | |
989 { | |
990 if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin); | |
991 if ( is_mapped) | |
992 insert_to_l(&idx->lidx[tid], beg, end, idx->z.last_off, idx->min_shift); // last_off points to the start of the current record | |
993 } | |
994 else idx->n_no_coor++; | |
995 bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls); | |
996 if ((int)idx->z.last_bin != bin) { // then possibly write the binning index | |
997 if (idx->z.save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record | |
998 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, idx->z.last_off); | |
999 if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information | |
1000 idx->z.off_end = idx->z.last_off; | |
1001 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, idx->z.off_end); | |
1002 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); | |
1003 idx->z.n_mapped = idx->z.n_unmapped = 0; | |
1004 idx->z.off_beg = idx->z.off_end; | |
1005 } | |
1006 idx->z.save_off = idx->z.last_off; | |
1007 idx->z.save_bin = idx->z.last_bin = bin; | |
1008 idx->z.save_tid = tid; | |
1009 } | |
1010 if (is_mapped) ++idx->z.n_mapped; | |
1011 else ++idx->z.n_unmapped; | |
1012 idx->z.last_off = offset; | |
1013 idx->z.last_coor = beg; | |
1014 return 0; | |
1015 } | |
1016 | |
1017 void hts_idx_destroy(hts_idx_t *idx) | |
1018 { | |
1019 khint_t k; | |
1020 int i; | |
1021 if (idx == 0) return; | |
1022 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c | |
1023 if (idx->fmt == HTS_FMT_CRAI) { free(idx); return; } | |
1024 | |
1025 for (i = 0; i < idx->m; ++i) { | |
1026 bidx_t *bidx = idx->bidx[i]; | |
1027 free(idx->lidx[i].offset); | |
1028 if (bidx == 0) continue; | |
1029 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) | |
1030 if (kh_exist(bidx, k)) | |
1031 free(kh_value(bidx, k).list); | |
1032 kh_destroy(bin, bidx); | |
1033 } | |
1034 free(idx->bidx); free(idx->lidx); free(idx->meta); | |
1035 free(idx); | |
1036 } | |
1037 | |
1038 static inline long idx_read(int is_bgzf, void *fp, void *buf, long l) | |
1039 { | |
1040 if (is_bgzf) return bgzf_read((BGZF*)fp, buf, l); | |
1041 else return (long)fread(buf, 1, l, (FILE*)fp); | |
1042 } | |
1043 | |
1044 static inline long idx_write(int is_bgzf, void *fp, const void *buf, long l) | |
1045 { | |
1046 if (is_bgzf) return bgzf_write((BGZF*)fp, buf, l); | |
1047 else return (long)fwrite(buf, 1, l, (FILE*)fp); | |
1048 } | |
1049 | |
1050 static inline void swap_bins(bins_t *p) | |
1051 { | |
1052 int i; | |
1053 for (i = 0; i < p->n; ++i) { | |
1054 ed_swap_8p(&p->list[i].u); | |
1055 ed_swap_8p(&p->list[i].v); | |
1056 } | |
1057 } | |
1058 | |
1059 static void hts_idx_save_core(const hts_idx_t *idx, void *fp, int fmt) | |
1060 { | |
1061 int32_t i, size, is_be; | |
1062 int is_bgzf = (fmt != HTS_FMT_BAI); | |
1063 is_be = ed_is_big(); | |
1064 if (is_be) { | |
1065 uint32_t x = idx->n; | |
1066 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
1067 } else idx_write(is_bgzf, fp, &idx->n, 4); | |
1068 if (fmt == HTS_FMT_TBI && idx->l_meta) idx_write(is_bgzf, fp, idx->meta, idx->l_meta); | |
1069 for (i = 0; i < idx->n; ++i) { | |
1070 khint_t k; | |
1071 bidx_t *bidx = idx->bidx[i]; | |
1072 lidx_t *lidx = &idx->lidx[i]; | |
1073 // write binning index | |
1074 size = bidx? kh_size(bidx) : 0; | |
1075 if (is_be) { // big endian | |
1076 uint32_t x = size; | |
1077 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
1078 } else idx_write(is_bgzf, fp, &size, 4); | |
1079 if (bidx == 0) goto write_lidx; | |
1080 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { | |
1081 bins_t *p; | |
1082 if (!kh_exist(bidx, k)) continue; | |
1083 p = &kh_value(bidx, k); | |
1084 if (is_be) { // big endian | |
1085 uint32_t x; | |
1086 x = kh_key(bidx, k); idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
1087 if (fmt == HTS_FMT_CSI) { | |
1088 uint64_t y = kh_val(bidx, k).loff; | |
1089 idx_write(is_bgzf, fp, ed_swap_4p(&y), 8); | |
1090 } | |
1091 x = p->n; idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
1092 swap_bins(p); | |
1093 idx_write(is_bgzf, fp, p->list, 16 * p->n); | |
1094 swap_bins(p); | |
1095 } else { | |
1096 idx_write(is_bgzf, fp, &kh_key(bidx, k), 4); | |
1097 if (fmt == HTS_FMT_CSI) idx_write(is_bgzf, fp, &kh_val(bidx, k).loff, 8); | |
1098 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v); | |
1099 idx_write(is_bgzf, fp, &p->n, 4); | |
1100 idx_write(is_bgzf, fp, p->list, p->n << 4); | |
1101 } | |
1102 } | |
1103 write_lidx: | |
1104 if (fmt != HTS_FMT_CSI) { | |
1105 if (is_be) { | |
1106 int32_t x = lidx->n; | |
1107 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); | |
1108 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]); | |
1109 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3); | |
1110 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]); | |
1111 } else { | |
1112 idx_write(is_bgzf, fp, &lidx->n, 4); | |
1113 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3); | |
1114 } | |
1115 } | |
1116 } | |
1117 if (is_be) { // write the number of reads without coordinates | |
1118 uint64_t x = idx->n_no_coor; | |
1119 idx_write(is_bgzf, fp, &x, 8); | |
1120 } else idx_write(is_bgzf, fp, &idx->n_no_coor, 8); | |
1121 } | |
1122 | |
1123 void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) | |
1124 { | |
1125 char *fnidx; | |
1126 fnidx = (char*)calloc(1, strlen(fn) + 5); | |
1127 strcpy(fnidx, fn); | |
1128 if (fmt == HTS_FMT_CSI) { | |
1129 BGZF *fp; | |
1130 uint32_t x[3]; | |
1131 int is_be, i; | |
1132 is_be = ed_is_big(); | |
1133 fp = bgzf_open(strcat(fnidx, ".csi"), "w"); | |
1134 bgzf_write(fp, "CSI\1", 4); | |
1135 x[0] = idx->min_shift; x[1] = idx->n_lvls; x[2] = idx->l_meta; | |
1136 if (is_be) { | |
1137 for (i = 0; i < 3; ++i) | |
1138 bgzf_write(fp, ed_swap_4p(&x[i]), 4); | |
1139 } else bgzf_write(fp, &x, 12); | |
1140 if (idx->l_meta) bgzf_write(fp, idx->meta, idx->l_meta); | |
1141 hts_idx_save_core(idx, fp, HTS_FMT_CSI); | |
1142 bgzf_close(fp); | |
1143 } else if (fmt == HTS_FMT_TBI) { | |
1144 BGZF *fp; | |
1145 fp = bgzf_open(strcat(fnidx, ".tbi"), "w"); | |
1146 bgzf_write(fp, "TBI\1", 4); | |
1147 hts_idx_save_core(idx, fp, HTS_FMT_TBI); | |
1148 bgzf_close(fp); | |
1149 } else if (fmt == HTS_FMT_BAI) { | |
1150 FILE *fp; | |
1151 fp = fopen(strcat(fnidx, ".bai"), "w"); | |
1152 fwrite("BAI\1", 1, 4, fp); | |
1153 hts_idx_save_core(idx, fp, HTS_FMT_BAI); | |
1154 fclose(fp); | |
1155 } else abort(); | |
1156 free(fnidx); | |
1157 } | |
1158 | |
1159 static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt) | |
1160 { | |
1161 int32_t i, n, is_be; | |
1162 int is_bgzf = (fmt != HTS_FMT_BAI); | |
1163 is_be = ed_is_big(); | |
1164 if (idx == NULL) return -4; | |
1165 for (i = 0; i < idx->n; ++i) { | |
1166 bidx_t *h; | |
1167 lidx_t *l = &idx->lidx[i]; | |
1168 uint32_t key; | |
1169 int j, absent; | |
1170 bins_t *p; | |
1171 h = idx->bidx[i] = kh_init(bin); | |
1172 if (idx_read(is_bgzf, fp, &n, 4) != 4) return -1; | |
1173 if (is_be) ed_swap_4p(&n); | |
1174 for (j = 0; j < n; ++j) { | |
1175 khint_t k; | |
1176 if (idx_read(is_bgzf, fp, &key, 4) != 4) return -1; | |
1177 if (is_be) ed_swap_4p(&key); | |
1178 k = kh_put(bin, h, key, &absent); | |
1179 if (absent <= 0) return -3; // Duplicate bin number | |
1180 p = &kh_val(h, k); | |
1181 if (fmt == HTS_FMT_CSI) { | |
1182 if (idx_read(is_bgzf, fp, &p->loff, 8) != 8) return -1; | |
1183 if (is_be) ed_swap_8p(&p->loff); | |
1184 } else p->loff = 0; | |
1185 if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1; | |
1186 if (is_be) ed_swap_4p(&p->n); | |
1187 p->m = p->n; | |
1188 p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t)); | |
1189 if (p->list == NULL) return -2; | |
1190 if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1; | |
1191 if (is_be) swap_bins(p); | |
1192 } | |
1193 if (fmt != HTS_FMT_CSI) { // load linear index | |
1194 int j; | |
1195 if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1; | |
1196 if (is_be) ed_swap_4p(&l->n); | |
1197 l->m = l->n; | |
1198 l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t)); | |
1199 if (l->offset == NULL) return -2; | |
1200 if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1; | |
1201 if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]); | |
1202 for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix | |
1203 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1]; | |
1204 update_loff(idx, i, 1); | |
1205 } | |
1206 } | |
1207 if (idx_read(is_bgzf, fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0; | |
1208 if (is_be) ed_swap_8p(&idx->n_no_coor); | |
1209 return 0; | |
1210 } | |
1211 | |
1212 hts_idx_t *hts_idx_load_local(const char *fn, int fmt) | |
1213 { | |
1214 uint8_t magic[4]; | |
1215 int i, is_be; | |
1216 hts_idx_t *idx = NULL; | |
1217 is_be = ed_is_big(); | |
1218 if (fmt == HTS_FMT_CSI) { | |
1219 BGZF *fp; | |
1220 uint32_t x[3], n; | |
1221 uint8_t *meta = 0; | |
1222 if ((fp = bgzf_open(fn, "r")) == 0) return NULL; | |
1223 if (bgzf_read(fp, magic, 4) != 4) goto csi_fail; | |
1224 if (memcmp(magic, "CSI\1", 4) != 0) goto csi_fail; | |
1225 if (bgzf_read(fp, x, 12) != 12) goto csi_fail; | |
1226 if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]); | |
1227 if (x[2]) { | |
1228 if ((meta = (uint8_t*)malloc(x[2])) == NULL) goto csi_fail; | |
1229 if (bgzf_read(fp, meta, x[2]) != x[2]) goto csi_fail; | |
1230 } | |
1231 if (bgzf_read(fp, &n, 4) != 4) goto csi_fail; | |
1232 if (is_be) ed_swap_4p(&n); | |
1233 if ((idx = hts_idx_init(n, fmt, 0, x[0], x[1])) == NULL) goto csi_fail; | |
1234 idx->l_meta = x[2]; | |
1235 idx->meta = meta; | |
1236 meta = NULL; | |
1237 if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto csi_fail; | |
1238 bgzf_close(fp); | |
1239 return idx; | |
1240 | |
1241 csi_fail: | |
1242 bgzf_close(fp); | |
1243 hts_idx_destroy(idx); | |
1244 free(meta); | |
1245 return NULL; | |
1246 | |
1247 } else if (fmt == HTS_FMT_TBI) { | |
1248 BGZF *fp; | |
1249 uint32_t x[8]; | |
1250 if ((fp = bgzf_open(fn, "r")) == 0) return NULL; | |
1251 if (bgzf_read(fp, magic, 4) != 4) goto tbi_fail; | |
1252 if (memcmp(magic, "TBI\1", 4) != 0) goto tbi_fail; | |
1253 if (bgzf_read(fp, x, 32) != 32) goto tbi_fail; | |
1254 if (is_be) for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]); | |
1255 if ((idx = hts_idx_init(x[0], fmt, 0, 14, 5)) == NULL) goto tbi_fail; | |
1256 idx->l_meta = 28 + x[7]; | |
1257 if ((idx->meta = (uint8_t*)malloc(idx->l_meta)) == NULL) goto tbi_fail; | |
1258 memcpy(idx->meta, &x[1], 28); | |
1259 if (bgzf_read(fp, idx->meta + 28, x[7]) != x[7]) goto tbi_fail; | |
1260 if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto tbi_fail; | |
1261 bgzf_close(fp); | |
1262 return idx; | |
1263 | |
1264 tbi_fail: | |
1265 bgzf_close(fp); | |
1266 hts_idx_destroy(idx); | |
1267 return NULL; | |
1268 | |
1269 } else if (fmt == HTS_FMT_BAI) { | |
1270 uint32_t n; | |
1271 FILE *fp; | |
1272 if ((fp = fopen(fn, "rb")) == 0) return NULL; | |
1273 if (fread(magic, 1, 4, fp) != 4) goto bai_fail; | |
1274 if (memcmp(magic, "BAI\1", 4) != 0) goto bai_fail; | |
1275 if (fread(&n, 4, 1, fp) != 1) goto bai_fail; | |
1276 if (is_be) ed_swap_4p(&n); | |
1277 idx = hts_idx_init(n, fmt, 0, 14, 5); | |
1278 if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto bai_fail; | |
1279 fclose(fp); | |
1280 return idx; | |
1281 | |
1282 bai_fail: | |
1283 fclose(fp); | |
1284 hts_idx_destroy(idx); | |
1285 return NULL; | |
1286 | |
1287 } else abort(); | |
1288 } | |
1289 | |
1290 void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) | |
1291 { | |
1292 if (idx->meta) free(idx->meta); | |
1293 idx->l_meta = l_meta; | |
1294 if (is_copy) { | |
1295 idx->meta = (uint8_t*)malloc(l_meta); | |
1296 memcpy(idx->meta, meta, l_meta); | |
1297 } else idx->meta = meta; | |
1298 } | |
1299 | |
1300 uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta) | |
1301 { | |
1302 *l_meta = idx->l_meta; | |
1303 return idx->meta; | |
1304 } | |
1305 | |
1306 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) | |
1307 { | |
1308 if ( !idx->n ) | |
1309 { | |
1310 *n = 0; | |
1311 return NULL; | |
1312 } | |
1313 | |
1314 int tid = 0, i; | |
1315 const char **names = (const char**) calloc(idx->n,sizeof(const char*)); | |
1316 for (i=0; i<idx->n; i++) | |
1317 { | |
1318 bidx_t *bidx = idx->bidx[i]; | |
1319 if ( !bidx ) continue; | |
1320 names[tid++] = getid(hdr,i); | |
1321 } | |
1322 *n = tid; | |
1323 return names; | |
1324 } | |
1325 | |
1326 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped) | |
1327 { | |
1328 if ( idx->fmt == HTS_FMT_CRAI ) { | |
1329 *mapped = 0; *unmapped = 0; | |
1330 return -1; | |
1331 } | |
1332 | |
1333 bidx_t *h = idx->bidx[tid]; | |
1334 khint_t k = kh_get(bin, h, META_BIN(idx)); | |
1335 if (k != kh_end(h)) { | |
1336 *mapped = kh_val(h, k).list[1].u; | |
1337 *unmapped = kh_val(h, k).list[1].v; | |
1338 return 0; | |
1339 } else { | |
1340 *mapped = 0; *unmapped = 0; | |
1341 return -1; | |
1342 } | |
1343 } | |
1344 | |
1345 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) | |
1346 { | |
1347 return idx->n_no_coor; | |
1348 } | |
1349 | |
1350 /**************** | |
1351 *** Iterator *** | |
1352 ****************/ | |
1353 | |
1354 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls) | |
1355 { | |
1356 int l, t, s = min_shift + (n_lvls<<1) + n_lvls; | |
1357 if (beg >= end) return 0; | |
1358 if (end >= 1LL<<s) end = 1LL<<s; | |
1359 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { | |
1360 int b, e, n, i; | |
1361 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1; | |
1362 if (itr->bins.n + n > itr->bins.m) { | |
1363 itr->bins.m = itr->bins.n + n; | |
1364 kroundup32(itr->bins.m); | |
1365 itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m); | |
1366 } | |
1367 for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i; | |
1368 } | |
1369 return itr->bins.n; | |
1370 } | |
1371 | |
1372 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) | |
1373 { | |
1374 int i, n_off, l, bin; | |
1375 hts_pair64_t *off; | |
1376 khint_t k; | |
1377 bidx_t *bidx; | |
1378 uint64_t min_off; | |
1379 hts_itr_t *iter = 0; | |
1380 if (tid < 0) { | |
1381 int finished0 = 0; | |
1382 uint64_t off0 = (uint64_t)-1; | |
1383 khint_t k; | |
1384 switch (tid) { | |
1385 case HTS_IDX_START: | |
1386 // Find the smallest offset, note that sequence ids may not be ordered sequentially | |
1387 for (i=0; i<idx->n; i++) | |
1388 { | |
1389 bidx = idx->bidx[i]; | |
1390 k = kh_get(bin, bidx, META_BIN(idx)); | |
1391 if (k == kh_end(bidx)) continue; | |
1392 if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u; | |
1393 } | |
1394 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam | |
1395 break; | |
1396 | |
1397 case HTS_IDX_NOCOOR: | |
1398 if ( idx->n>0 ) | |
1399 { | |
1400 bidx = idx->bidx[idx->n - 1]; | |
1401 k = kh_get(bin, bidx, META_BIN(idx)); | |
1402 if (k != kh_end(bidx)) off0 = kh_val(bidx, k).list[0].v; | |
1403 } | |
1404 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam | |
1405 break; | |
1406 | |
1407 case HTS_IDX_REST: | |
1408 off0 = 0; | |
1409 break; | |
1410 | |
1411 case HTS_IDX_NONE: | |
1412 finished0 = 1; | |
1413 off0 = 0; | |
1414 break; | |
1415 | |
1416 default: | |
1417 return 0; | |
1418 } | |
1419 if (off0 != (uint64_t)-1) { | |
1420 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); | |
1421 iter->read_rest = 1; | |
1422 iter->finished = finished0; | |
1423 iter->curr_off = off0; | |
1424 iter->readrec = readrec; | |
1425 return iter; | |
1426 } else return 0; | |
1427 } | |
1428 | |
1429 if (beg < 0) beg = 0; | |
1430 if (end < beg) return 0; | |
1431 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0; | |
1432 | |
1433 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); | |
1434 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; | |
1435 iter->readrec = readrec; | |
1436 | |
1437 // compute min_off | |
1438 bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); | |
1439 do { | |
1440 int first; | |
1441 k = kh_get(bin, bidx, bin); | |
1442 if (k != kh_end(bidx)) break; | |
1443 first = (hts_bin_parent(bin)<<3) + 1; | |
1444 if (bin > first) --bin; | |
1445 else bin = hts_bin_parent(bin); | |
1446 } while (bin); | |
1447 if (bin == 0) k = kh_get(bin, bidx, bin); | |
1448 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; | |
1449 // retrieve bins | |
1450 reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); | |
1451 for (i = n_off = 0; i < iter->bins.n; ++i) | |
1452 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) | |
1453 n_off += kh_value(bidx, k).n; | |
1454 if (n_off == 0) return iter; | |
1455 off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t)); | |
1456 for (i = n_off = 0; i < iter->bins.n; ++i) { | |
1457 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) { | |
1458 int j; | |
1459 bins_t *p = &kh_value(bidx, k); | |
1460 for (j = 0; j < p->n; ++j) | |
1461 if (p->list[j].v > min_off) off[n_off++] = p->list[j]; | |
1462 } | |
1463 } | |
1464 if (n_off == 0) { | |
1465 free(off); return iter; | |
1466 } | |
1467 ks_introsort(_off, n_off, off); | |
1468 // resolve completely contained adjacent blocks | |
1469 for (i = 1, l = 0; i < n_off; ++i) | |
1470 if (off[l].v < off[i].v) off[++l] = off[i]; | |
1471 n_off = l + 1; | |
1472 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing | |
1473 for (i = 1; i < n_off; ++i) | |
1474 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; | |
1475 // merge adjacent blocks | |
1476 for (i = 1, l = 0; i < n_off; ++i) { | |
1477 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; | |
1478 else off[++l] = off[i]; | |
1479 } | |
1480 n_off = l + 1; | |
1481 iter->n_off = n_off; iter->off = off; | |
1482 return iter; | |
1483 } | |
1484 | |
1485 void hts_itr_destroy(hts_itr_t *iter) | |
1486 { | |
1487 if (iter) { free(iter->off); free(iter->bins.a); free(iter); } | |
1488 } | |
1489 | |
1490 const char *hts_parse_reg(const char *s, int *beg, int *end) | |
1491 { | |
1492 int i, k, l, name_end; | |
1493 *beg = *end = -1; | |
1494 name_end = l = strlen(s); | |
1495 // determine the sequence name | |
1496 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end | |
1497 if (i >= 0) name_end = i; | |
1498 if (name_end < l) { // check if this is really the end | |
1499 int n_hyphen = 0; | |
1500 for (i = name_end + 1; i < l; ++i) { | |
1501 if (s[i] == '-') ++n_hyphen; | |
1502 else if (!isdigit(s[i]) && s[i] != ',') break; | |
1503 } | |
1504 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name | |
1505 } | |
1506 // parse the interval | |
1507 if (name_end < l) { | |
1508 char *tmp; | |
1509 tmp = (char*)alloca(l - name_end + 1); | |
1510 for (i = name_end + 1, k = 0; i < l; ++i) | |
1511 if (s[i] != ',') tmp[k++] = s[i]; | |
1512 tmp[k] = 0; | |
1513 if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0; | |
1514 *end = *tmp? strtol(tmp + 1, &tmp, 10) : INT_MAX; | |
1515 if (*beg > *end) name_end = l; | |
1516 } | |
1517 if (name_end == l) *beg = 0, *end = INT_MAX; | |
1518 return s + name_end; | |
1519 } | |
1520 | |
1521 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) | |
1522 { | |
1523 int tid, beg, end; | |
1524 char *q, *tmp; | |
1525 if (strcmp(reg, ".") == 0) | |
1526 return itr_query(idx, HTS_IDX_START, 0, 0, readrec); | |
1527 else if (strcmp(reg, "*") != 0) { | |
1528 q = (char*)hts_parse_reg(reg, &beg, &end); | |
1529 tmp = (char*)alloca(q - reg + 1); | |
1530 strncpy(tmp, reg, q - reg); | |
1531 tmp[q - reg] = 0; | |
1532 if ((tid = getid(hdr, tmp)) < 0) | |
1533 tid = getid(hdr, reg); | |
1534 if (tid < 0) return 0; | |
1535 return itr_query(idx, tid, beg, end, readrec); | |
1536 } else return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); | |
1537 } | |
1538 | |
1539 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) | |
1540 { | |
1541 int ret, tid, beg, end; | |
1542 if (iter == NULL || iter->finished) return -1; | |
1543 if (iter->read_rest) { | |
1544 if (iter->curr_off) { // seek to the start | |
1545 bgzf_seek(fp, iter->curr_off, SEEK_SET); | |
1546 iter->curr_off = 0; // only seek once | |
1547 } | |
1548 ret = iter->readrec(fp, data, r, &tid, &beg, &end); | |
1549 if (ret < 0) iter->finished = 1; | |
1550 iter->curr_tid = tid; | |
1551 iter->curr_beg = beg; | |
1552 iter->curr_end = end; | |
1553 return ret; | |
1554 } | |
1555 if (iter->off == 0) return -1; | |
1556 for (;;) { | |
1557 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk | |
1558 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks | |
1559 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek | |
1560 bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET); | |
1561 iter->curr_off = bgzf_tell(fp); | |
1562 } | |
1563 ++iter->i; | |
1564 } | |
1565 if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) { | |
1566 iter->curr_off = bgzf_tell(fp); | |
1567 if (tid != iter->tid || beg >= iter->end) { // no need to proceed | |
1568 ret = -1; break; | |
1569 } else if (end > iter->beg && iter->end > beg) { | |
1570 iter->curr_tid = tid; | |
1571 iter->curr_beg = beg; | |
1572 iter->curr_end = end; | |
1573 return ret; | |
1574 } | |
1575 } else break; // end of file or error | |
1576 } | |
1577 iter->finished = 1; | |
1578 return ret; | |
1579 } | |
1580 | |
1581 /********************** | |
1582 *** Retrieve index *** | |
1583 **********************/ | |
1584 | |
1585 static char *test_and_fetch(const char *fn) | |
1586 { | |
1587 FILE *fp; | |
1588 if (hisremote(fn)) { | |
1589 const int buf_size = 1 * 1024 * 1024; | |
1590 hFILE *fp_remote; | |
1591 uint8_t *buf; | |
1592 int l; | |
1593 const char *p; | |
1594 for (p = fn + strlen(fn) - 1; p >= fn; --p) | |
1595 if (*p == '/') break; | |
1596 ++p; // p now points to the local file name | |
1597 // Attempt to open local file first | |
1598 if ((fp = fopen((char*)p, "rb")) != 0) | |
1599 { | |
1600 fclose(fp); | |
1601 return (char*)p; | |
1602 } | |
1603 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index. | |
1604 if ((fp_remote = hopen(fn, "r")) == 0) return 0; | |
1605 if ((fp = fopen(p, "w")) == 0) { | |
1606 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p); | |
1607 hclose_abruptly(fp_remote); | |
1608 return 0; | |
1609 } | |
1610 if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn); | |
1611 buf = (uint8_t*)calloc(buf_size, 1); | |
1612 while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp); | |
1613 free(buf); | |
1614 fclose(fp); | |
1615 if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn); | |
1616 return (char*)p; | |
1617 } else { | |
1618 if ((fp = fopen(fn, "rb")) == 0) return 0; | |
1619 fclose(fp); | |
1620 return (char*)fn; | |
1621 } | |
1622 } | |
1623 | |
1624 char *hts_idx_getfn(const char *fn, const char *ext) | |
1625 { | |
1626 int i, l_fn, l_ext; | |
1627 char *fnidx, *ret; | |
1628 l_fn = strlen(fn); l_ext = strlen(ext); | |
1629 fnidx = (char*)calloc(l_fn + l_ext + 1, 1); | |
1630 strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext); | |
1631 if ((ret = test_and_fetch(fnidx)) == 0) { | |
1632 for (i = l_fn - 1; i > 0; --i) | |
1633 if (fnidx[i] == '.') break; | |
1634 strcpy(fnidx + i, ext); | |
1635 ret = test_and_fetch(fnidx); | |
1636 } | |
1637 if (ret == 0) { | |
1638 free(fnidx); | |
1639 return 0; | |
1640 } | |
1641 l_fn = strlen(ret); | |
1642 memmove(fnidx, ret, l_fn + 1); | |
1643 return fnidx; | |
1644 } | |
1645 | |
1646 hts_idx_t *hts_idx_load(const char *fn, int fmt) | |
1647 { | |
1648 char *fnidx; | |
1649 hts_idx_t *idx; | |
1650 fnidx = hts_idx_getfn(fn, ".csi"); | |
1651 if (fnidx) fmt = HTS_FMT_CSI; | |
1652 else fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi"); | |
1653 if (fnidx == 0) return 0; | |
1654 | |
1655 // Check that the index file is up to date, the main file might have changed | |
1656 struct stat stat_idx,stat_main; | |
1657 if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) ) | |
1658 { | |
1659 if ( stat_idx.st_mtime < stat_main.st_mtime ) | |
1660 fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx); | |
1661 } | |
1662 idx = hts_idx_load_local(fnidx, fmt); | |
1663 free(fnidx); | |
1664 return idx; | |
1665 } |