Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/faidx.c @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* faidx.c -- FASTA random access. | |
| 2 | |
| 3 Copyright (C) 2008, 2009, 2013-2015 Genome Research Ltd. | |
| 4 Portions copyright (C) 2011 Broad Institute. | |
| 5 | |
| 6 Author: Heng Li <lh3@sanger.ac.uk> | |
| 7 | |
| 8 Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 9 of this software and associated documentation files (the "Software"), to deal | |
| 10 in the Software without restriction, including without limitation the rights | |
| 11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 12 copies of the Software, and to permit persons to whom the Software is | |
| 13 furnished to do so, subject to the following conditions: | |
| 14 | |
| 15 The above copyright notice and this permission notice shall be included in | |
| 16 all copies or substantial portions of the Software. | |
| 17 | |
| 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
| 21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
| 23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| 24 DEALINGS IN THE SOFTWARE. */ | |
| 25 | |
| 26 #include <ctype.h> | |
| 27 #include <string.h> | |
| 28 #include <stdlib.h> | |
| 29 #include <stdio.h> | |
| 30 #include <stdint.h> | |
| 31 | |
| 32 #include "htslib/bgzf.h" | |
| 33 #include "htslib/faidx.h" | |
| 34 #include "htslib/hfile.h" | |
| 35 #include "htslib/khash.h" | |
| 36 | |
| 37 typedef struct { | |
| 38 int32_t line_len, line_blen; | |
| 39 int64_t len; | |
| 40 uint64_t offset; | |
| 41 } faidx1_t; | |
| 42 KHASH_MAP_INIT_STR(s, faidx1_t) | |
| 43 | |
| 44 struct __faidx_t { | |
| 45 BGZF *bgzf; | |
| 46 int n, m; | |
| 47 char **name; | |
| 48 khash_t(s) *hash; | |
| 49 }; | |
| 50 | |
| 51 #ifndef kroundup32 | |
| 52 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
| 53 #endif | |
| 54 | |
| 55 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) | |
| 56 { | |
| 57 khint_t k; | |
| 58 int ret; | |
| 59 faidx1_t t; | |
| 60 if (idx->n == idx->m) { | |
| 61 idx->m = idx->m? idx->m<<1 : 16; | |
| 62 idx->name = (char**)realloc(idx->name, sizeof(char*) * idx->m); | |
| 63 } | |
| 64 idx->name[idx->n] = strdup(name); | |
| 65 k = kh_put(s, idx->hash, idx->name[idx->n], &ret); | |
| 66 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; | |
| 67 kh_value(idx->hash, k) = t; | |
| 68 ++idx->n; | |
| 69 } | |
| 70 | |
| 71 faidx_t *fai_build_core(BGZF *bgzf) | |
| 72 { | |
| 73 char *name; | |
| 74 int c; | |
| 75 int l_name, m_name; | |
| 76 int line_len, line_blen, state; | |
| 77 int l1, l2; | |
| 78 faidx_t *idx; | |
| 79 uint64_t offset; | |
| 80 int64_t len; | |
| 81 | |
| 82 idx = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
| 83 idx->hash = kh_init(s); | |
| 84 name = 0; l_name = m_name = 0; | |
| 85 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; | |
| 86 while ( (c=bgzf_getc(bgzf))>=0 ) { | |
| 87 if (c == '\n') { // an empty line | |
| 88 if (state == 1) { | |
| 89 offset = bgzf_utell(bgzf); | |
| 90 continue; | |
| 91 } else if ((state == 0 && len < 0) || state == 2) continue; | |
| 92 } | |
| 93 if (c == '>') { // fasta header | |
| 94 if (len >= 0) | |
| 95 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
| 96 l_name = 0; | |
| 97 while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) { | |
| 98 if (m_name < l_name + 2) { | |
| 99 m_name = l_name + 2; | |
| 100 kroundup32(m_name); | |
| 101 name = (char*)realloc(name, m_name); | |
| 102 } | |
| 103 name[l_name++] = c; | |
| 104 } | |
| 105 name[l_name] = '\0'; | |
| 106 if ( c<0 ) { | |
| 107 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); | |
| 108 free(name); fai_destroy(idx); | |
| 109 return 0; | |
| 110 } | |
| 111 if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n'); | |
| 112 state = 1; len = 0; | |
| 113 offset = bgzf_utell(bgzf); | |
| 114 } else { | |
| 115 if (state == 3) { | |
| 116 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); | |
| 117 free(name); fai_destroy(idx); | |
| 118 return 0; | |
| 119 } | |
| 120 if (state == 2) state = 3; | |
| 121 l1 = l2 = 0; | |
| 122 do { | |
| 123 ++l1; | |
| 124 if (isgraph(c)) ++l2; | |
| 125 } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n'); | |
| 126 if (state == 3 && l2) { | |
| 127 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); | |
| 128 free(name); fai_destroy(idx); | |
| 129 return 0; | |
| 130 } | |
| 131 ++l1; len += l2; | |
| 132 if (state == 1) line_len = l1, line_blen = l2, state = 0; | |
| 133 else if (state == 0) { | |
| 134 if (l1 != line_len || l2 != line_blen) state = 2; | |
| 135 } | |
| 136 } | |
| 137 } | |
| 138 if ( name ) | |
| 139 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
| 140 else | |
| 141 { | |
| 142 free(idx); | |
| 143 return NULL; | |
| 144 } | |
| 145 free(name); | |
| 146 return idx; | |
| 147 } | |
| 148 | |
| 149 void fai_save(const faidx_t *fai, FILE *fp) | |
| 150 { | |
| 151 khint_t k; | |
| 152 int i; | |
| 153 for (i = 0; i < fai->n; ++i) { | |
| 154 faidx1_t x; | |
| 155 k = kh_get(s, fai->hash, fai->name[i]); | |
| 156 x = kh_value(fai->hash, k); | |
| 157 #ifdef _WIN32 | |
| 158 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); | |
| 159 #else | |
| 160 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); | |
| 161 #endif | |
| 162 } | |
| 163 } | |
| 164 | |
| 165 faidx_t *fai_read(FILE *fp) | |
| 166 { | |
| 167 faidx_t *fai; | |
| 168 char *buf, *p; | |
| 169 int len, line_len, line_blen; | |
| 170 #ifdef _WIN32 | |
| 171 long offset; | |
| 172 #else | |
| 173 long long offset; | |
| 174 #endif | |
| 175 fai = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
| 176 fai->hash = kh_init(s); | |
| 177 buf = (char*)calloc(0x10000, 1); | |
| 178 while (!feof(fp) && fgets(buf, 0x10000, fp)) { | |
| 179 for (p = buf; *p && isgraph(*p); ++p); | |
| 180 *p = 0; ++p; | |
| 181 #ifdef _WIN32 | |
| 182 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); | |
| 183 #else | |
| 184 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); | |
| 185 #endif | |
| 186 fai_insert_index(fai, buf, len, line_len, line_blen, offset); | |
| 187 } | |
| 188 free(buf); | |
| 189 return fai; | |
| 190 } | |
| 191 | |
| 192 void fai_destroy(faidx_t *fai) | |
| 193 { | |
| 194 int i; | |
| 195 for (i = 0; i < fai->n; ++i) free(fai->name[i]); | |
| 196 free(fai->name); | |
| 197 kh_destroy(s, fai->hash); | |
| 198 if (fai->bgzf) bgzf_close(fai->bgzf); | |
| 199 free(fai); | |
| 200 } | |
| 201 | |
| 202 int fai_build(const char *fn) | |
| 203 { | |
| 204 char *str; | |
| 205 BGZF *bgzf; | |
| 206 FILE *fp; | |
| 207 faidx_t *fai; | |
| 208 str = (char*)calloc(strlen(fn) + 5, 1); | |
| 209 sprintf(str, "%s.fai", fn); | |
| 210 bgzf = bgzf_open(fn, "r"); | |
| 211 if ( !bgzf ) { | |
| 212 fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); | |
| 213 free(str); | |
| 214 return -1; | |
| 215 } | |
| 216 if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf); | |
| 217 fai = fai_build_core(bgzf); | |
| 218 if ( !fai ) | |
| 219 { | |
| 220 if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n"); | |
| 221 free(str); | |
| 222 return -1; | |
| 223 } | |
| 224 if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi"); | |
| 225 bgzf_close(bgzf); | |
| 226 fp = fopen(str, "wb"); | |
| 227 if ( !fp ) { | |
| 228 fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); | |
| 229 fai_destroy(fai); free(str); | |
| 230 return -1; | |
| 231 } | |
| 232 fai_save(fai, fp); | |
| 233 fclose(fp); | |
| 234 free(str); | |
| 235 fai_destroy(fai); | |
| 236 return 0; | |
| 237 } | |
| 238 | |
| 239 static FILE *download_and_open(const char *fn) | |
| 240 { | |
| 241 const int buf_size = 1 * 1024 * 1024; | |
| 242 uint8_t *buf; | |
| 243 FILE *fp; | |
| 244 hFILE *fp_remote; | |
| 245 const char *url = fn; | |
| 246 const char *p; | |
| 247 int l = strlen(fn); | |
| 248 for (p = fn + l - 1; p >= fn; --p) | |
| 249 if (*p == '/') break; | |
| 250 fn = p + 1; | |
| 251 | |
| 252 // First try to open a local copy | |
| 253 fp = fopen(fn, "r"); | |
| 254 if (fp) | |
| 255 return fp; | |
| 256 | |
| 257 // If failed, download from remote and open | |
| 258 fp_remote = hopen(url, "rb"); | |
| 259 if (fp_remote == 0) { | |
| 260 fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); | |
| 261 return NULL; | |
| 262 } | |
| 263 if ((fp = fopen(fn, "wb")) == 0) { | |
| 264 fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); | |
| 265 hclose_abruptly(fp_remote); | |
| 266 return NULL; | |
| 267 } | |
| 268 buf = (uint8_t*)calloc(buf_size, 1); | |
| 269 while ((l = hread(fp_remote, buf, buf_size)) > 0) | |
| 270 fwrite(buf, 1, l, fp); | |
| 271 free(buf); | |
| 272 fclose(fp); | |
| 273 if (hclose(fp_remote) != 0) | |
| 274 fprintf(stderr, "[download_from_remote] fail to close remote file %s\n", url); | |
| 275 | |
| 276 return fopen(fn, "r"); | |
| 277 } | |
| 278 | |
| 279 faidx_t *fai_load(const char *fn) | |
| 280 { | |
| 281 char *str; | |
| 282 FILE *fp; | |
| 283 faidx_t *fai; | |
| 284 str = (char*)calloc(strlen(fn) + 5, 1); | |
| 285 sprintf(str, "%s.fai", fn); | |
| 286 | |
| 287 if (hisremote(str)) | |
| 288 { | |
| 289 fp = download_and_open(str); | |
| 290 if ( !fp ) | |
| 291 { | |
| 292 fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); | |
| 293 free(str); | |
| 294 return 0; | |
| 295 } | |
| 296 } | |
| 297 else | |
| 298 fp = fopen(str, "rb"); | |
| 299 | |
| 300 if (fp == 0) { | |
| 301 fprintf(stderr, "[fai_load] build FASTA index.\n"); | |
| 302 fai_build(fn); | |
| 303 fp = fopen(str, "rb"); | |
| 304 if (fp == 0) { | |
| 305 fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); | |
| 306 free(str); | |
| 307 return 0; | |
| 308 } | |
| 309 } | |
| 310 | |
| 311 fai = fai_read(fp); | |
| 312 fclose(fp); | |
| 313 | |
| 314 fai->bgzf = bgzf_open(fn, "rb"); | |
| 315 free(str); | |
| 316 if (fai->bgzf == 0) { | |
| 317 fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); | |
| 318 return 0; | |
| 319 } | |
| 320 if ( fai->bgzf->is_compressed==1 ) | |
| 321 { | |
| 322 if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 ) | |
| 323 { | |
| 324 fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn); | |
| 325 fai_destroy(fai); | |
| 326 return NULL; | |
| 327 } | |
| 328 } | |
| 329 return fai; | |
| 330 } | |
| 331 | |
| 332 char *fai_fetch(const faidx_t *fai, const char *str, int *len) | |
| 333 { | |
| 334 char *s; | |
| 335 int c, i, l, k, name_end; | |
| 336 khiter_t iter; | |
| 337 faidx1_t val; | |
| 338 khash_t(s) *h; | |
| 339 int beg, end; | |
| 340 | |
| 341 beg = end = -1; | |
| 342 h = fai->hash; | |
| 343 name_end = l = strlen(str); | |
| 344 s = (char*)malloc(l+1); | |
| 345 // remove space | |
| 346 for (i = k = 0; i < l; ++i) | |
| 347 if (!isspace(str[i])) s[k++] = str[i]; | |
| 348 s[k] = 0; l = k; | |
| 349 // determine the sequence name | |
| 350 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end | |
| 351 if (i >= 0) name_end = i; | |
| 352 if (name_end < l) { // check if this is really the end | |
| 353 int n_hyphen = 0; | |
| 354 for (i = name_end + 1; i < l; ++i) { | |
| 355 if (s[i] == '-') ++n_hyphen; | |
| 356 else if (!isdigit(s[i]) && s[i] != ',') break; | |
| 357 } | |
| 358 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name | |
| 359 s[name_end] = 0; | |
| 360 iter = kh_get(s, h, s); | |
| 361 if (iter == kh_end(h)) { // cannot find the sequence name | |
| 362 iter = kh_get(s, h, str); // try str as the name | |
| 363 if (iter == kh_end(h)) { | |
| 364 *len = 0; | |
| 365 free(s); return 0; | |
| 366 } else s[name_end] = ':', name_end = l; | |
| 367 } | |
| 368 } else iter = kh_get(s, h, str); | |
| 369 if(iter == kh_end(h)) { | |
| 370 fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); | |
| 371 free(s); | |
| 372 *len = -2; | |
| 373 return 0; | |
| 374 }; | |
| 375 val = kh_value(h, iter); | |
| 376 // parse the interval | |
| 377 if (name_end < l) { | |
| 378 for (i = k = name_end + 1; i < l; ++i) | |
| 379 if (s[i] != ',') s[k++] = s[i]; | |
| 380 s[k] = 0; | |
| 381 beg = atoi(s + name_end + 1); | |
| 382 for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; | |
| 383 end = i < k? atoi(s + i + 1) : val.len; | |
| 384 if (beg > 0) --beg; | |
| 385 } else beg = 0, end = val.len; | |
| 386 if (beg >= val.len) beg = val.len; | |
| 387 if (end >= val.len) end = val.len; | |
| 388 if (beg > end) beg = end; | |
| 389 free(s); | |
| 390 | |
| 391 // now retrieve the sequence | |
| 392 int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); | |
| 393 if ( ret<0 ) | |
| 394 { | |
| 395 *len = -1; | |
| 396 fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n"); | |
| 397 return NULL; | |
| 398 } | |
| 399 l = 0; | |
| 400 s = (char*)malloc(end - beg + 2); | |
| 401 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg ) | |
| 402 if (isgraph(c)) s[l++] = c; | |
| 403 s[l] = '\0'; | |
| 404 *len = l; | |
| 405 return s; | |
| 406 } | |
| 407 | |
| 408 int faidx_fetch_nseq(const faidx_t *fai) | |
| 409 { | |
| 410 return fai->n; | |
| 411 } | |
| 412 | |
| 413 int faidx_nseq(const faidx_t *fai) | |
| 414 { | |
| 415 return fai->n; | |
| 416 } | |
| 417 | |
| 418 const char *faidx_iseq(const faidx_t *fai, int i) | |
| 419 { | |
| 420 return fai->name[i]; | |
| 421 } | |
| 422 | |
| 423 int faidx_seq_len(const faidx_t *fai, const char *seq) | |
| 424 { | |
| 425 khint_t k = kh_get(s, fai->hash, seq); | |
| 426 if ( k == kh_end(fai->hash) ) return -1; | |
| 427 return kh_val(fai->hash, k).len; | |
| 428 } | |
| 429 | |
| 430 char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) | |
| 431 { | |
| 432 int l, c; | |
| 433 khiter_t iter; | |
| 434 faidx1_t val; | |
| 435 char *seq=NULL; | |
| 436 | |
| 437 // Adjust position | |
| 438 iter = kh_get(s, fai->hash, c_name); | |
| 439 if (iter == kh_end(fai->hash)) | |
| 440 { | |
| 441 *len = -2; | |
| 442 fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name); | |
| 443 return NULL; | |
| 444 } | |
| 445 val = kh_value(fai->hash, iter); | |
| 446 if(p_end_i < p_beg_i) p_beg_i = p_end_i; | |
| 447 if(p_beg_i < 0) p_beg_i = 0; | |
| 448 else if(val.len <= p_beg_i) p_beg_i = val.len - 1; | |
| 449 if(p_end_i < 0) p_end_i = 0; | |
| 450 else if(val.len <= p_end_i) p_end_i = val.len - 1; | |
| 451 | |
| 452 // Now retrieve the sequence | |
| 453 int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); | |
| 454 if ( ret<0 ) | |
| 455 { | |
| 456 *len = -1; | |
| 457 fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n"); | |
| 458 return NULL; | |
| 459 } | |
| 460 l = 0; | |
| 461 seq = (char*)malloc(p_end_i - p_beg_i + 2); | |
| 462 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1) | |
| 463 if (isgraph(c)) seq[l++] = c; | |
| 464 seq[l] = '\0'; | |
| 465 *len = l; | |
| 466 return seq; | |
| 467 } | |
| 468 | |
| 469 int faidx_has_seq(const faidx_t *fai, const char *seq) | |
| 470 { | |
| 471 khiter_t iter = kh_get(s, fai->hash, seq); | |
| 472 if (iter == kh_end(fai->hash)) return 0; | |
| 473 return 1; | |
| 474 } | |
| 475 |
