|
0
|
1 #include <ctype.h>
|
|
|
2 #include <string.h>
|
|
|
3 #include <stdlib.h>
|
|
|
4 #include <stdio.h>
|
|
|
5 #include "faidx.h"
|
|
|
6 #include "khash.h"
|
|
|
7
|
|
|
8 typedef struct {
|
|
|
9 uint64_t len:32, line_len:16, line_blen:16;
|
|
|
10 uint64_t offset;
|
|
|
11 } faidx1_t;
|
|
|
12 KHASH_MAP_INIT_STR(s, faidx1_t)
|
|
|
13
|
|
|
14 #ifndef _NO_RAZF
|
|
|
15 #include "razf.h"
|
|
|
16 #else
|
|
|
17 #ifdef _WIN32
|
|
|
18 #define ftello(fp) ftell(fp)
|
|
|
19 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
|
|
|
20 #else
|
|
|
21 extern off_t ftello(FILE *stream);
|
|
|
22 extern int fseeko(FILE *stream, off_t offset, int whence);
|
|
|
23 #endif
|
|
|
24 #define RAZF FILE
|
|
|
25 #define razf_read(fp, buf, size) fread(buf, 1, size, fp)
|
|
|
26 #define razf_open(fn, mode) fopen(fn, mode)
|
|
|
27 #define razf_close(fp) fclose(fp)
|
|
|
28 #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
|
|
|
29 #define razf_tell(fp) ftello(fp)
|
|
|
30 #endif
|
|
|
31
|
|
|
32 struct __faidx_t {
|
|
|
33 RAZF *rz;
|
|
|
34 int n, m;
|
|
|
35 char **name;
|
|
|
36 khash_t(s) *hash;
|
|
|
37 };
|
|
|
38
|
|
|
39 #ifndef kroundup32
|
|
|
40 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
|
|
41 #endif
|
|
|
42
|
|
|
43 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
|
|
|
44 {
|
|
|
45 khint_t k;
|
|
|
46 int ret;
|
|
|
47 faidx1_t t;
|
|
|
48 if (idx->n == idx->m) {
|
|
|
49 idx->m = idx->m? idx->m<<1 : 16;
|
|
|
50 idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
|
|
|
51 }
|
|
|
52 idx->name[idx->n] = strdup(name);
|
|
|
53 k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
|
|
|
54 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
|
|
|
55 kh_value(idx->hash, k) = t;
|
|
|
56 ++idx->n;
|
|
|
57 }
|
|
|
58
|
|
|
59 faidx_t *fai_build_core(RAZF *rz)
|
|
|
60 {
|
|
|
61 char c, *name;
|
|
|
62 int l_name, m_name, ret;
|
|
|
63 int len, line_len, line_blen, state;
|
|
|
64 int l1, l2;
|
|
|
65 faidx_t *idx;
|
|
|
66 uint64_t offset;
|
|
|
67
|
|
|
68 idx = (faidx_t*)calloc(1, sizeof(faidx_t));
|
|
|
69 idx->hash = kh_init(s);
|
|
|
70 name = 0; l_name = m_name = 0;
|
|
|
71 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
|
|
|
72 while (razf_read(rz, &c, 1)) {
|
|
|
73 if (c == '\n') { // an empty line
|
|
|
74 if (state == 1) {
|
|
|
75 offset = razf_tell(rz);
|
|
|
76 continue;
|
|
|
77 } else if ((state == 0 && len < 0) || state == 2) continue;
|
|
|
78 }
|
|
|
79 if (c == '>') { // fasta header
|
|
|
80 if (len >= 0)
|
|
|
81 fai_insert_index(idx, name, len, line_len, line_blen, offset);
|
|
|
82 l_name = 0;
|
|
|
83 while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {
|
|
|
84 if (m_name < l_name + 2) {
|
|
|
85 m_name = l_name + 2;
|
|
|
86 kroundup32(m_name);
|
|
|
87 name = (char*)realloc(name, m_name);
|
|
|
88 }
|
|
|
89 name[l_name++] = c;
|
|
|
90 }
|
|
|
91 name[l_name] = '\0';
|
|
|
92 if (ret == 0) {
|
|
|
93 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
|
|
|
94 free(name); fai_destroy(idx);
|
|
|
95 return 0;
|
|
|
96 }
|
|
|
97 if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
|
|
|
98 state = 1; len = 0;
|
|
|
99 offset = razf_tell(rz);
|
|
|
100 } else {
|
|
|
101 if (state == 3) {
|
|
|
102 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
|
|
|
103 free(name); fai_destroy(idx);
|
|
|
104 return 0;
|
|
|
105 }
|
|
|
106 if (state == 2) state = 3;
|
|
|
107 l1 = l2 = 0;
|
|
|
108 do {
|
|
|
109 ++l1;
|
|
|
110 if (isgraph(c)) ++l2;
|
|
|
111 } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
|
|
|
112 if (state == 3 && l2) {
|
|
|
113 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
|
|
|
114 free(name); fai_destroy(idx);
|
|
|
115 return 0;
|
|
|
116 }
|
|
|
117 ++l1; len += l2;
|
|
|
118 if (l2 >= 0x10000) {
|
|
|
119 fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name);
|
|
|
120 free(name); fai_destroy(idx);
|
|
|
121 return 0;
|
|
|
122 }
|
|
|
123 if (state == 1) line_len = l1, line_blen = l2, state = 0;
|
|
|
124 else if (state == 0) {
|
|
|
125 if (l1 != line_len || l2 != line_blen) state = 2;
|
|
|
126 }
|
|
|
127 }
|
|
|
128 }
|
|
|
129 fai_insert_index(idx, name, len, line_len, line_blen, offset);
|
|
|
130 free(name);
|
|
|
131 return idx;
|
|
|
132 }
|
|
|
133
|
|
|
134 void fai_save(const faidx_t *fai, FILE *fp)
|
|
|
135 {
|
|
|
136 khint_t k;
|
|
|
137 int i;
|
|
|
138 for (i = 0; i < fai->n; ++i) {
|
|
|
139 faidx1_t x;
|
|
|
140 k = kh_get(s, fai->hash, fai->name[i]);
|
|
|
141 x = kh_value(fai->hash, k);
|
|
|
142 #ifdef _WIN32
|
|
|
143 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
|
|
|
144 #else
|
|
|
145 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
|
|
|
146 #endif
|
|
|
147 }
|
|
|
148 }
|
|
|
149
|
|
|
150 faidx_t *fai_read(FILE *fp)
|
|
|
151 {
|
|
|
152 faidx_t *fai;
|
|
|
153 char *buf, *p;
|
|
|
154 int len, line_len, line_blen;
|
|
|
155 #ifdef _WIN32
|
|
|
156 long offset;
|
|
|
157 #else
|
|
|
158 long long offset;
|
|
|
159 #endif
|
|
|
160 fai = (faidx_t*)calloc(1, sizeof(faidx_t));
|
|
|
161 fai->hash = kh_init(s);
|
|
|
162 buf = (char*)calloc(0x10000, 1);
|
|
|
163 while (!feof(fp) && fgets(buf, 0x10000, fp)) {
|
|
|
164 for (p = buf; *p && isgraph(*p); ++p);
|
|
|
165 *p = 0; ++p;
|
|
|
166 #ifdef _WIN32
|
|
|
167 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
|
|
|
168 #else
|
|
|
169 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
|
|
|
170 #endif
|
|
|
171 fai_insert_index(fai, buf, len, line_len, line_blen, offset);
|
|
|
172 }
|
|
|
173 free(buf);
|
|
|
174 return fai;
|
|
|
175 }
|
|
|
176
|
|
|
177 void fai_destroy(faidx_t *fai)
|
|
|
178 {
|
|
|
179 int i;
|
|
|
180 for (i = 0; i < fai->n; ++i) free(fai->name[i]);
|
|
|
181 free(fai->name);
|
|
|
182 kh_destroy(s, fai->hash);
|
|
|
183 if (fai->rz) razf_close(fai->rz);
|
|
|
184 free(fai);
|
|
|
185 }
|
|
|
186
|
|
|
187 int fai_build(const char *fn)
|
|
|
188 {
|
|
|
189 char *str;
|
|
|
190 RAZF *rz;
|
|
|
191 FILE *fp;
|
|
|
192 faidx_t *fai;
|
|
|
193 str = (char*)calloc(strlen(fn) + 5, 1);
|
|
|
194 sprintf(str, "%s.fai", fn);
|
|
|
195 rz = razf_open(fn, "r");
|
|
|
196 if (rz == 0) {
|
|
|
197 fprintf(stderr, "[fai_build] fail to open the FASTA file.\n");
|
|
|
198 free(str);
|
|
|
199 return -1;
|
|
|
200 }
|
|
|
201 fai = fai_build_core(rz);
|
|
|
202 razf_close(rz);
|
|
|
203 fp = fopen(str, "wb");
|
|
|
204 if (fp == 0) {
|
|
|
205 fprintf(stderr, "[fai_build] fail to write FASTA index.\n");
|
|
|
206 fai_destroy(fai); free(str);
|
|
|
207 return -1;
|
|
|
208 }
|
|
|
209 fai_save(fai, fp);
|
|
|
210 fclose(fp);
|
|
|
211 free(str);
|
|
|
212 fai_destroy(fai);
|
|
|
213 return 0;
|
|
|
214 }
|
|
|
215
|
|
|
216 faidx_t *fai_load(const char *fn)
|
|
|
217 {
|
|
|
218 char *str;
|
|
|
219 FILE *fp;
|
|
|
220 faidx_t *fai;
|
|
|
221 str = (char*)calloc(strlen(fn) + 5, 1);
|
|
|
222 sprintf(str, "%s.fai", fn);
|
|
|
223 fp = fopen(str, "rb");
|
|
|
224 if (fp == 0) {
|
|
|
225 fprintf(stderr, "[fai_load] build FASTA index.\n");
|
|
|
226 fai_build(fn);
|
|
|
227 fp = fopen(str, "r");
|
|
|
228 if (fp == 0) {
|
|
|
229 fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
|
|
|
230 free(str);
|
|
|
231 return 0;
|
|
|
232 }
|
|
|
233 }
|
|
|
234 fai = fai_read(fp);
|
|
|
235 fclose(fp);
|
|
|
236 fai->rz = razf_open(fn, "rb");
|
|
|
237 free(str);
|
|
|
238 if (fai->rz == 0) {
|
|
|
239 fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
|
|
|
240 return 0;
|
|
|
241 }
|
|
|
242 return fai;
|
|
|
243 }
|
|
|
244
|
|
|
245 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
|
|
|
246 {
|
|
|
247 char *s, *p, c;
|
|
|
248 int i, l, k;
|
|
|
249 khiter_t iter;
|
|
|
250 faidx1_t val;
|
|
|
251 khash_t(s) *h;
|
|
|
252 int beg, end;
|
|
|
253
|
|
|
254 beg = end = -1;
|
|
|
255 h = fai->hash;
|
|
|
256 l = strlen(str);
|
|
|
257 p = s = (char*)malloc(l+1);
|
|
|
258 /* squeeze out "," */
|
|
|
259 for (i = k = 0; i != l; ++i)
|
|
|
260 if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
|
|
|
261 s[k] = 0;
|
|
|
262 for (i = 0; i != k; ++i) if (s[i] == ':') break;
|
|
|
263 s[i] = 0;
|
|
|
264 iter = kh_get(s, h, s); /* get the ref_id */
|
|
|
265 if (iter == kh_end(h)) {
|
|
|
266 *len = 0;
|
|
|
267 free(s); return 0;
|
|
|
268 }
|
|
|
269 val = kh_value(h, iter);
|
|
|
270 if (i == k) { /* dump the whole sequence */
|
|
|
271 beg = 0; end = val.len;
|
|
|
272 } else {
|
|
|
273 for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
|
|
|
274 beg = atoi(p);
|
|
|
275 if (i < k) {
|
|
|
276 p = s + i + 1;
|
|
|
277 end = atoi(p);
|
|
|
278 } else end = val.len;
|
|
|
279 }
|
|
|
280 if (beg > 0) --beg;
|
|
|
281 if (beg >= val.len) beg = val.len;
|
|
|
282 if (end >= val.len) end = val.len;
|
|
|
283 if (beg > end) beg = end;
|
|
|
284 free(s);
|
|
|
285
|
|
|
286 // now retrieve the sequence
|
|
|
287 l = 0;
|
|
|
288 s = (char*)malloc(end - beg + 2);
|
|
|
289 razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
|
|
|
290 while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg)
|
|
|
291 if (isgraph(c)) s[l++] = c;
|
|
|
292 s[l] = '\0';
|
|
|
293 *len = l;
|
|
|
294 return s;
|
|
|
295 }
|
|
|
296
|
|
|
297 int faidx_main(int argc, char *argv[])
|
|
|
298 {
|
|
|
299 if (argc == 1) {
|
|
|
300 fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
|
|
|
301 return 1;
|
|
|
302 } else {
|
|
|
303 if (argc == 2) fai_build(argv[1]);
|
|
|
304 else {
|
|
|
305 int i, j, k, l;
|
|
|
306 char *s;
|
|
|
307 faidx_t *fai;
|
|
|
308 fai = fai_load(argv[1]);
|
|
|
309 if (fai == 0) return 1;
|
|
|
310 for (i = 2; i != argc; ++i) {
|
|
|
311 printf(">%s\n", argv[i]);
|
|
|
312 s = fai_fetch(fai, argv[i], &l);
|
|
|
313 for (j = 0; j < l; j += 60) {
|
|
|
314 for (k = 0; k < 60 && k < l - j; ++k)
|
|
|
315 putchar(s[j + k]);
|
|
|
316 putchar('\n');
|
|
|
317 }
|
|
|
318 free(s);
|
|
|
319 }
|
|
|
320 fai_destroy(fai);
|
|
|
321 }
|
|
|
322 }
|
|
|
323 return 0;
|
|
|
324 }
|
|
|
325
|
|
|
326 #ifdef FAIDX_MAIN
|
|
|
327 int main(int argc, char *argv[]) { return faidx_main(argc, argv); }
|
|
|
328 #endif
|