Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/samtools-0.1.16/faidx.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #include <ctype.h> | |
2 #include <string.h> | |
3 #include <stdlib.h> | |
4 #include <stdio.h> | |
5 #include <stdint.h> | |
6 #include "faidx.h" | |
7 #include "khash.h" | |
8 | |
9 typedef struct { | |
10 uint64_t len:32, line_len:16, line_blen:16; | |
11 uint64_t offset; | |
12 } faidx1_t; | |
13 KHASH_MAP_INIT_STR(s, faidx1_t) | |
14 | |
15 #ifndef _NO_RAZF | |
16 #include "razf.h" | |
17 #else | |
18 #ifdef _WIN32 | |
19 #define ftello(fp) ftell(fp) | |
20 #define fseeko(fp, offset, whence) fseek(fp, offset, whence) | |
21 #else | |
22 extern off_t ftello(FILE *stream); | |
23 extern int fseeko(FILE *stream, off_t offset, int whence); | |
24 #endif | |
25 #define RAZF FILE | |
26 #define razf_read(fp, buf, size) fread(buf, 1, size, fp) | |
27 #define razf_open(fn, mode) fopen(fn, mode) | |
28 #define razf_close(fp) fclose(fp) | |
29 #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) | |
30 #define razf_tell(fp) ftello(fp) | |
31 #endif | |
32 #ifdef _USE_KNETFILE | |
33 #include "knetfile.h" | |
34 #endif | |
35 | |
36 struct __faidx_t { | |
37 RAZF *rz; | |
38 int n, m; | |
39 char **name; | |
40 khash_t(s) *hash; | |
41 }; | |
42 | |
43 #ifndef kroundup32 | |
44 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
45 #endif | |
46 | |
47 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) | |
48 { | |
49 khint_t k; | |
50 int ret; | |
51 faidx1_t t; | |
52 if (idx->n == idx->m) { | |
53 idx->m = idx->m? idx->m<<1 : 16; | |
54 idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); | |
55 } | |
56 idx->name[idx->n] = strdup(name); | |
57 k = kh_put(s, idx->hash, idx->name[idx->n], &ret); | |
58 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; | |
59 kh_value(idx->hash, k) = t; | |
60 ++idx->n; | |
61 } | |
62 | |
63 faidx_t *fai_build_core(RAZF *rz) | |
64 { | |
65 char c, *name; | |
66 int l_name, m_name, ret; | |
67 int len, line_len, line_blen, state; | |
68 int l1, l2; | |
69 faidx_t *idx; | |
70 uint64_t offset; | |
71 | |
72 idx = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
73 idx->hash = kh_init(s); | |
74 name = 0; l_name = m_name = 0; | |
75 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; | |
76 while (razf_read(rz, &c, 1)) { | |
77 if (c == '\n') { // an empty line | |
78 if (state == 1) { | |
79 offset = razf_tell(rz); | |
80 continue; | |
81 } else if ((state == 0 && len < 0) || state == 2) continue; | |
82 } | |
83 if (c == '>') { // fasta header | |
84 if (len >= 0) | |
85 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
86 l_name = 0; | |
87 while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { | |
88 if (m_name < l_name + 2) { | |
89 m_name = l_name + 2; | |
90 kroundup32(m_name); | |
91 name = (char*)realloc(name, m_name); | |
92 } | |
93 name[l_name++] = c; | |
94 } | |
95 name[l_name] = '\0'; | |
96 if (ret == 0) { | |
97 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); | |
98 free(name); fai_destroy(idx); | |
99 return 0; | |
100 } | |
101 if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); | |
102 state = 1; len = 0; | |
103 offset = razf_tell(rz); | |
104 } else { | |
105 if (state == 3) { | |
106 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); | |
107 free(name); fai_destroy(idx); | |
108 return 0; | |
109 } | |
110 if (state == 2) state = 3; | |
111 l1 = l2 = 0; | |
112 do { | |
113 ++l1; | |
114 if (isgraph(c)) ++l2; | |
115 } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); | |
116 if (state == 3 && l2) { | |
117 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); | |
118 free(name); fai_destroy(idx); | |
119 return 0; | |
120 } | |
121 ++l1; len += l2; | |
122 if (l2 >= 0x10000) { | |
123 fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name); | |
124 free(name); fai_destroy(idx); | |
125 return 0; | |
126 } | |
127 if (state == 1) line_len = l1, line_blen = l2, state = 0; | |
128 else if (state == 0) { | |
129 if (l1 != line_len || l2 != line_blen) state = 2; | |
130 } | |
131 } | |
132 } | |
133 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
134 free(name); | |
135 return idx; | |
136 } | |
137 | |
138 void fai_save(const faidx_t *fai, FILE *fp) | |
139 { | |
140 khint_t k; | |
141 int i; | |
142 for (i = 0; i < fai->n; ++i) { | |
143 faidx1_t x; | |
144 k = kh_get(s, fai->hash, fai->name[i]); | |
145 x = kh_value(fai->hash, k); | |
146 #ifdef _WIN32 | |
147 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); | |
148 #else | |
149 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); | |
150 #endif | |
151 } | |
152 } | |
153 | |
154 faidx_t *fai_read(FILE *fp) | |
155 { | |
156 faidx_t *fai; | |
157 char *buf, *p; | |
158 int len, line_len, line_blen; | |
159 #ifdef _WIN32 | |
160 long offset; | |
161 #else | |
162 long long offset; | |
163 #endif | |
164 fai = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
165 fai->hash = kh_init(s); | |
166 buf = (char*)calloc(0x10000, 1); | |
167 while (!feof(fp) && fgets(buf, 0x10000, fp)) { | |
168 for (p = buf; *p && isgraph(*p); ++p); | |
169 *p = 0; ++p; | |
170 #ifdef _WIN32 | |
171 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); | |
172 #else | |
173 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); | |
174 #endif | |
175 fai_insert_index(fai, buf, len, line_len, line_blen, offset); | |
176 } | |
177 free(buf); | |
178 return fai; | |
179 } | |
180 | |
181 void fai_destroy(faidx_t *fai) | |
182 { | |
183 int i; | |
184 for (i = 0; i < fai->n; ++i) free(fai->name[i]); | |
185 free(fai->name); | |
186 kh_destroy(s, fai->hash); | |
187 if (fai->rz) razf_close(fai->rz); | |
188 free(fai); | |
189 } | |
190 | |
191 int fai_build(const char *fn) | |
192 { | |
193 char *str; | |
194 RAZF *rz; | |
195 FILE *fp; | |
196 faidx_t *fai; | |
197 str = (char*)calloc(strlen(fn) + 5, 1); | |
198 sprintf(str, "%s.fai", fn); | |
199 rz = razf_open(fn, "r"); | |
200 if (rz == 0) { | |
201 fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); | |
202 free(str); | |
203 return -1; | |
204 } | |
205 fai = fai_build_core(rz); | |
206 razf_close(rz); | |
207 fp = fopen(str, "wb"); | |
208 if (fp == 0) { | |
209 fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); | |
210 fai_destroy(fai); free(str); | |
211 return -1; | |
212 } | |
213 fai_save(fai, fp); | |
214 fclose(fp); | |
215 free(str); | |
216 fai_destroy(fai); | |
217 return 0; | |
218 } | |
219 | |
220 #ifdef _USE_KNETFILE | |
221 FILE *download_and_open(const char *fn) | |
222 { | |
223 const int buf_size = 1 * 1024 * 1024; | |
224 uint8_t *buf; | |
225 FILE *fp; | |
226 knetFile *fp_remote; | |
227 const char *url = fn; | |
228 const char *p; | |
229 int l = strlen(fn); | |
230 for (p = fn + l - 1; p >= fn; --p) | |
231 if (*p == '/') break; | |
232 fn = p + 1; | |
233 | |
234 // First try to open a local copy | |
235 fp = fopen(fn, "r"); | |
236 if (fp) | |
237 return fp; | |
238 | |
239 // If failed, download from remote and open | |
240 fp_remote = knet_open(url, "rb"); | |
241 if (fp_remote == 0) { | |
242 fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); | |
243 return NULL; | |
244 } | |
245 if ((fp = fopen(fn, "wb")) == 0) { | |
246 fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); | |
247 knet_close(fp_remote); | |
248 return NULL; | |
249 } | |
250 buf = (uint8_t*)calloc(buf_size, 1); | |
251 while ((l = knet_read(fp_remote, buf, buf_size)) != 0) | |
252 fwrite(buf, 1, l, fp); | |
253 free(buf); | |
254 fclose(fp); | |
255 knet_close(fp_remote); | |
256 | |
257 return fopen(fn, "r"); | |
258 } | |
259 #endif | |
260 | |
261 faidx_t *fai_load(const char *fn) | |
262 { | |
263 char *str; | |
264 FILE *fp; | |
265 faidx_t *fai; | |
266 str = (char*)calloc(strlen(fn) + 5, 1); | |
267 sprintf(str, "%s.fai", fn); | |
268 | |
269 #ifdef _USE_KNETFILE | |
270 if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) | |
271 { | |
272 fp = download_and_open(str); | |
273 if ( !fp ) | |
274 { | |
275 fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); | |
276 free(str); | |
277 return 0; | |
278 } | |
279 } | |
280 else | |
281 #endif | |
282 fp = fopen(str, "rb"); | |
283 if (fp == 0) { | |
284 fprintf(stderr, "[fai_load] build FASTA index.\n"); | |
285 fai_build(fn); | |
286 fp = fopen(str, "rb"); | |
287 if (fp == 0) { | |
288 fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); | |
289 free(str); | |
290 return 0; | |
291 } | |
292 } | |
293 | |
294 fai = fai_read(fp); | |
295 fclose(fp); | |
296 | |
297 fai->rz = razf_open(fn, "rb"); | |
298 free(str); | |
299 if (fai->rz == 0) { | |
300 fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); | |
301 return 0; | |
302 } | |
303 return fai; | |
304 } | |
305 | |
306 char *fai_fetch(const faidx_t *fai, const char *str, int *len) | |
307 { | |
308 char *s, *p, c; | |
309 int i, l, k; | |
310 khiter_t iter; | |
311 faidx1_t val; | |
312 khash_t(s) *h; | |
313 int beg, end; | |
314 | |
315 beg = end = -1; | |
316 h = fai->hash; | |
317 l = strlen(str); | |
318 p = s = (char*)malloc(l+1); | |
319 /* squeeze out "," */ | |
320 for (i = k = 0; i != l; ++i) | |
321 if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; | |
322 s[k] = 0; | |
323 for (i = 0; i != k; ++i) if (s[i] == ':') break; | |
324 s[i] = 0; | |
325 iter = kh_get(s, h, s); /* get the ref_id */ | |
326 if (iter == kh_end(h)) { | |
327 *len = 0; | |
328 free(s); return 0; | |
329 } | |
330 val = kh_value(h, iter); | |
331 if (i == k) { /* dump the whole sequence */ | |
332 beg = 0; end = val.len; | |
333 } else { | |
334 for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; | |
335 beg = atoi(p); | |
336 if (i < k) { | |
337 p = s + i + 1; | |
338 end = atoi(p); | |
339 } else end = val.len; | |
340 } | |
341 if (beg > 0) --beg; | |
342 if (beg >= val.len) beg = val.len; | |
343 if (end >= val.len) end = val.len; | |
344 if (beg > end) beg = end; | |
345 free(s); | |
346 | |
347 // now retrieve the sequence | |
348 l = 0; | |
349 s = (char*)malloc(end - beg + 2); | |
350 razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); | |
351 while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) | |
352 if (isgraph(c)) s[l++] = c; | |
353 s[l] = '\0'; | |
354 *len = l; | |
355 return s; | |
356 } | |
357 | |
358 int faidx_main(int argc, char *argv[]) | |
359 { | |
360 if (argc == 1) { | |
361 fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n"); | |
362 return 1; | |
363 } else { | |
364 if (argc == 2) fai_build(argv[1]); | |
365 else { | |
366 int i, j, k, l; | |
367 char *s; | |
368 faidx_t *fai; | |
369 fai = fai_load(argv[1]); | |
370 if (fai == 0) return 1; | |
371 for (i = 2; i != argc; ++i) { | |
372 printf(">%s\n", argv[i]); | |
373 s = fai_fetch(fai, argv[i], &l); | |
374 for (j = 0; j < l; j += 60) { | |
375 for (k = 0; k < 60 && k < l - j; ++k) | |
376 putchar(s[j + k]); | |
377 putchar('\n'); | |
378 } | |
379 free(s); | |
380 } | |
381 fai_destroy(fai); | |
382 } | |
383 } | |
384 return 0; | |
385 } | |
386 | |
387 int faidx_fetch_nseq(const faidx_t *fai) | |
388 { | |
389 return fai->n; | |
390 } | |
391 | |
392 char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) | |
393 { | |
394 int l; | |
395 char c; | |
396 khiter_t iter; | |
397 faidx1_t val; | |
398 char *seq=NULL; | |
399 | |
400 // Adjust position | |
401 iter = kh_get(s, fai->hash, c_name); | |
402 if(iter == kh_end(fai->hash)) return 0; | |
403 val = kh_value(fai->hash, iter); | |
404 if(p_end_i < p_beg_i) p_beg_i = p_end_i; | |
405 if(p_beg_i < 0) p_beg_i = 0; | |
406 else if(val.len <= p_beg_i) p_beg_i = val.len - 1; | |
407 if(p_end_i < 0) p_end_i = 0; | |
408 else if(val.len <= p_end_i) p_end_i = val.len - 1; | |
409 | |
410 // Now retrieve the sequence | |
411 l = 0; | |
412 seq = (char*)malloc(p_end_i - p_beg_i + 2); | |
413 razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); | |
414 while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) | |
415 if (isgraph(c)) seq[l++] = c; | |
416 seq[l] = '\0'; | |
417 *len = l; | |
418 return seq; | |
419 } | |
420 | |
421 #ifdef FAIDX_MAIN | |
422 int main(int argc, char *argv[]) { return faidx_main(argc, argv); } | |
423 #endif |