Mercurial > repos > ryanmorin > nextgen_variant_identification
comparison SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/samtools-0.1.6/faidx.c @ 0:74f5ea818cea
Uploaded
author | ryanmorin |
---|---|
date | Wed, 12 Oct 2011 19:50:38 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:74f5ea818cea |
---|---|
1 #include <ctype.h> | |
2 #include <string.h> | |
3 #include <stdlib.h> | |
4 #include <stdio.h> | |
5 #include "faidx.h" | |
6 #include "khash.h" | |
7 | |
8 typedef struct { | |
9 uint64_t len:32, line_len:16, line_blen:16; | |
10 uint64_t offset; | |
11 } faidx1_t; | |
12 KHASH_MAP_INIT_STR(s, faidx1_t) | |
13 | |
14 #ifndef _NO_RAZF | |
15 #include "razf.h" | |
16 #else | |
17 #ifdef _WIN32 | |
18 #define ftello(fp) ftell(fp) | |
19 #define fseeko(fp, offset, whence) fseek(fp, offset, whence) | |
20 #else | |
21 extern off_t ftello(FILE *stream); | |
22 extern int fseeko(FILE *stream, off_t offset, int whence); | |
23 #endif | |
24 #define RAZF FILE | |
25 #define razf_read(fp, buf, size) fread(buf, 1, size, fp) | |
26 #define razf_open(fn, mode) fopen(fn, mode) | |
27 #define razf_close(fp) fclose(fp) | |
28 #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) | |
29 #define razf_tell(fp) ftello(fp) | |
30 #endif | |
31 | |
32 struct __faidx_t { | |
33 RAZF *rz; | |
34 int n, m; | |
35 char **name; | |
36 khash_t(s) *hash; | |
37 }; | |
38 | |
39 #ifndef kroundup32 | |
40 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
41 #endif | |
42 | |
43 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) | |
44 { | |
45 khint_t k; | |
46 int ret; | |
47 faidx1_t t; | |
48 if (idx->n == idx->m) { | |
49 idx->m = idx->m? idx->m<<1 : 16; | |
50 idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); | |
51 } | |
52 idx->name[idx->n] = strdup(name); | |
53 k = kh_put(s, idx->hash, idx->name[idx->n], &ret); | |
54 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; | |
55 kh_value(idx->hash, k) = t; | |
56 ++idx->n; | |
57 } | |
58 | |
59 faidx_t *fai_build_core(RAZF *rz) | |
60 { | |
61 char c, *name; | |
62 int l_name, m_name, ret; | |
63 int len, line_len, line_blen, state; | |
64 int l1, l2; | |
65 faidx_t *idx; | |
66 uint64_t offset; | |
67 | |
68 idx = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
69 idx->hash = kh_init(s); | |
70 name = 0; l_name = m_name = 0; | |
71 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; | |
72 while (razf_read(rz, &c, 1)) { | |
73 if (c == '\n') { // an empty line | |
74 if (state == 1) { | |
75 offset = razf_tell(rz); | |
76 continue; | |
77 } else if ((state == 0 && len < 0) || state == 2) continue; | |
78 } | |
79 if (c == '>') { // fasta header | |
80 if (len >= 0) | |
81 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
82 l_name = 0; | |
83 while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { | |
84 if (m_name < l_name + 2) { | |
85 m_name = l_name + 2; | |
86 kroundup32(m_name); | |
87 name = (char*)realloc(name, m_name); | |
88 } | |
89 name[l_name++] = c; | |
90 } | |
91 name[l_name] = '\0'; | |
92 if (ret == 0) { | |
93 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); | |
94 free(name); fai_destroy(idx); | |
95 return 0; | |
96 } | |
97 if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); | |
98 state = 1; len = 0; | |
99 offset = razf_tell(rz); | |
100 } else { | |
101 if (state == 3) { | |
102 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); | |
103 free(name); fai_destroy(idx); | |
104 return 0; | |
105 } | |
106 if (state == 2) state = 3; | |
107 l1 = l2 = 0; | |
108 do { | |
109 ++l1; | |
110 if (isgraph(c)) ++l2; | |
111 } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); | |
112 if (state == 3 && l2) { | |
113 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); | |
114 free(name); fai_destroy(idx); | |
115 return 0; | |
116 } | |
117 ++l1; len += l2; | |
118 if (l2 >= 0x10000) { | |
119 fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name); | |
120 free(name); fai_destroy(idx); | |
121 return 0; | |
122 } | |
123 if (state == 1) line_len = l1, line_blen = l2, state = 0; | |
124 else if (state == 0) { | |
125 if (l1 != line_len || l2 != line_blen) state = 2; | |
126 } | |
127 } | |
128 } | |
129 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
130 free(name); | |
131 return idx; | |
132 } | |
133 | |
134 void fai_save(const faidx_t *fai, FILE *fp) | |
135 { | |
136 khint_t k; | |
137 int i; | |
138 for (i = 0; i < fai->n; ++i) { | |
139 faidx1_t x; | |
140 k = kh_get(s, fai->hash, fai->name[i]); | |
141 x = kh_value(fai->hash, k); | |
142 #ifdef _WIN32 | |
143 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); | |
144 #else | |
145 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); | |
146 #endif | |
147 } | |
148 } | |
149 | |
150 faidx_t *fai_read(FILE *fp) | |
151 { | |
152 faidx_t *fai; | |
153 char *buf, *p; | |
154 int len, line_len, line_blen; | |
155 #ifdef _WIN32 | |
156 long offset; | |
157 #else | |
158 long long offset; | |
159 #endif | |
160 fai = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
161 fai->hash = kh_init(s); | |
162 buf = (char*)calloc(0x10000, 1); | |
163 while (!feof(fp) && fgets(buf, 0x10000, fp)) { | |
164 for (p = buf; *p && isgraph(*p); ++p); | |
165 *p = 0; ++p; | |
166 #ifdef _WIN32 | |
167 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); | |
168 #else | |
169 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); | |
170 #endif | |
171 fai_insert_index(fai, buf, len, line_len, line_blen, offset); | |
172 } | |
173 free(buf); | |
174 return fai; | |
175 } | |
176 | |
177 void fai_destroy(faidx_t *fai) | |
178 { | |
179 int i; | |
180 for (i = 0; i < fai->n; ++i) free(fai->name[i]); | |
181 free(fai->name); | |
182 kh_destroy(s, fai->hash); | |
183 if (fai->rz) razf_close(fai->rz); | |
184 free(fai); | |
185 } | |
186 | |
187 int fai_build(const char *fn) | |
188 { | |
189 char *str; | |
190 RAZF *rz; | |
191 FILE *fp; | |
192 faidx_t *fai; | |
193 str = (char*)calloc(strlen(fn) + 5, 1); | |
194 sprintf(str, "%s.fai", fn); | |
195 rz = razf_open(fn, "r"); | |
196 if (rz == 0) { | |
197 fprintf(stderr, "[fai_build] fail to open the FASTA file.\n"); | |
198 free(str); | |
199 return -1; | |
200 } | |
201 fai = fai_build_core(rz); | |
202 razf_close(rz); | |
203 fp = fopen(str, "wb"); | |
204 if (fp == 0) { | |
205 fprintf(stderr, "[fai_build] fail to write FASTA index.\n"); | |
206 fai_destroy(fai); free(str); | |
207 return -1; | |
208 } | |
209 fai_save(fai, fp); | |
210 fclose(fp); | |
211 free(str); | |
212 fai_destroy(fai); | |
213 return 0; | |
214 } | |
215 | |
216 faidx_t *fai_load(const char *fn) | |
217 { | |
218 char *str; | |
219 FILE *fp; | |
220 faidx_t *fai; | |
221 str = (char*)calloc(strlen(fn) + 5, 1); | |
222 sprintf(str, "%s.fai", fn); | |
223 fp = fopen(str, "rb"); | |
224 if (fp == 0) { | |
225 fprintf(stderr, "[fai_load] build FASTA index.\n"); | |
226 fai_build(fn); | |
227 fp = fopen(str, "r"); | |
228 if (fp == 0) { | |
229 fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); | |
230 free(str); | |
231 return 0; | |
232 } | |
233 } | |
234 fai = fai_read(fp); | |
235 fclose(fp); | |
236 fai->rz = razf_open(fn, "rb"); | |
237 free(str); | |
238 if (fai->rz == 0) { | |
239 fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); | |
240 return 0; | |
241 } | |
242 return fai; | |
243 } | |
244 | |
245 char *fai_fetch(const faidx_t *fai, const char *str, int *len) | |
246 { | |
247 char *s, *p, c; | |
248 int i, l, k; | |
249 khiter_t iter; | |
250 faidx1_t val; | |
251 khash_t(s) *h; | |
252 int beg, end; | |
253 | |
254 beg = end = -1; | |
255 h = fai->hash; | |
256 l = strlen(str); | |
257 p = s = (char*)malloc(l+1); | |
258 /* squeeze out "," */ | |
259 for (i = k = 0; i != l; ++i) | |
260 if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; | |
261 s[k] = 0; | |
262 for (i = 0; i != k; ++i) if (s[i] == ':') break; | |
263 s[i] = 0; | |
264 iter = kh_get(s, h, s); /* get the ref_id */ | |
265 if (iter == kh_end(h)) { | |
266 *len = 0; | |
267 free(s); return 0; | |
268 } | |
269 val = kh_value(h, iter); | |
270 if (i == k) { /* dump the whole sequence */ | |
271 beg = 0; end = val.len; | |
272 } else { | |
273 for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; | |
274 beg = atoi(p); | |
275 if (i < k) { | |
276 p = s + i + 1; | |
277 end = atoi(p); | |
278 } else end = val.len; | |
279 } | |
280 if (beg > 0) --beg; | |
281 if (beg >= val.len) beg = val.len; | |
282 if (end >= val.len) end = val.len; | |
283 if (beg > end) beg = end; | |
284 free(s); | |
285 | |
286 // now retrieve the sequence | |
287 l = 0; | |
288 s = (char*)malloc(end - beg + 2); | |
289 razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); | |
290 while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg) | |
291 if (isgraph(c)) s[l++] = c; | |
292 s[l] = '\0'; | |
293 *len = l; | |
294 return s; | |
295 } | |
296 | |
297 int faidx_main(int argc, char *argv[]) | |
298 { | |
299 if (argc == 1) { | |
300 fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n"); | |
301 return 1; | |
302 } else { | |
303 if (argc == 2) fai_build(argv[1]); | |
304 else { | |
305 int i, j, k, l; | |
306 char *s; | |
307 faidx_t *fai; | |
308 fai = fai_load(argv[1]); | |
309 if (fai == 0) return 1; | |
310 for (i = 2; i != argc; ++i) { | |
311 printf(">%s\n", argv[i]); | |
312 s = fai_fetch(fai, argv[i], &l); | |
313 for (j = 0; j < l; j += 60) { | |
314 for (k = 0; k < 60 && k < l - j; ++k) | |
315 putchar(s[j + k]); | |
316 putchar('\n'); | |
317 } | |
318 free(s); | |
319 } | |
320 fai_destroy(fai); | |
321 } | |
322 } | |
323 return 0; | |
324 } | |
325 | |
326 #ifdef FAIDX_MAIN | |
327 int main(int argc, char *argv[]) { return faidx_main(argc, argv); } | |
328 #endif |