/* The MIT License

   Copyright (c) 2008 Genome Research Ltd (GRL).

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Contact: Heng Li <lh3@sanger.ac.uk> */

/* Last Modified: 12APR2009 */

30 #ifndef AC_KSEQ_H
|
|
31 #define AC_KSEQ_H
|
|
32
|
|
33 #include <ctype.h>
|
|
34 #include <string.h>
|
|
35 #include <stdlib.h>
|
|
36
|
|
/* Delimiter selectors understood by ks_getuntil(). Any delimiter value
   greater than KS_SEP_MAX is compared against the input as a literal
   character instead. */
#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* isspace() && !' ' */
#define KS_SEP_MAX   1

/* Declare kstream_t, a buffered reader wrapped around a handle of type type_t.
     buf    read buffer (allocated by ks_init)
     begin  index in buf of the next unread byte
     end    number of valid bytes currently held in buf
     is_eof non-zero once the stream has been flagged as exhausted
     f      the wrapped input handle, passed to the reader function */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		char *buf; \
		int begin, end, is_eof; \
		type_t f; \
	} kstream_t;

/* ks_eof(ks): true when the stream is flagged as exhausted AND every
   buffered byte has been consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* ks_rewind(ks): clear the EOF flag and empty the buffer. Only the
   bookkeeping fields are reset; the underlying handle is not touched here. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)

/* Define the constructor/destructor pair for kstream_t.

   ks_init(f):     allocate a zeroed kstream_t wrapping handle f together with
                   a read buffer of __bufsize bytes. Returns NULL if either
                   allocation fails (nothing is leaked in that case).
   ks_destroy(ks): free the buffer and the stream object; NULL is accepted.
                   The wrapped handle f is not released here. */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == 0) return 0; \
		ks->f = f; \
		ks->buf = (char*)malloc(__bufsize); \
		if (ks->buf == 0) { /* do not hand out a stream without a buffer */ \
			free(ks); \
			return 0; \
		} \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}

/* Define ks_getc(ks): return the next byte of the stream as a value in
   0..255, or -1 when no more input is available.

   The buffer is refilled with __read(ks->f, ks->buf, __bufsize) whenever it
   runs dry. A result <= 0 (end of input or a reader error) marks the stream
   as exhausted; a short but positive read is NOT treated as end of input,
   because readers such as read(2) on a pipe may legitimately return fewer
   bytes than requested. */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->begin >= ks->end) { \
			if (ks->is_eof) return -1; \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end <= 0) { /* 0: end of input, <0: reader error */ \
				ks->end = 0; \
				ks->is_eof = 1; \
				return -1; \
			} \
		} \
		/* go through unsigned char so byte 0xFF cannot be mistaken for -1 */ \
		return (int)(unsigned char)ks->buf[ks->begin++]; \
	}

/* Growable string used for parsed fields. Skipped when another header has
   already provided it (signalled by KSTRING_T being defined).
     l  number of characters currently stored (terminator not counted)
     m  number of bytes allocated for s
     s  heap buffer, or NULL before first use */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* kroundup32(x): round x up, in place, to the next power of two (values that
   already are a power of two are left unchanged). x must be a modifiable
   lvalue and is evaluated several times, so it must have no side effects.
   The bit smearing stops at a 16-bit shift, i.e. it only covers 32 bits. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/* Define ks_getuntil(ks, delimiter, str, dret): read one token from the
   stream into str, stopping at (and consuming) the first delimiter.

     delimiter  KS_SEP_SPACE (any isspace() char), KS_SEP_TAB (isspace() but
                not ' '), or a literal character value > KS_SEP_MAX
     str        receives the token; its buffer is reused and grown as needed
                and is always NUL-terminated on a non-negative return
     dret       if non-NULL, receives the delimiter that ended the token, or
                0 when the token was ended by the end of input

   Returns the token length (0 for an empty token), or -1 when the end of
   input is reached before anything could be read.

   The buffer is refilled with __read(ks->f, ks->buf, __bufsize); a result
   <= 0 (end of input or reader error) marks the stream as exhausted, while a
   short positive read does not.

   NOTE(review): the realloc() result is still used unchecked, as in the
   original; reporting allocation failure would need a new return code. */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ \
		int gotany = 0; \
		if (dret) *dret = 0; \
		str->l = 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			size_t n; \
			if (ks->begin >= ks->end) { \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, __bufsize); \
				if (ks->end <= 0) { /* 0: end of input, <0: reader error */ \
					ks->end = 0; \
					ks->is_eof = 1; \
					break; \
				} \
			} \
			gotany = 1; \
			if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace((unsigned char)ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace((unsigned char)ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = ks->end; /* invalid selector: behave as "no delimiter found" */ \
			n = (size_t)(i - ks->begin); /* bytes to append from this buffer */ \
			if (str->m - str->l < n + 1) { /* keep one spare byte for the NUL */ \
				str->m = str->l + n + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, n); \
			str->l += n; \
			ks->begin = i + 1; /* step over the delimiter, if any */ \
			if (i < ks->end) { /* stopped on a delimiter inside the buffer */ \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks->is_eof) return -1; /* hit end of input before reading anything */ \
		if (str->s == 0) { /* allocate only if there is no buffer yet (no leak on empty tokens) */ \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} \
		str->s[str->l] = '\0'; \
		return (int)str->l; \
	}

/* Instantiate the whole buffered-stream layer for one handle type:
   kstream_t, ks_init(), ks_destroy(), ks_getc() and ks_getuntil().
     type_t     type of the wrapped input handle
     __read     reader, invoked as __read(handle, buffer, __bufsize)
     __bufsize  size in bytes of the internal read buffer */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)

/* Define the lifecycle helpers for kseq_t.

   kseq_init(fd):    allocate a zeroed kseq_t reading from handle fd.
                     Returns NULL if the record or its stream cannot be
                     allocated.
   kseq_rewind(ks):  forget the pending header character and reset the
                     stream's buffer bookkeeping (is_eof, begin, end). Only
                     these fields are changed here.
   kseq_destroy(ks): free the four string buffers, the stream and the record
                     itself; NULL is accepted. */
#define __KSEQ_BASIC(type_t) \
	static inline kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == 0) return 0; \
		s->f = ks_init(fd); \
		if (s->f == 0) { /* stream allocation failed: do not return a half-built reader */ \
			free(s); \
			return 0; \
		} \
		return s; \
	} \
	static inline void kseq_rewind(kseq_t *ks) \
	{ \
		ks->last_char = 0; \
		ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
	} \
	static inline void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}

/* Define kseq_read(seq): read the next FASTA/FASTQ record into seq.

   A record starts at a '>' or '@' header character. The first
   whitespace-delimited word of the header line goes to seq->name and the
   rest of that line to seq->comment. Following printable characters are
   accumulated in seq->seq until the next '>', '@' or '+'. After a '+' line,
   quality characters (codes 33..127) are read into seq->qual until as many
   have been collected as there are sequence characters.

   Return value:
     >=0  length of the sequence (normal)
     -1   end-of-file
     -2   truncated quality string

   NOTE(review): realloc() results are used unchecked, as in the original.
 */
#define __KSEQ_READ \
	static int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* the first header char has been read */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; \
		if (ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c) < 0) return -1; \
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			/* <ctype.h> needs a value in unsigned char range */ \
			if (isgraph((unsigned char)c)) { /* printable non-space character */ \
				if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
					seq->seq.m = seq->seq.l + 2; \
					kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
					seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
				} \
				seq->seq.s[seq->seq.l++] = (char)c; \
			} \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return (int)seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* we should not stop here */ \
		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (char)c; \
		seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
		return (int)seq->seq.l; \
	}

/* Declare kseq_t, the state of one FASTA/FASTQ reader.
     name, comment, seq, qual  fields of the most recently read record
     last_char                 header character ('>' or '@') already consumed
                               for the next record, or 0 if none is pending
     f                         buffered stream the records are read from
   The type_t argument is accepted for symmetry with the other macros and is
   not used in the expansion. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;

/* Instantiate the complete reader for one handle type: the buffered stream
   layer (with a 4096-byte buffer), kseq_t, kseq_init(), kseq_rewind(),
   kseq_destroy() and kseq_read().
     type_t  type of the input handle
     __read  reader, invoked as __read(handle, buffer, size) */
#define KSEQ_INIT(type_t, __read) \
	KSTREAM_INIT(type_t, __read, 4096) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(type_t) \
	__KSEQ_READ

228 #endif
|