0
|
1 #include <zlib.h>
|
|
2 #include <stdlib.h>
|
|
3 #include <stdio.h>
|
|
4 #include <string.h>
|
|
5 #include "bcf.h"
|
|
6 #include "kstring.h"
|
|
7 #include "kseq.h"
|
|
8 KSTREAM_INIT(gzFile, gzread, 4096)
|
|
9
|
|
10 typedef struct {
|
|
11 gzFile fp;
|
|
12 FILE *fpout;
|
|
13 kstream_t *ks;
|
|
14 void *refhash;
|
|
15 kstring_t line;
|
|
16 int max_ref;
|
|
17 } vcf_t;
|
|
18
|
|
19 bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
|
|
20 {
|
|
21 kstring_t meta, smpl;
|
|
22 int dret;
|
|
23 vcf_t *v;
|
|
24 bcf_hdr_t *h;
|
|
25 if (!bp->is_vcf) return bcf_hdr_read(bp);
|
|
26 h = calloc(1, sizeof(bcf_hdr_t));
|
|
27 v = (vcf_t*)bp->v;
|
|
28 v->line.l = 0;
|
|
29 memset(&meta, 0, sizeof(kstring_t));
|
|
30 memset(&smpl, 0, sizeof(kstring_t));
|
|
31 while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
|
|
32 if (v->line.l < 2) continue;
|
|
33 if (v->line.s[0] != '#') {
|
|
34 free(meta.s);
|
|
35 free(smpl.s);
|
|
36 free(h);
|
|
37 return 0; // no sample line
|
|
38 }
|
|
39 if (v->line.s[0] == '#' && v->line.s[1] == '#') {
|
|
40 kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
|
|
41 } else if (v->line.s[0] == '#') {
|
|
42 int k;
|
|
43 ks_tokaux_t aux;
|
|
44 char *p;
|
|
45 for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
|
|
46 if (k >= 9) {
|
|
47 kputsn(p, aux.p - p, &smpl);
|
|
48 kputc('\0', &smpl);
|
|
49 }
|
|
50 }
|
|
51 break;
|
|
52 }
|
|
53 }
|
|
54 kputc('\0', &meta);
|
|
55 h->name = 0;
|
|
56 h->sname = smpl.s; h->l_smpl = smpl.l;
|
|
57 h->txt = meta.s; h->l_txt = meta.l;
|
|
58 bcf_hdr_sync(h);
|
|
59 return h;
|
|
60 }
|
|
61
|
|
62 bcf_t *vcf_open(const char *fn, const char *mode)
|
|
63 {
|
|
64 bcf_t *bp;
|
|
65 vcf_t *v;
|
|
66 if (strchr(mode, 'b')) return bcf_open(fn, mode);
|
|
67 bp = calloc(1, sizeof(bcf_t));
|
|
68 v = calloc(1, sizeof(vcf_t));
|
|
69 bp->is_vcf = 1;
|
|
70 bp->v = v;
|
|
71 v->refhash = bcf_str2id_init();
|
|
72 if (strchr(mode, 'r')) {
|
|
73 v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
|
74 v->ks = ks_init(v->fp);
|
|
75 } else if (strchr(mode, 'w'))
|
|
76 v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout;
|
|
77 return bp;
|
|
78 }
|
|
79
|
|
80 int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
|
|
81 {
|
|
82 vcf_t *v;
|
|
83 gzFile fp;
|
|
84 kstream_t *ks;
|
|
85 kstring_t s, rn;
|
|
86 int dret;
|
|
87 if (bp == 0) return -1;
|
|
88 if (!bp->is_vcf) return 0;
|
|
89 s.l = s.m = 0; s.s = 0;
|
|
90 rn.m = rn.l = h->l_nm; rn.s = h->name;
|
|
91 v = (vcf_t*)bp->v;
|
|
92 fp = gzopen(fn, "r");
|
|
93 ks = ks_init(fp);
|
|
94 while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
|
|
95 bcf_str2id_add(v->refhash, strdup(s.s));
|
|
96 kputs(s.s, &rn); kputc('\0', &rn);
|
|
97 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
|
|
98 }
|
|
99 ks_destroy(ks);
|
|
100 gzclose(fp);
|
|
101 h->l_nm = rn.l; h->name = rn.s;
|
|
102 bcf_hdr_sync(h);
|
|
103 free(s.s);
|
|
104 return 0;
|
|
105 }
|
|
106
|
|
107 int vcf_close(bcf_t *bp)
|
|
108 {
|
|
109 vcf_t *v;
|
|
110 if (bp == 0) return -1;
|
|
111 if (!bp->is_vcf) return bcf_close(bp);
|
|
112 v = (vcf_t*)bp->v;
|
|
113 if (v->fp) {
|
|
114 ks_destroy(v->ks);
|
|
115 gzclose(v->fp);
|
|
116 }
|
|
117 if (v->fpout) fclose(v->fpout);
|
|
118 free(v->line.s);
|
|
119 bcf_str2id_thorough_destroy(v->refhash);
|
|
120 free(v);
|
|
121 free(bp);
|
|
122 return 0;
|
|
123 }
|
|
124
|
|
125 int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
|
|
126 {
|
|
127 vcf_t *v = (vcf_t*)bp->v;
|
|
128 int i, has_ver = 0;
|
|
129 if (!bp->is_vcf) return bcf_hdr_write(bp, h);
|
|
130 if (h->l_txt > 0) {
|
|
131 if (strstr(h->txt, "##fileformat=")) has_ver = 1;
|
|
132 if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n");
|
|
133 fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
|
|
134 }
|
|
135 if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n");
|
|
136 fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
|
|
137 for (i = 0; i < h->n_smpl; ++i)
|
|
138 fprintf(v->fpout, "\t%s", h->sns[i]);
|
|
139 fputc('\n', v->fpout);
|
|
140 return 0;
|
|
141 }
|
|
142
|
|
143 int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
|
|
144 {
|
|
145 vcf_t *v = (vcf_t*)bp->v;
|
|
146 extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
|
|
147 if (!bp->is_vcf) return bcf_write(bp, h, b);
|
|
148 bcf_fmt_core(h, b, &v->line);
|
|
149 fwrite(v->line.s, 1, v->line.l, v->fpout);
|
|
150 fputc('\n', v->fpout);
|
|
151 return v->line.l + 1;
|
|
152 }
|
|
153
|
|
154 int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
|
|
155 {
|
|
156 int dret, k, i, sync = 0;
|
|
157 vcf_t *v = (vcf_t*)bp->v;
|
|
158 char *p, *q;
|
|
159 kstring_t str, rn;
|
|
160 ks_tokaux_t aux, a2;
|
|
161 if (!bp->is_vcf) return bcf_read(bp, h, b);
|
|
162 v->line.l = 0;
|
|
163 str.l = 0; str.m = b->m_str; str.s = b->str;
|
|
164 rn.l = rn.m = h->l_nm; rn.s = h->name;
|
|
165 if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
|
|
166 b->n_smpl = h->n_smpl;
|
|
167 for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
|
|
168 *(char*)aux.p = 0;
|
|
169 if (k == 0) { // ref
|
|
170 int tid = bcf_str2id(v->refhash, p);
|
|
171 if (tid < 0) {
|
|
172 tid = bcf_str2id_add(v->refhash, strdup(p));
|
|
173 kputs(p, &rn); kputc('\0', &rn);
|
|
174 sync = 1;
|
|
175 }
|
|
176 b->tid = tid;
|
|
177 } else if (k == 1) { // pos
|
|
178 b->pos = atoi(p) - 1;
|
|
179 } else if (k == 5) { // qual
|
|
180 b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
|
|
181 } else if (k <= 8) { // variable length strings
|
|
182 kputs(p, &str); kputc('\0', &str);
|
|
183 b->l_str = str.l; b->m_str = str.m; b->str = str.s;
|
|
184 if (k == 8) bcf_sync(b);
|
|
185 } else { // k > 9
|
|
186 if (strncmp(p, "./.", 3) == 0) {
|
|
187 for (i = 0; i < b->n_gi; ++i) {
|
|
188 if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
|
|
189 ((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
|
|
190 } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
|
|
191 ((uint8_t*)b->gi[i].data)[k-9] = 0;
|
|
192 } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
|
|
193 ((int32_t*)b->gi[i].data)[k-9] = 0;
|
|
194 } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
|
|
195 ((uint16_t*)b->gi[i].data)[k-9] = 0;
|
|
196 } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
|
|
197 int y = b->n_alleles * (b->n_alleles + 1) / 2;
|
|
198 memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
|
|
199 } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
|
|
200 int y = b->n_alleles * (b->n_alleles + 1) / 2;
|
|
201 memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
|
|
202 }
|
|
203 }
|
|
204 goto endblock;
|
|
205 }
|
|
206 for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
|
|
207 if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
|
|
208 ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
|
|
209 } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
|
|
210 double _x = strtod(q, &q);
|
|
211 int x = (int)(_x + .499);
|
|
212 if (x > 255) x = 255;
|
|
213 ((uint8_t*)b->gi[i].data)[k-9] = x;
|
|
214 } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
|
|
215 int x = strtol(q, &q, 10);
|
|
216 if (x > 0xffff) x = 0xffff;
|
|
217 ((uint32_t*)b->gi[i].data)[k-9] = x;
|
|
218 } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
|
|
219 int x = strtol(q, &q, 10);
|
|
220 if (x > 0xffff) x = 0xffff;
|
|
221 ((uint16_t*)b->gi[i].data)[k-9] = x;
|
|
222 } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
|
|
223 int x, y, j;
|
|
224 uint8_t *data = (uint8_t*)b->gi[i].data;
|
|
225 y = b->n_alleles * (b->n_alleles + 1) / 2;
|
|
226 for (j = 0; j < y; ++j) {
|
|
227 x = strtol(q, &q, 10);
|
|
228 if (x > 255) x = 255;
|
|
229 data[(k-9) * y + j] = x;
|
|
230 ++q;
|
|
231 }
|
|
232 } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
|
|
233 int j, y;
|
|
234 float x, *data = (float*)b->gi[i].data;
|
|
235 y = b->n_alleles * (b->n_alleles + 1) / 2;
|
|
236 for (j = 0; j < y; ++j) {
|
|
237 x = strtod(q, &q);
|
|
238 data[(k-9) * y + j] = x > 0? -x/10. : x;
|
|
239 ++q;
|
|
240 }
|
|
241 }
|
|
242 }
|
|
243 endblock: i = i;
|
|
244 }
|
|
245 }
|
|
246 h->l_nm = rn.l; h->name = rn.s;
|
|
247 if (sync) bcf_hdr_sync(h);
|
|
248 return v->line.l + 1;
|
|
249 }
|