Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bcftools/vcf.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <zlib.h> | |
2 #include <stdlib.h> | |
3 #include <stdio.h> | |
4 #include <string.h> | |
5 #include "bcf.h" | |
6 #include "kstring.h" | |
7 #include "kseq.h" | |
8 KSTREAM_INIT(gzFile, gzread, 4096) | |
9 | |
10 typedef struct { | |
11 gzFile fp; | |
12 FILE *fpout; | |
13 kstream_t *ks; | |
14 void *refhash; | |
15 kstring_t line; | |
16 int max_ref; | |
17 } vcf_t; | |
18 | |
19 bcf_hdr_t *vcf_hdr_read(bcf_t *bp) | |
20 { | |
21 kstring_t meta, smpl; | |
22 int dret; | |
23 vcf_t *v; | |
24 bcf_hdr_t *h; | |
25 if (!bp->is_vcf) return bcf_hdr_read(bp); | |
26 h = calloc(1, sizeof(bcf_hdr_t)); | |
27 v = (vcf_t*)bp->v; | |
28 v->line.l = 0; | |
29 memset(&meta, 0, sizeof(kstring_t)); | |
30 memset(&smpl, 0, sizeof(kstring_t)); | |
31 while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) { | |
32 if (v->line.l < 2) continue; | |
33 if (v->line.s[0] != '#') { | |
34 free(meta.s); | |
35 free(smpl.s); | |
36 free(h); | |
37 return 0; // no sample line | |
38 } | |
39 if (v->line.s[0] == '#' && v->line.s[1] == '#') { | |
40 kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta); | |
41 } else if (v->line.s[0] == '#') { | |
42 int k; | |
43 ks_tokaux_t aux; | |
44 char *p; | |
45 for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) { | |
46 if (k >= 9) { | |
47 kputsn(p, aux.p - p, &smpl); | |
48 kputc('\0', &smpl); | |
49 } | |
50 } | |
51 break; | |
52 } | |
53 } | |
54 kputc('\0', &meta); | |
55 h->name = 0; | |
56 h->sname = smpl.s; h->l_smpl = smpl.l; | |
57 h->txt = meta.s; h->l_txt = meta.l; | |
58 bcf_hdr_sync(h); | |
59 return h; | |
60 } | |
61 | |
62 bcf_t *vcf_open(const char *fn, const char *mode) | |
63 { | |
64 bcf_t *bp; | |
65 vcf_t *v; | |
66 if (strchr(mode, 'b')) return bcf_open(fn, mode); | |
67 bp = calloc(1, sizeof(bcf_t)); | |
68 v = calloc(1, sizeof(vcf_t)); | |
69 bp->is_vcf = 1; | |
70 bp->v = v; | |
71 v->refhash = bcf_str2id_init(); | |
72 if (strchr(mode, 'r')) { | |
73 v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); | |
74 v->ks = ks_init(v->fp); | |
75 } else if (strchr(mode, 'w')) | |
76 v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout; | |
77 return bp; | |
78 } | |
79 | |
80 int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn) | |
81 { | |
82 vcf_t *v; | |
83 gzFile fp; | |
84 kstream_t *ks; | |
85 kstring_t s, rn; | |
86 int dret; | |
87 if (bp == 0) return -1; | |
88 if (!bp->is_vcf) return 0; | |
89 s.l = s.m = 0; s.s = 0; | |
90 rn.m = rn.l = h->l_nm; rn.s = h->name; | |
91 v = (vcf_t*)bp->v; | |
92 fp = gzopen(fn, "r"); | |
93 ks = ks_init(fp); | |
94 while (ks_getuntil(ks, 0, &s, &dret) >= 0) { | |
95 bcf_str2id_add(v->refhash, strdup(s.s)); | |
96 kputs(s.s, &rn); kputc('\0', &rn); | |
97 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); | |
98 } | |
99 ks_destroy(ks); | |
100 gzclose(fp); | |
101 h->l_nm = rn.l; h->name = rn.s; | |
102 bcf_hdr_sync(h); | |
103 free(s.s); | |
104 return 0; | |
105 } | |
106 | |
107 int vcf_close(bcf_t *bp) | |
108 { | |
109 vcf_t *v; | |
110 if (bp == 0) return -1; | |
111 if (!bp->is_vcf) return bcf_close(bp); | |
112 v = (vcf_t*)bp->v; | |
113 if (v->fp) { | |
114 ks_destroy(v->ks); | |
115 gzclose(v->fp); | |
116 } | |
117 if (v->fpout) fclose(v->fpout); | |
118 free(v->line.s); | |
119 bcf_str2id_thorough_destroy(v->refhash); | |
120 free(v); | |
121 free(bp); | |
122 return 0; | |
123 } | |
124 | |
125 int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) | |
126 { | |
127 vcf_t *v = (vcf_t*)bp->v; | |
128 int i, has_ver = 0; | |
129 if (!bp->is_vcf) return bcf_hdr_write(bp, h); | |
130 if (h->l_txt > 0) { | |
131 if (strstr(h->txt, "##fileformat=")) has_ver = 1; | |
132 if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); | |
133 fwrite(h->txt, 1, h->l_txt - 1, v->fpout); | |
134 } | |
135 if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); | |
136 fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); | |
137 for (i = 0; i < h->n_smpl; ++i) | |
138 fprintf(v->fpout, "\t%s", h->sns[i]); | |
139 fputc('\n', v->fpout); | |
140 return 0; | |
141 } | |
142 | |
143 int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) | |
144 { | |
145 vcf_t *v = (vcf_t*)bp->v; | |
146 extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s); | |
147 if (!bp->is_vcf) return bcf_write(bp, h, b); | |
148 bcf_fmt_core(h, b, &v->line); | |
149 fwrite(v->line.s, 1, v->line.l, v->fpout); | |
150 fputc('\n', v->fpout); | |
151 return v->line.l + 1; | |
152 } | |
153 | |
154 int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) | |
155 { | |
156 int dret, k, i, sync = 0; | |
157 vcf_t *v = (vcf_t*)bp->v; | |
158 char *p, *q; | |
159 kstring_t str, rn; | |
160 ks_tokaux_t aux, a2; | |
161 if (!bp->is_vcf) return bcf_read(bp, h, b); | |
162 v->line.l = 0; | |
163 str.l = 0; str.m = b->m_str; str.s = b->str; | |
164 rn.l = rn.m = h->l_nm; rn.s = h->name; | |
165 if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1; | |
166 b->n_smpl = h->n_smpl; | |
167 for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) { | |
168 *(char*)aux.p = 0; | |
169 if (k == 0) { // ref | |
170 int tid = bcf_str2id(v->refhash, p); | |
171 if (tid < 0) { | |
172 tid = bcf_str2id_add(v->refhash, strdup(p)); | |
173 kputs(p, &rn); kputc('\0', &rn); | |
174 sync = 1; | |
175 } | |
176 b->tid = tid; | |
177 } else if (k == 1) { // pos | |
178 b->pos = atoi(p) - 1; | |
179 } else if (k == 5) { // qual | |
180 b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0; | |
181 } else if (k <= 8) { // variable length strings | |
182 kputs(p, &str); kputc('\0', &str); | |
183 b->l_str = str.l; b->m_str = str.m; b->str = str.s; | |
184 if (k == 8) bcf_sync(b); | |
185 } else { // k > 9 | |
186 if (strncmp(p, "./.", 3) == 0) { | |
187 for (i = 0; i < b->n_gi; ++i) { | |
188 if (b->gi[i].fmt == bcf_str2int("GT", 2)) { | |
189 ((uint8_t*)b->gi[i].data)[k-9] = 1<<7; | |
190 } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { | |
191 ((uint8_t*)b->gi[i].data)[k-9] = 0; | |
192 } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { | |
193 ((int32_t*)b->gi[i].data)[k-9] = 0; | |
194 } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { | |
195 ((uint16_t*)b->gi[i].data)[k-9] = 0; | |
196 } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { | |
197 int y = b->n_alleles * (b->n_alleles + 1) / 2; | |
198 memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y); | |
199 } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { | |
200 int y = b->n_alleles * (b->n_alleles + 1) / 2; | |
201 memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4); | |
202 } | |
203 } | |
204 goto endblock; | |
205 } | |
206 for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { | |
207 if (b->gi[i].fmt == bcf_str2int("GT", 2)) { | |
208 ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6; | |
209 } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { | |
210 double _x = strtod(q, &q); | |
211 int x = (int)(_x + .499); | |
212 if (x > 255) x = 255; | |
213 ((uint8_t*)b->gi[i].data)[k-9] = x; | |
214 } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { | |
215 int x = strtol(q, &q, 10); | |
216 if (x > 0xffff) x = 0xffff; | |
217 ((uint32_t*)b->gi[i].data)[k-9] = x; | |
218 } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { | |
219 int x = strtol(q, &q, 10); | |
220 if (x > 0xffff) x = 0xffff; | |
221 ((uint16_t*)b->gi[i].data)[k-9] = x; | |
222 } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { | |
223 int x, y, j; | |
224 uint8_t *data = (uint8_t*)b->gi[i].data; | |
225 y = b->n_alleles * (b->n_alleles + 1) / 2; | |
226 for (j = 0; j < y; ++j) { | |
227 x = strtol(q, &q, 10); | |
228 if (x > 255) x = 255; | |
229 data[(k-9) * y + j] = x; | |
230 ++q; | |
231 } | |
232 } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { | |
233 int j, y; | |
234 float x, *data = (float*)b->gi[i].data; | |
235 y = b->n_alleles * (b->n_alleles + 1) / 2; | |
236 for (j = 0; j < y; ++j) { | |
237 x = strtod(q, &q); | |
238 data[(k-9) * y + j] = x > 0? -x/10. : x; | |
239 ++q; | |
240 } | |
241 } | |
242 } | |
243 endblock: i = i; | |
244 } | |
245 } | |
246 h->l_nm = rn.l; h->name = rn.s; | |
247 if (sync) bcf_hdr_sync(h); | |
248 return v->line.l + 1; | |
249 } |