annotate pyPRADA_1.2/tools/samtools-0.1.16/bcftools/bcfutils.c @ 3:f17965495ec9 draft default tip

Uploaded
author siyuan
date Tue, 11 Mar 2014 12:14:01 -0400
parents acc2ca1a3ba4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #include <string.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2 #include <math.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 #include "bcf.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 #include "kstring.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 #include "khash.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 KHASH_MAP_INIT_STR(str2id, int)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 void *bcf_build_refhash(bcf_hdr_t *h)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 khash_t(str2id) *hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 int i, ret;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 hash = kh_init(str2id);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 for (i = 0; i < h->n_ref; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 kh_val(hash, k) = i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 return hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 void *bcf_str2id_init()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 return kh_init(str2id);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 void bcf_str2id_destroy(void *_hash)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 void bcf_str2id_thorough_destroy(void *_hash)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 if (hash == 0) return;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 for (k = 0; k < kh_end(hash); ++k)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 kh_destroy(str2id, hash);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 int bcf_str2id(void *_hash, const char *str)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 if (!hash) return -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 k = kh_get(str2id, hash, str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 return k == kh_end(hash)? -1 : kh_val(hash, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 int bcf_str2id_add(void *_hash, const char *str)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 int ret;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 if (!hash) return -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 k = kh_put(str2id, hash, str, &ret);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 if (ret == 0) return kh_val(hash, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 kh_val(hash, k) = kh_size(hash) - 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 return kh_val(hash, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 int bcf_shrink_alt(bcf1_t *b, int n)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 char *p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 int i, j, k, n_smpl = b->n_smpl;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 if (b->n_alleles <= n) return -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 // update ALT
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 if (n > 1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 for (p = b->alt, k = 1; *p; ++p)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 if (*p == ',' && ++k == n) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 *p = '\0';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 } else p = b->alt, *p = '\0';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 ++p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 memmove(p, b->flt, b->str + b->l_str - b->flt);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 b->l_str -= b->flt - p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 // update PL
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 for (i = 0; i < b->n_gi; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 bcf_ginfo_t *g = b->gi + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 if (g->fmt == bcf_str2int("PL", 2)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 uint8_t *d = (uint8_t*)g->data;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 g->len = n * (n + 1) / 2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 for (l = k = 0; l < n_smpl; ++l) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 uint8_t *dl = d + l * x;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 for (j = 0; j < g->len; ++j) d[k++] = dl[j];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 } // FIXME: to add GL
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 b->n_alleles = n;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 bcf_sync(b);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 int bcf_gl2pl(bcf1_t *b)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 char *p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 int i, n_smpl = b->n_smpl;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 bcf_ginfo_t *g;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 float *d0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 uint8_t *d1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 if (strstr(b->fmt, "PL")) return -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 if ((p = strstr(b->fmt, "GL")) == 0) return -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 *p = 'P';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 for (i = 0; i < b->n_gi; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 if (b->gi[i].fmt == bcf_str2int("GL", 2))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 g = b->gi + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 g->fmt = bcf_str2int("PL", 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 g->len /= 4; // 4 == sizeof(float)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 d0 = (float*)g->data; d1 = (uint8_t*)g->data;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 for (i = 0; i < n_smpl * g->len; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 int x = (int)(-10. * d0[i] + .499);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 if (x > 255) x = 255;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 if (x < 0) x = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 d1[i] = x;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 * produces such FMT, but others may do. */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 int bcf_fix_gt(bcf1_t *b)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 char *s;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 uint32_t tmp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 bcf_ginfo_t gt;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 // check the presence of the GT FMT
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 tmp = bcf_str2int("GT", 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 for (i = 0; i < b->n_gi; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 if (b->gi[i].fmt == tmp) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 gt = b->gi[i];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 // move GT to the first
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 for (; i > 0; --i) b->gi[i] = b->gi[i-1];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 b->gi[0] = gt;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139 memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 int bcf_fix_pl(bcf1_t *b)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 uint32_t tmp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 uint8_t *PL, *swap;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 bcf_ginfo_t *gi;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 // pinpoint PL
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 tmp = bcf_str2int("PL", 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 for (i = 0; i < b->n_gi; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 if (b->gi[i].fmt == tmp) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 if (i == b->n_gi) return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 // prepare
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 gi = b->gi + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 PL = (uint8_t*)gi->data;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 swap = alloca(gi->len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 // loop through individuals
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 for (i = 0; i < b->n_smpl; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 int k, l, x;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 uint8_t *PLi = PL + i * gi->len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163 memcpy(swap, PLi, gi->len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164 for (k = x = 0; k < b->n_alleles; ++k)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165 for (l = k; l < b->n_alleles; ++l)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 PLi[l*(l+1)/2 + k] = swap[x++];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 int bcf_smpl_covered(const bcf1_t *b)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 int i, j, n = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174 uint32_t tmp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175 bcf_ginfo_t *gi;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176 // pinpoint PL
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177 tmp = bcf_str2int("PL", 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 for (i = 0; i < b->n_gi; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 if (b->gi[i].fmt == tmp) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 if (i == b->n_gi) return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 // count how many samples having PL!=[0..0]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 gi = b->gi + i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 for (i = 0; i < b->n_smpl; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 for (j = 0; j < gi->len; ++j)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 if (PLi[j]) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 if (j < gi->len) ++n;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189 return n;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
191
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
192 static void *locate_field(const bcf1_t *b, const char *fmt, int l)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
193 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
194 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
195 uint32_t tmp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
196 tmp = bcf_str2int(fmt, l);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
197 for (i = 0; i < b->n_gi; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
198 if (b->gi[i].fmt == tmp) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
199 return i == b->n_gi? 0 : b->gi[i].data;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
200 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
201
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
202 int bcf_anno_max(bcf1_t *b)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
203 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
204 int k, max_gq, max_sp, n_het;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
205 kstring_t str;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
206 uint8_t *gt, *gq;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
207 int32_t *sp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
208 max_gq = max_sp = n_het = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
209 gt = locate_field(b, "GT", 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
210 if (gt == 0) return -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
211 gq = locate_field(b, "GQ", 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
212 sp = locate_field(b, "SP", 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
213 if (sp)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
214 for (k = 0; k < b->n_smpl; ++k)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
215 if (gt[k]&0x3f)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
216 max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
217 if (gq)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
218 for (k = 0; k < b->n_smpl; ++k)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
219 if (gt[k]&0x3f)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
220 max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
221 for (k = 0; k < b->n_smpl; ++k) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
222 int a1, a2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
223 a1 = gt[k]&7; a2 = gt[k]>>3&7;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
224 if ((!a1 && a2) || (!a2 && a1)) { // a het
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
225 if (gq == 0) ++n_het;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
226 else if (gq[k] >= 20) ++n_het;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
227 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
228 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
229 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
230 if (max_sp < 0) max_sp = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
231 memset(&str, 0, sizeof(kstring_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
232 if (*b->info) kputc(';', &str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
233 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
234 bcf_append_info(b, str.s, str.l);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
235 free(str.s);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
236 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
237 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
238
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
239 // FIXME: only data are shuffled; the header is NOT
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
240 int bcf_shuffle(bcf1_t *b, int seed)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
241 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
242 int i, j, *a;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
243 if (seed > 0) srand48(seed);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
244 a = malloc(b->n_smpl * sizeof(int));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
245 for (i = 0; i < b->n_smpl; ++i) a[i] = i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
246 for (i = b->n_smpl; i > 1; --i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
247 int tmp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
248 j = (int)(drand48() * i);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
249 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
250 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
251 for (j = 0; j < b->n_gi; ++j) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
252 bcf_ginfo_t *gi = b->gi + j;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
253 uint8_t *swap, *data = (uint8_t*)gi->data;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
254 swap = malloc(gi->len * b->n_smpl);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
255 for (i = 0; i < b->n_smpl; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
256 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
257 free(gi->data);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
258 gi->data = swap;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
259 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
260 free(a);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
261 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
262 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
263
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
264 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
265 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
266 int i, ret, j;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
267 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
268 bcf_hdr_t *h;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
269 khash_t(str2id) *hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
270 kstring_t s;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
271 s.l = s.m = 0; s.s = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
272 hash = kh_init(str2id);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
273 for (i = 0; i < h0->n_smpl; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
274 k = kh_put(str2id, hash, h0->sns[i], &ret);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
275 kh_val(hash, k) = i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
276 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
277 for (i = j = 0; i < n; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
278 k = kh_get(str2id, hash, samples[i]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
279 if (k != kh_end(hash)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
280 list[j++] = kh_val(hash, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
281 kputs(samples[i], &s); kputc('\0', &s);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
282 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
283 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
284 if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
285 kh_destroy(str2id, hash);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
286 h = calloc(1, sizeof(bcf_hdr_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
287 *h = *h0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
288 h->ns = 0; h->sns = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
289 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
290 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
291 h->l_smpl = s.l; h->sname = s.s;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
292 bcf_hdr_sync(h);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
293 return h;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
294 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
295
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
296 int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
297 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
298 int i, j;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
299 for (j = 0; j < b->n_gi; ++j) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
300 bcf_ginfo_t *gi = b->gi + j;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
301 uint8_t *swap;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
302 swap = malloc(gi->len * b->n_smpl);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
303 for (i = 0; i < n_smpl; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
304 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
305 free(gi->data);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
306 gi->data = swap;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
307 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
308 b->n_smpl = n_smpl;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
309 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
310 }