comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/vcfutils.lua @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #!/usr/bin/env luajit
2
3 -----------------------------------
4 -- BEGIN: routines from klib.lua --
5 -----------------------------------
6
7 -- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt()
8 --[[ Example:
9 for o, a in os.getopt(arg, 'a:b') do
10 print(o, a)
11 end
12 ]]--
13 function os.getopt(args, ostr)
14 local arg, place = nil, 0;
15 return function ()
16 if place == 0 then -- update scanning pointer
17 place = 1
18 if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end
19 if #args[1] >= 2 then
20 place = place + 1
21 if args[1]:sub(2, 2) == '-' then -- found "--"
22 table.remove(args, 1);
23 place = 0
24 return nil;
25 end
26 end
27 end
28 local optopt = place <= #args[1] and args[1]:sub(place, place) or nil
29 place = place + 1;
30 local oli = optopt and ostr:find(optopt) or nil
31 if optopt == ':' or oli == nil then -- unknown option
32 if optopt == '-' then return nil end
33 if place > #args[1] then
34 table.remove(args, 1);
35 place = 0;
36 end
37 return '?';
38 end
39 oli = oli + 1;
40 if ostr:sub(oli, oli) ~= ':' then -- do not need argument
41 arg = nil;
42 if place > #args[1] then
43 table.remove(args, 1);
44 place = 0;
45 end
46 else -- need an argument
47 if place <= #args[1] then -- no white space
48 arg = args[1]:sub(place);
49 else
50 table.remove(args, 1);
51 if #args == 0 then -- an option requiring argument is the last one
52 place = 0;
53 if ostr:sub(1, 1) == ':' then return ':' end
54 return '?';
55 else arg = args[1] end
56 end
57 table.remove(args, 1);
58 place = 0;
59 end
60 return optopt, arg;
61 end
62 end
63
64 -- Description: string split
65 function string:split(sep, n)
66 local a, start = {}, 1;
67 sep = sep or "%s+";
68 repeat
69 local b, e = self:find(sep, start);
70 if b == nil then
71 table.insert(a, self:sub(start));
72 break
73 end
74 a[#a+1] = self:sub(start, b - 1);
75 start = e + 1;
76 if n and #a == n then
77 table.insert(a, self:sub(start));
78 break
79 end
80 until start > #self;
81 return a;
82 end
83
84 -- Description: smart file open
85 function io.xopen(fn, mode)
86 mode = mode or 'r';
87 if fn == nil then return io.stdin;
88 elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout;
89 elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w');
90 elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. fn, 'w');
91 else return io.open(fn, mode) end
92 end
93
94 -- Description: log gamma function
95 -- Required by: math.lbinom()
96 -- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
97 function math.lgamma(z)
98 local x;
99 x = 0.1659470187408462e-06 / (z+7);
100 x = x + 0.9934937113930748e-05 / (z+6);
101 x = x - 0.1385710331296526 / (z+5);
102 x = x + 12.50734324009056 / (z+4);
103 x = x - 176.6150291498386 / (z+3);
104 x = x + 771.3234287757674 / (z+2);
105 x = x - 1259.139216722289 / (z+1);
106 x = x + 676.5203681218835 / z;
107 x = x + 0.9999999999995183;
108 return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5);
109 end
110
111 -- Description: regularized incomplete gamma function
112 -- Dependent on: math.lgamma()
113 --[[
114 Formulas are taken from Wiki, with additional input from Numerical
115 Recipes in C (for modified Lentz's algorithm) and AS245
116 (http://lib.stat.cmu.edu/apstat/245).
117
118 A good online calculator is available at:
119
120 http://www.danielsoper.com/statcalc/calc23.aspx
121
122 It calculates upper incomplete gamma function, which equals
123 math.igamma(s,z,true)*math.exp(math.lgamma(s))
124 ]]--
125 function math.igamma(s, z, complement)
126
127 local function _kf_gammap(s, z)
128 local sum, x = 1, 1;
129 for k = 1, 100 do
130 x = x * z / (s + k);
131 sum = sum + x;
132 if x / sum < 1e-14 then break end
133 end
134 return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) + math.log(sum));
135 end
136
137 local function _kf_gammaq(s, z)
138 local C, D, f, TINY;
139 f = 1. + z - s; C = f; D = 0.; TINY = 1e-290;
140 -- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2
141 for j = 1, 100 do
142 local d;
143 local a, b = j * (s - j), j*2 + 1 + z - s;
144 D = b + a * D;
145 if D < TINY then D = TINY end
146 C = b + a / C;
147 if C < TINY then C = TINY end
148 D = 1. / D;
149 d = C * D;
150 f = f * d;
151 if math.abs(d - 1) < 1e-14 then break end
152 end
153 return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f));
154 end
155
156 if complement then
157 return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z);
158 else
159 return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z));
160 end
161 end
162
163 function math.brent(func, a, b, tol)
164 local gold1, gold2, tiny, max_iter = 1.6180339887, 0.3819660113, 1e-20, 100
165
166 local fa, fb = func(a, data), func(b, data)
167 if fb > fa then -- swap, such that f(a) > f(b)
168 a, b, fa, fb = b, a, fb, fa
169 end
170 local c = b + gold1 * (b - a)
171 local fc = func(c) -- golden section extrapolation
172 while fb > fc do
173 local bound = b + 100.0 * (c - b) -- the farthest point where we want to go
174 local r = (b - a) * (fb - fc)
175 local q = (b - c) * (fb - fa)
176 if math.abs(q - r) < tiny then -- avoid 0 denominator
177 tmp = q > r and tiny or 0.0 - tiny
178 else tmp = q - r end
179 u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp) -- u is the parabolic extrapolation point
180 if (b > u and u > c) or (b < u and u < c) then -- u lies between b and c
181 fu = func(u)
182 if fu < fc then -- (b,u,c) bracket the minimum
183 a, b, fa, fb = b, u, fb, fu
184 break
185 elseif fu > fb then -- (a,b,u) bracket the minimum
186 c, fc = u, fu
187 break
188 end
189 u = c + gold1 * (c - b)
190 fu = func(u) -- golden section extrapolation
191 elseif (c > u and u > bound) or (c < u and u < bound) then -- u lies between c and bound
192 fu = func(u)
193 if fu < fc then -- fb > fc > fu
194 b, c, u = c, u, c + gold1 * (c - b)
195 fb, fc, fu = fc, fu, func(u)
196 else -- (b,c,u) bracket the minimum
197 a, b, c = b, c, u
198 fa, fb, fc = fb, fc, fu
199 break
200 end
201 elseif (u > bound and bound > c) or (u < bound and bound < c) then -- u goes beyond the bound
202 u = bound
203 fu = func(u)
204 else -- u goes the other way around, use golden section extrapolation
205 u = c + gold1 * (c - b)
206 fu = func(u)
207 end
208 a, b, c = b, c, u
209 fa, fb, fc = fb, fc, fu
210 end
211 if a > c then a, c = c, a end -- swap
212
213 -- now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
214 local e, d = 0, 0
215 local w, v, fw, fv
216 w, v = b, b
217 fw, fv = fb, fb
218 for iter = 1, max_iter do
219 local mid = 0.5 * (a + c)
220 local tol1 = tol * math.abs(b) + tiny
221 local tol2 = 2.0 * tol1
222 if math.abs(b - mid) <= tol2 - 0.5 * (c - a) then return fb, b end -- found
223 if math.abs(e) > tol1 then
224 -- related to parabolic interpolation
225 local r = (b - w) * (fb - fv)
226 local q = (b - v) * (fb - fw)
227 local p = (b - v) * q - (b - w) * r
228 q = 2.0 * (q - r)
229 if q > 0.0 then p = 0.0 - p
230 else q = 0.0 - q end
231 eold, e = e, d
232 if math.abs(p) >= math.abs(0.5 * q * eold) or p <= q * (a - b) or p >= q * (c - b) then
233 e = b >= mid and a - b or c - b
234 d = gold2 * e
235 else
236 d, u = p / q, b + d -- actual parabolic interpolation happens here
237 if u - a < tol2 or c - u < tol2 then
238 d = mid > b and tol1 or 0.0 - tol1
239 end
240 end
241 else -- golden section interpolation
242 e = b >= min and a - b or c - b
243 d = gold2 * e
244 end
245 u = fabs(d) >= tol1 and b + d or b + (d > 0.0 and tol1 or -tol1);
246 fu = func(u)
247 if fu <= fb then -- u is the minimum point so far
248 if u >= b then a = b
249 else c = b end
250 v, w, b = w, b, u
251 fv, fw, fb = fw, fb, fu
252 else -- adjust (a,c) and (u,v,w)
253 if u < b then a = u
254 else c = u end
255 if fu <= fw or w == b then
256 v, w = w, u
257 fv, fw = fw, fu
258 elseif fu <= fv or v == b or v == w then
259 v, fv = u, fu;
260 end
261 end
262 end
263 return fb, b
264 end
265
266 matrix = {}
267
268 -- Description: chi^2 test for contingency tables
269 -- Dependent on: math.igamma()
270 function matrix.chi2(a)
271 if #a == 2 and #a[1] == 2 then -- 2x2 table
272 local x, z
273 x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2])
274 if x == 0 then return 0, 1, false end
275 z = a[1][1] * a[2][2] - a[1][2] * a[2][1]
276 z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x
277 return z, math.igamma(.5, .5 * z, true), true
278 else -- generic table
279 local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0
280 for i = 1, n do rs[i] = 0 end
281 for j = 1, m do cs[j] = 0 end
282 for i = 1, n do -- compute column sum and row sum
283 for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end
284 end
285 for i = 1, n do N = N + rs[i] end
286 for i = 1, n do -- compute the chi^2 statistics
287 for j = 1, m do
288 local E = rs[i] * cs[j] / N;
289 z = z + (a[i][j] - E) * (a[i][j] - E) / E
290 end
291 end
292 return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true;
293 end
294 end
295
296 ---------------------------------
297 -- END: routines from klib.lua --
298 ---------------------------------
299
300
301 --------------------------
302 -- BEGIN: misc routines --
303 --------------------------
304
305 -- precompute an array for PL->probability conversion
306 -- @param m maximum PL
307 function algo_init_q2p(m)
308 local q2p = {}
309 for i = 0, m do
310 q2p[i] = math.pow(10, -i / 10)
311 end
312 return q2p
313 end
314
315 -- given the haplotype frequency, compute r^2
316 -- @param f 4 haplotype frequencies; f[] is 0-indexed.
317 -- @return r^2
318 function algo_r2(f)
319 local p = { f[0] + f[1], f[0] + f[2] }
320 local D = f[0] * f[3] - f[1] * f[2]
321 return (p[1] == 0 or p[2] == 0 or 1-p[1] == 0 or 1-p[2] == 0) and 0 or D * D / (p[1] * p[2] * (1 - p[1]) * (1 - p[2]))
322 end
323
324 -- parse a VCF line to get PL
325 -- @param q2p is computed by algo_init_q2p()
326 function text_parse_pl(t, q2p, parse_GT)
327 parse_GT = parse_GT == nil and true or false
328 local ht, gt, pl = {}, {}, {}
329 local s, j0 = t[9]:split(':'), 0
330 for j = 1, #s do
331 if s[j] == 'PL' then j0 = j break end
332 end
333 local has_GT = (s[1] == 'GT' and parse_GT) and true or false
334 for i = 10, #t do
335 if j0 > 0 then
336 local s = t[i]:split(':')
337 local a, b = 1, s[j0]:find(',')
338 pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))]
339 a, b = b + 1, s[j0]:find(',', b + 1)
340 pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))]
341 a, b = b + 1, s[j0]:find(',', b + 1)
342 pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, (b and b - 1) or nil))]
343 end
344 if has_GT then
345 if t[i]:sub(1, 1) ~= '.' then
346 local g = tonumber(t[i]:sub(1, 1)) + tonumber(t[i]:sub(3, 3));
347 gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6
348 gt[#gt - 2 + g] = 1
349 ht[#ht+1] = tonumber(t[i]:sub(1, 1)); ht[#ht+1] = tonumber(t[i]:sub(3, 3));
350 else
351 gt[#gt+1] = 1; gt[#gt+1] = 1; gt[#gt+1] = 1
352 ht[#ht+1] = -1; ht[#ht+1] = -1;
353 end
354 end
355 -- print(t[i], pl[#pl-2], pl[#pl-1], pl[#pl], gt[#gt-2], gt[#gt-1], gt[#gt])
356 end
357 if #pl == 0 then pl = nil end
358 local x = has_GT and { t[1], t[2], ht, gt, pl } or { t[1], t[2], nil, nil, pl }
359 return x
360 end
361
362 -- Infer haplotype frequency
363 -- @param pdg genotype likelihoods P(D|g) generated by text_parse_pl(). pdg[] is 1-indexed.
364 -- @param eps precision [1e-5]
365 -- @return 2-locus haplotype frequencies, 0-indexed array
366 function algo_hapfreq2(pdg, eps)
367 eps = eps or 1e-5
368 local n, f = #pdg[1] / 3, {[0]=0.25, 0.25, 0.25, 0.25}
369 for iter = 1, 100 do
370 local F = {[0]=0, 0, 0, 0}
371 for i = 0, n - 1 do
372 local p1, p2 = {[0]=pdg[1][i*3+1], pdg[1][i*3+2], pdg[1][i*3+3]}, {[0]=pdg[2][i*3+1], pdg[2][i*3+2], pdg[2][i*3+3]}
373 local u = { [0]=
374 f[0] * (f[0] * p1[0] * p2[0] + f[1] * p1[0] * p2[1] + f[2] * p1[1] * p2[0] + f[3] * p1[1] * p2[1]),
375 f[1] * (f[0] * p1[0] * p2[1] + f[1] * p1[0] * p2[2] + f[2] * p1[1] * p2[1] + f[3] * p1[1] * p2[2]),
376 f[2] * (f[0] * p1[1] * p2[0] + f[1] * p1[1] * p2[1] + f[2] * p1[2] * p2[0] + f[3] * p1[2] * p2[1]),
377 f[3] * (f[0] * p1[1] * p2[1] + f[1] * p1[1] * p2[2] + f[2] * p1[2] * p2[1] + f[3] * p1[2] * p2[2])
378 }
379 local s = u[0] + u[1] + u[2] + u[3]
380 s = 1 / (s * n)
381 F[0] = F[0] + u[0] * s
382 F[1] = F[1] + u[1] * s
383 F[2] = F[2] + u[2] * s
384 F[3] = F[3] + u[3] * s
385 end
386 local e = 0
387 for k = 0, 3 do
388 e = math.abs(f[k] - F[k]) > e and math.abs(f[k] - F[k]) or e
389 end
390 for k = 0, 3 do f[k] = F[k] end
391 if e < eps then break end
392 -- print(f[0], f[1], f[2], f[3])
393 end
394 return f
395 end
396
397 ------------------------
398 -- END: misc routines --
399 ------------------------
400
401
402 ---------------------
403 -- BEGIN: commands --
404 ---------------------
405
406 -- CMD vcf2bgl: convert PL tagged VCF to Beagle input --
407 function cmd_vcf2bgl()
408 if #arg == 0 then
409 print("\nUsage: vcf2bgl.lua <in.vcf>")
410 print("\nNB: This command finds PL by matching /(\\d+),(\\d+),(\\d+)/.\n");
411 os.exit(1)
412 end
413
414 local lookup = {}
415 for i = 0, 10000 do lookup[i] = string.format("%.4f", math.pow(10, -i/10)) end
416
417 local fp = io.xopen(arg[1])
418 for l in fp:lines() do
419 if l:sub(1, 2) == '##' then -- meta lines; do nothing
420 elseif l:sub(1, 1) == '#' then -- sample lines
421 local t, s = l:split('\t'), {}
422 for i = 10, #t do s[#s+1] = t[i]; s[#s+1] = t[i]; s[#s+1] = t[i] end
423 print('marker', 'alleleA', 'alleleB', table.concat(s, '\t'))
424 else -- data line
425 local t = l:split('\t');
426 if t[5] ~= '.' and t[5]:find(",") == nil and #t[5] == 1 and #t[4] == 1 then -- biallic SNP
427 local x, z = -1, {};
428 if t[9]:find('PL') then
429 for i = 10, #t do
430 local AA, Aa, aa = t[i]:match('(%d+),(%d+),(%d+)')
431 AA = tonumber(AA); Aa = tonumber(Aa); aa = tonumber(aa);
432 if AA ~= nil then
433 z[#z+1] = lookup[AA]; z[#z+1] = lookup[Aa]; z[#z+1] = lookup[aa];
434 else z[#z+1] = 1; z[#z+1] = 1; z[#z+1] = 1; end
435 end
436 print(t[1]..':'..t[2], t[4], t[5], table.concat(z, '\t'))
437 elseif t[9]:find('GL') then
438 print('Error: not implemented')
439 os.exit(1)
440 end
441 end
442 end
443 end
444 fp:close()
445 end
446
447 -- CMD bgl2vcf: convert Beagle output to VCF
448 function cmd_bgl2vcf()
449 if #arg < 2 then
450 print('Usage: bgl2vcf.lua <in.phased> <in.gprobs>')
451 os.exit(1)
452 end
453
454 local fpp = io.xopen(arg[1]);
455 local fpg = io.xopen(arg[2]);
456 for lg in fpg:lines() do
457 local tp, tg, a = fpp:read():split('%s'), lg:split('%s', 4), {}
458 if tp[1] == 'I' then
459 for i = 3, #tp, 2 do a[#a+1] = tp[i] end
460 print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', table.concat(a, '\t'))
461 else
462 local chr, pos = tg[1]:match('(%S+):(%d+)$')
463 a = {chr, pos, '.', tg[2], tg[3], 30, '.', '.', 'GT'}
464 for i = 3, #tp, 2 do
465 a[#a+1] = ((tp[i] == tg[2] and 0) or 1) .. '|' .. ((tp[i+1] == tg[2] and 0) or 1)
466 end
467 print(table.concat(a, '\t'))
468 end
469 end
470 fpg:close(); fpp:close();
471 end
472
473 -- CMD freq: count alleles in each population
474 function cmd_freq()
475 -- parse the command line
476 local site_only = true; -- print site allele frequency or not
477 for c in os.getopt(arg, 's') do
478 if c == 's' then site_only = false end
479 end
480 if #arg == 0 then
481 print("\nUsage: vcfutils.lua freq [-s] <in.vcf> [samples.txt]\n")
482 print("NB: 1) This command only considers biallelic variants.")
483 print(" 2) Apply '-s' to get the allele frequency spectrum.")
484 print(" 3) 'samples.txt' is TAB-delimited with each line consisting of sample and population.")
485 print("")
486 os.exit(1)
487 end
488
489 -- read the sample-population pairs
490 local pop, sample = {}, {}
491 if #arg > 1 then
492 local fp = io.xopen(arg[2]);
493 for l in fp:lines() do
494 local s, p = l:match("^(%S+)%s+(%S+)"); -- sample, population pair
495 sample[s] = p; -- FIXME: check duplications
496 if pop[p] then table.insert(pop[p], s)
497 else pop[p] = {s} end
498 end
499 fp:close();
500 end
501 pop['NA'] = {}
502
503 -- parse VCF
504 fp = (#arg >= 2 and io.xopen(arg[1])) or io.stdin;
505 local col, cnt = {}, {};
506 for k in pairs(pop) do
507 col[k], cnt[k] = {}, {[0]=0};
508 end
509 for l in fp:lines() do
510 if l:sub(1, 2) == '##' then -- meta lines; do nothing
511 elseif l:sub(1, 1) == '#' then -- the sample line
512 local t, del_NA = l:split('\t'), true;
513 for i = 10, #t do
514 local k = sample[t[i]]
515 if k == nil then
516 k, del_NA = 'NA', false
517 table.insert(pop[k], t[i])
518 end
519 table.insert(col[k], i);
520 table.insert(cnt[k], 0);
521 table.insert(cnt[k], 0);
522 end
523 if del_NA then pop['NA'], col['NA'], cnt['NA'] = nil, nil, nil end
524 else -- data lines
525 local t = l:split('\t');
526 if t[5] ~= '.' and t[5]:find(",") == nil then -- biallic
527 if site_only == true then io.write(t[1], '\t', t[2], '\t', t[4], '\t', t[5]) end
528 for k, v in pairs(col) do
529 local ac, an = 0, 0;
530 for i = 1, #v do
531 local a1, a2 = t[v[i]]:match("^(%d).(%d)");
532 if a1 ~= nil then ac, an = ac + a1 + a2, an + 2 end
533 end
534 if site_only == true then io.write('\t', k, ':', an, ':', ac) end
535 if an == #cnt[k] then cnt[k][ac] = cnt[k][ac] + 1 end
536 end
537 if site_only == true then io.write('\n') end
538 end
539 end
540 end
541 fp:close();
542
543 -- print
544 if site_only == false then
545 for k, v in pairs(cnt) do
546 io.write(k .. "\t" .. #v);
547 for i = 0, #v do io.write("\t" .. v[i]) end
548 io.write('\n');
549 end
550 end
551 end
552
553 function cmd_vcf2chi2()
554 if #arg < 3 then
555 print("Usage: vcfutils.lua vcf2chi2 <in.vcf> <group1.list> <group2.list>");
556 os.exit(1)
557 end
558
559 local g = {};
560
561 -- read the list of groups
562 local fp = io.xopen(arg[2]);
563 for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 1 end -- FIXME: check duplicate
564 fp:close()
565 fp = io.xopen(arg[3]);
566 for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 2 end
567 fp:close()
568
569 -- process VCF
570 fp = io.xopen(arg[1])
571 local h = {{}, {}}
572 for l in fp:lines() do
573 if l:sub(1, 2) == '##' then print(l) -- meta lines; do nothing
574 elseif l:sub(1, 1) == '#' then -- sample lines
575 local t = l:split('\t');
576 for i = 10, #t do
577 if g[t[i]] == 1 then table.insert(h[1], i)
578 elseif g[t[i]] == 2 then table.insert(h[2], i) end
579 end
580 while #t > 8 do table.remove(t) end
581 print(table.concat(t, "\t"))
582 else -- data line
583 local t = l:split('\t');
584 if t[5] ~= '.' and t[5]:find(",") == nil then -- biallic
585 local a = {{0, 0}, {0, 0}}
586 for i = 1, 2 do
587 for _, k in pairs(h[i]) do
588 if t[k]:find("^0.0") then a[i][1] = a[i][1] + 2
589 elseif t[k]:find("^1.1") then a[i][2] = a[i][2] + 2
590 elseif t[k]:find("^0.1") or t[k]:find("^1.0") then
591 a[i][1], a[i][2] = a[i][1] + 1, a[i][2] + 1
592 end
593 end
594 end
595 local chi2, p, succ = matrix.chi2(a);
596 while #t > 8 do table.remove(t) end
597 --print(a[1][1], a[1][2], a[2][1], a[2][2], chi2, p);
598 if succ then print(table.concat(t, "\t") .. ";PCHI2=" .. string.format("%.3g", p)
599 .. string.format(';AF1=%.4g;AF2=%.4g,%.4g', (a[1][2]+a[2][2]) / (a[1][1]+a[1][2]+a[2][1]+a[2][2]),
600 a[1][2]/(a[1][1]+a[1][2]), a[2][2]/(a[2][1]+a[2][2])))
601 else print(table.concat(t, "\t")) end
602 end
603 end
604 end
605 fp:close()
606 end
607
608 -- CMD: compute r^2
609 function cmd_r2()
610 local w, is_ht, is_gt = 1, false, false
611 for o, a in os.getopt(arg, 'w:hg') do
612 if o == 'w' then w = tonumber(a)
613 elseif o == 'h' then is_ht, is_gt = true, true
614 elseif o == 'g' then is_gt = true
615 end
616 end
617 if #arg == 0 then
618 print("Usage: vcfutils.lua r2 [-hg] [-w 1] <in.vcf>")
619 os.exit(1)
620 end
621 local stack, fp, q2p = {}, io.xopen(arg[1]), algo_init_q2p(1023)
622 for l in fp:lines() do
623 if l:sub(1, 1) ~= '#' then
624 local t = l:split('\t')
625 local x = text_parse_pl(t, q2p)
626 if #t[5] == 1 and t[5] ~= '.' then -- biallelic
627 local r2 = {}
628 for k = 1, w do
629 if is_gt == false then -- use PL
630 if stack[k] then
631 local pdg = { stack[k][5], x[5] }
632 r2[#r2+1] = algo_r2(algo_hapfreq2(pdg))
633 else r2[#r2+1] = 0 end
634 elseif is_ht == false then -- use unphased GT
635 if stack[k] then
636 local pdg = { stack[k][4], x[4] }
637 r2[#r2+1] = algo_r2(algo_hapfreq2(pdg))
638 else r2[#r2+1] = 0 end
639 else -- use phased GT
640 if stack[k] then
641 local f, ht = { [0]=0, 0, 0, 0 }, { stack[k][3], x[3] }
642 for i = 1, #ht[1] do
643 local j = ht[1][i] * 2 + ht[2][i]
644 f[j] = f[j] + 1
645 end
646 local sum = f[0] + f[1] + f[2] + f[3]
647 for k = 0, 3 do f[k] = f[k] / sum end
648 r2[#r2+1] = algo_r2(f)
649 else r2[#r2+1] = 0 end
650 end
651 end
652 for k = 1, #r2 do
653 r2[k] = string.format('%.3f', r2[k])
654 end
655 print(x[1], x[2], table.concat(r2, '\t'))
656 if #stack == w then table.remove(stack, 1) end
657 stack[#stack+1] = x
658 end
659 end
660 end
661 fp:close()
662 end
663
664 -------------------
665 -- END: commands --
666 -------------------
667
668
669 -------------------
670 -- MAIN FUNCTION --
671 -------------------
672
673 if #arg == 0 then
674 print("\nUsage: vcfutils.lua <command> <arguments>\n")
675 print("Command: freq count biallelic alleles in each population")
676 print(" r2 compute r^2")
677 print(" vcf2chi2 compute 1-degree chi-square between two groups of samples")
678 print(" vcf2bgl convert PL annotated VCF to Beagle input")
679 print(" bgl2vcf convert Beagle input to VCF")
680 print("")
681 os.exit(1)
682 end
683
684 local cmd = arg[1]
685 table.remove(arg, 1)
686 if cmd == 'vcf2bgl' then cmd_vcf2bgl()
687 elseif cmd == 'bgl2vcf' then cmd_bgl2vcf()
688 elseif cmd == 'freq' then cmd_freq()
689 elseif cmd == 'r2' then cmd_r2()
690 elseif cmd == 'vcf2chi2' then cmd_vcf2chi2()
691 else
692 print('ERROR: unknown command "' .. cmd .. '"')
693 os.exit(1)
694 end