annotate genome_diversity/src/aggregate.c @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents 95a05c1ef5d5
children 91e835060ad2
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
22
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
1 /* aggregate -- add four columns (allele counts, genotype, maximum quality) for
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
2 * a specified population to a Galaxy SNP table
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
3 *
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
4 * argv[1] = file containing a Galaxy table
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
5 * argv[2] ... are the starting columns (base-1) for the chosen individuals
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
6
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
7 What it does on Galaxy
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
8 The user specifies that some of the individuals in a gd_snp dataset form a "population", by supplying a list that has been previously created using the Specify Individuals tool. The program appends a new "entity" (set of four columns) to the gd_snp table, analogous to the columns for an individual but containing summary data for the population as a group. These four columns give the total counts for the two alleles, the "genotype" for the population, and the maximum quality value, taken over all individuals in the population. If all defined genotypes in the population are 2 (agree with the reference), then the population's genotype is 2, and similarly for 0; otherwise the genotype is 1 (unless all individuals have undefined genotype, in which case it is -1).
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
9 */
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
10
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
11 #include "lib.h"
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
12
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
13 // most characters allowed in a row of the table
24
248b06e86022 Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents: 22
diff changeset
14 #define MOST 50000
22
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
15
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
16 // column for the relevant individuals/groups
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
17 int col[MOST];
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
18 int nI;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
19
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
20 int main(int argc, char **argv) {
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
21 FILE *fp;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
22 char *p, *z = "\t\n", buf[MOST], trash[MOST];
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
23 int X[MOST], m, i, A, B, G, Q, g;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
24
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
25 if (argc < 3)
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
26 fatalf("args: SNP-table individual1 ...");
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
27
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
28 for (i = 2, nI = 0; i < argc; ++i, ++nI)
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
29 col[nI] = atoi(argv[i]);
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
30
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
31 fp = ckopen(argv[1], "r");
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
32 while (fgets(buf, MOST, fp)) {
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
33 if (buf[0] == '#')
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
34 continue;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
35 strcpy(trash, buf);
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
36 // set X[i] = atoi(i-th word of s), i is base 0
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
37 for (i = 1, p = strtok(trash, z); p != NULL;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
38 ++i, p = strtok(NULL, z))
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
39 X[i] = atoi(p);
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
40 for (i = A = B = Q = 0, G = -1; i < nI; ++i) {
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
41 m = col[i];
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
42 A += X[m];
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
43 B += X[m+1];
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
44 g = X[m+2];
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
45 if (g != -1) {
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
46 if (G == -1) // first time
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
47 G = g;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
48 else if (G != g)
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
49 G = 1;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
50 }
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
51 Q = MAX(Q, X[m+3]);
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
52 }
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
53 if (i < nI) // check bounds on the population's individuals
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
54 continue;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
55 // add columns
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
56 if ((p = strchr(buf, '\n')) != NULL)
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
57 *p = '\0';
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
58 printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q);
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
59 }
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
60
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
61 return 0;
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
62 }