Mercurial > repos > miller-lab > genome_diversity
annotate genome_diversity/src/sweep.c @ 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Tue, 28 May 2013 16:24:19 -0400 |
parents | 95a05c1ef5d5 |
children | fb944979bf35 |
rev | line source |
---|---|
0 | 1 /* sweep -- find regions of the genome with high scores (e.g., Fst scores). |
2 * | |
3 * argv[1] -- file containing a Galaxy table | |
22
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
4 * argv[2] -- column number for the chromosome name (column numbers base-1) |
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
5 * argv[3] -- column number for the chromosomal position |
0 | 6 * argv[4] -- column number for a score for the position |
7 * argv[5] -- a percentage, such as "95", or a raw score, such as "=0.9". | |
8 * argv[6] -- the number of randomizations (shuffles) of the scores | |
9 * argv[7] -- [optional] if present and non-zero, report SNPs | |
10 * | |
11 * The program first determines a threshold such that the stated percentage | |
12 * of the scores are below that threshold (or uses the provided number if | |
13 * argv[5] starts with "="). The program subtracts the threshold | |
14 * from each score, then looks for maximal-scoring runs of SNPs, i.e., where | |
15 * adding or subtracting SNPs from an end of then run always decreases the | |
16 * total score. These regions are printed in order of descreasing total score. | |
17 * To determine a cutoff for the printed regions, the programs takes the maximum | |
18 * score over all regions observed in a specified number of shuffles of the | |
19 * list of scores. If argv[6] = 0, then all maximal-scoring runs of at least | |
20 * 4 table entries are printed. | |
21 | |
22 What it does on Galaxy | |
23 The user selects a SNP table and specifies the columns containing (1) chromosome, (2) position, (3) scores (such as an Fst-value for the SNP), (4) a percentage or raw score for the "cutoff" and (5) the number of times the data should be radomized (only intervals with score exceeding the maximum for the randomized data are reported). If a percentage (e.g. 95%) is specified for #3, then that percentile of the scores is used as the cutoff; this may not work well if many SNPs have the same score. The program subtracts the cutoff from every score, then finds genomic intervals (i.e., consecutive runs of SNPs) whose total score cannot be increased by adding or subtracting one or more SNPs at the ends of the interval. | |
24 */ | |
25 | |
26 #include "lib.h" | |
27 #include "Huang.h" | |
28 | |
29 // maximum number of rows in any processed table | |
30 #define MANY 20000000 | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
31 #define BUF_SIZE 50000 |
0 | 32 #define MAX_WINDOW 1000000 |
33 | |
34 double X[MANY]; // holds all scores | |
35 int nX; | |
36 | |
37 // position-score pairs for a single chromosome | |
38 struct score { | |
39 int pos; | |
40 double x; // original score, then shifted score | |
41 } S[MANY]; | |
42 int nS; | |
43 | |
44 struct snp { | |
45 int pos; | |
46 double x; | |
47 struct snp *next; | |
48 }; | |
49 | |
50 // structure to hold the maximum-scoring chromosomal intervals | |
51 struct sweep { | |
52 float score; | |
53 char *chr; | |
54 int b, e; | |
55 struct snp *snps; | |
56 } W[MAX_WINDOW]; | |
57 int nW; | |
58 | |
59 // return the linked list of SNPs in positions b to e | |
60 struct snp *add_snps(int b, int e) { | |
61 struct snp *first = NULL, *last = NULL, *new; | |
62 int i; | |
63 for (i = b; i <= e; ++i) | |
64 if (S[i].pos >= 0) { | |
65 new = ckalloc(sizeof(*new)); | |
66 new->pos = S[i].pos; | |
67 new->x = S[i].x; | |
68 new->next = NULL; | |
69 if (first == NULL) | |
70 first = new; | |
71 else | |
72 last->next = new; | |
73 last = new; | |
74 } | |
75 return first; | |
76 } | |
77 | |
78 // given a table row, return a pointer to the item in a particular column | |
79 char *get_col(char *buf, int col) { | |
80 static char temp[BUF_SIZE], *p; | |
81 int i; | |
82 char *z = " \t\n"; | |
83 | |
84 strcpy(temp, buf); | |
85 for (p = strtok(temp, z), i = 1; *p && i < col; | |
86 p = strtok(NULL, z), ++i) | |
87 ; | |
88 if (p == NULL) | |
89 fatalf("no column %d in %s", col, buf); | |
90 return p; | |
91 } | |
92 | |
93 // fill S[] with position-score pairs for the next chromosome | |
94 // return 0 for EOF | |
95 int get_chr(FILE *fp, int chr_col, int pos_col, int score_col, char *chr) { | |
96 static char buf[BUF_SIZE]; | |
97 static int init = 1; | |
98 char *status; | |
99 | |
100 if (init) { | |
101 while ((status = fgets(buf, BUF_SIZE, fp)) != NULL && | |
102 buf[0] == '#') | |
103 ; | |
104 if (status == NULL) | |
105 fatal("empty table"); | |
106 init = 0; | |
107 } | |
108 if (buf[0] == '\0') | |
109 return 0; | |
110 | |
111 if (buf[0] == '#') | |
112 fatal("cannot happen"); | |
113 strcpy(chr, get_col(buf, chr_col)); | |
114 S[0].pos = atoi(get_col(buf, pos_col)); | |
22
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
115 if (S[0].pos < 0) |
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
116 fatalf("remove unmapped SNPs (address = -1)"); |
0 | 117 S[0].x = atof(get_col(buf, score_col)); |
118 for (nS = 1; ; ++nS) { | |
119 if (!fgets(buf, BUF_SIZE, fp)) { | |
120 buf[0] = '\0'; | |
121 return 1; | |
122 } | |
123 if (!same_string(chr, get_col(buf, chr_col))) | |
124 break; | |
125 S[nS].pos = atoi(get_col(buf, pos_col)); | |
22
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
126 if (S[nS].pos < 0) |
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
127 fatalf("remove unmapped SNPs (address = -1)"); |
0 | 128 S[nS].x = atof(get_col(buf, score_col)); |
129 } | |
130 return 1; | |
131 } | |
132 | |
133 // for sorting genomic intervals by *decreasing* score | |
134 int Wcompar(struct sweep *a, struct sweep *b) { | |
135 float y = a->score, z = b->score; | |
136 | |
137 if (y > z) | |
138 return -1; | |
139 if (y < z) | |
140 return 1; | |
141 return 0; | |
142 } | |
143 | |
144 // for sorting an array of scores into increasing order | |
145 int fcompar(double *a, double *b) { | |
146 if (*a < *b) | |
147 return -1; | |
148 if (*a > *b) | |
149 return 1; | |
150 return 0; | |
151 } | |
152 | |
153 /* shuffle the values S[0], S[1], ... , S[nscores-1]; | |
154 * Uses Algorithm P in page 125 of "The Art of Computer Programming (Vol II) | |
155 * Seminumerical Programming", by Donald Knuth, Addison-Wesley, 1971. | |
156 */ | |
157 void shuffle_scores() { | |
158 int i, j; | |
159 double temp; | |
160 | |
161 for (i = nX-1; i > 0; --i) { | |
162 // swap what's in location i with location j, where 0 <= j <= i | |
163 j = random() % (i+1); | |
164 temp = X[i]; | |
165 X[i] = X[j]; | |
166 X[j] = temp; | |
167 } | |
168 } | |
169 | |
170 // return the best interval score (R[i] is the struct operated by Huang()) | |
171 double best() { | |
172 int i; | |
173 double bestScore; | |
174 | |
175 Huang(X, nX); | |
176 | |
177 for (bestScore = 0.0, i = 1; i <= top; ++i) | |
178 bestScore = MAX(R[i].Score, bestScore); | |
179 return bestScore; | |
180 } | |
181 | |
182 int main(int argc, char **argv) { | |
183 FILE *fp; | |
184 char buf[BUF_SIZE], chr[100], *a; | |
185 double shift = 0.0, cutoff; | |
186 int i, b, e, chr_col, pos_col, score_col, nshuffle, snps = 0; | |
187 struct snp *s; | |
188 | |
189 if (argc != 7 && argc != 8) | |
22
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
190 fatal("args: table chr_col pos_col score_col threshold randomizations [SNPs]"); |
0 | 191 |
192 // process command-line arguments | |
193 chr_col = atoi(argv[2]); | |
194 pos_col = atoi(argv[3]); | |
195 score_col = atoi(argv[4]); | |
196 a = argv[5]; | |
197 fp = ckopen(argv[1], "r"); | |
198 if (argc == 8) | |
199 snps = atoi(argv[7]); | |
200 if (isdigit(a[0])) { | |
201 for (nX = 0; nX < MANY && fgets(buf, BUF_SIZE, fp); ) { | |
202 if (buf[0] == '#') | |
203 continue; | |
204 X[nX++] = atof(get_col(buf, score_col)); | |
205 } | |
206 if (nX == MANY) | |
207 fatal("Too many rows"); | |
208 qsort((void *)X, (size_t)nX, sizeof(double), | |
209 (const void *)fcompar); | |
210 shift = X[atoi(a)*nX/100]; | |
211 rewind(fp); | |
212 } else if (a[0] == '=') | |
213 shift = atof(a+1); | |
214 | |
215 //fprintf(stderr, "shift = %4.3f\n", shift); | |
216 nshuffle = atoi(argv[6]); | |
217 if (nshuffle == 0) | |
218 cutoff = 0; | |
219 else { | |
220 for (nX = 0; nX < MANY && fgets(buf, BUF_SIZE, fp); ) { | |
221 if (buf[0] == '#') | |
222 continue; | |
223 X[nX++] = atof(get_col(buf, score_col)) - shift; | |
224 } | |
225 if (nX == MANY) | |
226 fatal("Too many rows"); | |
227 for (cutoff = 0.0, i = 0; i < nshuffle; ++i) { | |
228 shuffle_scores(); | |
229 cutoff = MAX(cutoff, best()); | |
230 } | |
231 rewind(fp); | |
232 } | |
233 //fprintf(stderr, "cutoff = %4.3f\n", cutoff); | |
234 | |
235 // loop over chromosomes; | |
236 // start by getting the chromosome's scores | |
237 while (get_chr(fp, chr_col, pos_col, score_col, chr)) { | |
238 // subtract shift from the scores | |
239 for (i = 0; i < nS; ++i) | |
240 X[i] = S[i].x - shift; | |
241 | |
242 // find the maximum=scoring regions | |
243 Huang(X, nS); | |
244 | |
245 // save any regions with >= 4 points and score >= cutoff | |
246 for (i = 0; i <= top; ++i) { | |
247 if (nW >= MAX_WINDOW) | |
248 fatalf("too many windows"); | |
249 | |
250 // get indices of the first and last SNP in the interval | |
251 b = R[i].Lpos + 1; | |
252 e = R[i].Rpos; | |
253 | |
254 // remove unmapped SNP position from intervals' ends | |
255 while (b < e && S[b].pos == -1) | |
256 ++b; | |
257 while (e > b && S[e].pos == -1) | |
258 --e; | |
259 | |
260 // record intervals | |
261 if (e - b < 3 || R[i].Score < cutoff) | |
262 continue; | |
263 W[nW].score = R[i].Score; | |
264 W[nW].chr = copy_string(chr); | |
265 W[nW].b = S[b].pos; | |
266 W[nW].e = S[e].pos+1; // Ws are half-open | |
267 if (snps) | |
268 W[nW].snps = add_snps(b, e); | |
269 ++nW; | |
270 } | |
271 } | |
272 | |
273 // sort by decreasing score | |
274 qsort((void *)W, (size_t)nW, sizeof(W[0]), (const void *)Wcompar); | |
275 | |
276 for (i = 0; i < nW; ++i) { | |
277 printf("%s\t%d\t%d\t%4.4f\n", | |
278 W[i].chr, W[i].b, W[i].e, W[i].score); | |
279 for (s = W[i].snps; s; s = s->next) | |
280 printf(" %d %3.2f\n", s->pos, s->x); | |
281 } | |
282 return 0; | |
283 } |