annotate genome_diversity/src/sweep.c @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents 95a05c1ef5d5
children fb944979bf35
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
1 /* sweep -- find regions of the genome with high scores (e.g., Fst scores).
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
2 *
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
3 * argv[1] -- file containing a Galaxy table
22
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
4 * argv[2] -- column number for the chromosome name (column numbers base-1)
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
5 * argv[3] -- column number for the chromosomal position
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
6 * argv[4] -- column number for a score for the position
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
7 * argv[5] -- a percentage, such as "95", or a raw score, such as "=0.9".
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
8 * argv[6] -- the number of randomizations (shuffles) of the scores
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
9 * argv[7] -- [optional] if present and non-zero, report SNPs
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
10 *
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
11 * The program first determines a threshold such that the stated percentage
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
12 * of the scores are below that threshold (or uses the provided number if
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
13 * argv[5] starts with "="). The program subtracts the threshold
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
14 * from each score, then looks for maximal-scoring runs of SNPs, i.e., where
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
15 * adding or subtracting SNPs from an end of then run always decreases the
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
16 * total score. These regions are printed in order of descreasing total score.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
17 * To determine a cutoff for the printed regions, the programs takes the maximum
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
18 * score over all regions observed in a specified number of shuffles of the
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
19 * list of scores. If argv[6] = 0, then all maximal-scoring runs of at least
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
20 * 4 table entries are printed.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
21
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
22 What it does on Galaxy
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
23 The user selects a SNP table and specifies the columns containing (1) chromosome, (2) position, (3) scores (such as an Fst-value for the SNP), (4) a percentage or raw score for the "cutoff" and (5) the number of times the data should be radomized (only intervals with score exceeding the maximum for the randomized data are reported). If a percentage (e.g. 95%) is specified for #3, then that percentile of the scores is used as the cutoff; this may not work well if many SNPs have the same score. The program subtracts the cutoff from every score, then finds genomic intervals (i.e., consecutive runs of SNPs) whose total score cannot be increased by adding or subtracting one or more SNPs at the ends of the interval.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
24 */
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
25
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
26 #include "lib.h"
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
27 #include "Huang.h"
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
28
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
29 // maximum number of rows in any processed table
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
30 #define MANY 20000000
24
248b06e86022 Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents: 22
diff changeset
31 #define BUF_SIZE 50000
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
32 #define MAX_WINDOW 1000000
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
33
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
34 double X[MANY]; // holds all scores
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
35 int nX;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
36
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
37 // position-score pairs for a single chromosome
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
38 struct score {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
39 int pos;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
40 double x; // original score, then shifted score
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
41 } S[MANY];
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
42 int nS;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
43
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
44 struct snp {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
45 int pos;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
46 double x;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
47 struct snp *next;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
48 };
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
49
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
50 // structure to hold the maximum-scoring chromosomal intervals
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
51 struct sweep {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
52 float score;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
53 char *chr;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
54 int b, e;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
55 struct snp *snps;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
56 } W[MAX_WINDOW];
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
57 int nW;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
58
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
59 // return the linked list of SNPs in positions b to e
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
60 struct snp *add_snps(int b, int e) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
61 struct snp *first = NULL, *last = NULL, *new;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
62 int i;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
63 for (i = b; i <= e; ++i)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
64 if (S[i].pos >= 0) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
65 new = ckalloc(sizeof(*new));
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
66 new->pos = S[i].pos;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
67 new->x = S[i].x;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
68 new->next = NULL;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
69 if (first == NULL)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
70 first = new;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
71 else
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
72 last->next = new;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
73 last = new;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
74 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
75 return first;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
76 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
77
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
78 // given a table row, return a pointer to the item in a particular column
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
79 char *get_col(char *buf, int col) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
80 static char temp[BUF_SIZE], *p;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
81 int i;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
82 char *z = " \t\n";
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
83
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
84 strcpy(temp, buf);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
85 for (p = strtok(temp, z), i = 1; *p && i < col;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
86 p = strtok(NULL, z), ++i)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
87 ;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
88 if (p == NULL)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
89 fatalf("no column %d in %s", col, buf);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
90 return p;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
91 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
92
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
93 // fill S[] with position-score pairs for the next chromosome
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
94 // return 0 for EOF
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
95 int get_chr(FILE *fp, int chr_col, int pos_col, int score_col, char *chr) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
96 static char buf[BUF_SIZE];
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
97 static int init = 1;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
98 char *status;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
99
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
100 if (init) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
101 while ((status = fgets(buf, BUF_SIZE, fp)) != NULL &&
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
102 buf[0] == '#')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
103 ;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
104 if (status == NULL)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
105 fatal("empty table");
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
106 init = 0;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
107 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
108 if (buf[0] == '\0')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
109 return 0;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
110
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
111 if (buf[0] == '#')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
112 fatal("cannot happen");
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
113 strcpy(chr, get_col(buf, chr_col));
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
114 S[0].pos = atoi(get_col(buf, pos_col));
22
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
115 if (S[0].pos < 0)
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
116 fatalf("remove unmapped SNPs (address = -1)");
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
117 S[0].x = atof(get_col(buf, score_col));
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
118 for (nS = 1; ; ++nS) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
119 if (!fgets(buf, BUF_SIZE, fp)) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
120 buf[0] = '\0';
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
121 return 1;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
122 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
123 if (!same_string(chr, get_col(buf, chr_col)))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
124 break;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
125 S[nS].pos = atoi(get_col(buf, pos_col));
22
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
126 if (S[nS].pos < 0)
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
127 fatalf("remove unmapped SNPs (address = -1)");
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
128 S[nS].x = atof(get_col(buf, score_col));
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
129 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
130 return 1;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
131 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
132
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
133 // for sorting genomic intervals by *decreasing* score
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
134 int Wcompar(struct sweep *a, struct sweep *b) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
135 float y = a->score, z = b->score;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
136
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
137 if (y > z)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
138 return -1;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
139 if (y < z)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
140 return 1;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
141 return 0;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
142 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
143
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
144 // for sorting an array of scores into increasing order
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
145 int fcompar(double *a, double *b) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
146 if (*a < *b)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
147 return -1;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
148 if (*a > *b)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
149 return 1;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
150 return 0;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
151 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
152
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
153 /* shuffle the values S[0], S[1], ... , S[nscores-1];
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
154 * Uses Algorithm P in page 125 of "The Art of Computer Programming (Vol II)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
155 * Seminumerical Programming", by Donald Knuth, Addison-Wesley, 1971.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
156 */
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
157 void shuffle_scores() {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
158 int i, j;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
159 double temp;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
160
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
161 for (i = nX-1; i > 0; --i) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
162 // swap what's in location i with location j, where 0 <= j <= i
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
163 j = random() % (i+1);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
164 temp = X[i];
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
165 X[i] = X[j];
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
166 X[j] = temp;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
167 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
168 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
169
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
170 // return the best interval score (R[i] is the struct operated by Huang())
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
171 double best() {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
172 int i;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
173 double bestScore;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
174
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
175 Huang(X, nX);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
176
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
177 for (bestScore = 0.0, i = 1; i <= top; ++i)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
178 bestScore = MAX(R[i].Score, bestScore);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
179 return bestScore;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
180 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
181
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
182 int main(int argc, char **argv) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
183 FILE *fp;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
184 char buf[BUF_SIZE], chr[100], *a;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
185 double shift = 0.0, cutoff;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
186 int i, b, e, chr_col, pos_col, score_col, nshuffle, snps = 0;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
187 struct snp *s;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
188
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
189 if (argc != 7 && argc != 8)
22
95a05c1ef5d5 update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
190 fatal("args: table chr_col pos_col score_col threshold randomizations [SNPs]");
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
191
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
192 // process command-line arguments
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
193 chr_col = atoi(argv[2]);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
194 pos_col = atoi(argv[3]);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
195 score_col = atoi(argv[4]);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
196 a = argv[5];
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
197 fp = ckopen(argv[1], "r");
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
198 if (argc == 8)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
199 snps = atoi(argv[7]);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
200 if (isdigit(a[0])) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
201 for (nX = 0; nX < MANY && fgets(buf, BUF_SIZE, fp); ) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
202 if (buf[0] == '#')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
203 continue;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
204 X[nX++] = atof(get_col(buf, score_col));
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
205 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
206 if (nX == MANY)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
207 fatal("Too many rows");
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
208 qsort((void *)X, (size_t)nX, sizeof(double),
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
209 (const void *)fcompar);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
210 shift = X[atoi(a)*nX/100];
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
211 rewind(fp);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
212 } else if (a[0] == '=')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
213 shift = atof(a+1);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
214
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
215 //fprintf(stderr, "shift = %4.3f\n", shift);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
216 nshuffle = atoi(argv[6]);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
217 if (nshuffle == 0)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
218 cutoff = 0;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
219 else {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
220 for (nX = 0; nX < MANY && fgets(buf, BUF_SIZE, fp); ) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
221 if (buf[0] == '#')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
222 continue;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
223 X[nX++] = atof(get_col(buf, score_col)) - shift;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
224 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
225 if (nX == MANY)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
226 fatal("Too many rows");
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
227 for (cutoff = 0.0, i = 0; i < nshuffle; ++i) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
228 shuffle_scores();
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
229 cutoff = MAX(cutoff, best());
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
230 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
231 rewind(fp);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
232 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
233 //fprintf(stderr, "cutoff = %4.3f\n", cutoff);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
234
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
235 // loop over chromosomes;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
236 // start by getting the chromosome's scores
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
237 while (get_chr(fp, chr_col, pos_col, score_col, chr)) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
238 // subtract shift from the scores
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
239 for (i = 0; i < nS; ++i)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
240 X[i] = S[i].x - shift;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
241
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
242 // find the maximum=scoring regions
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
243 Huang(X, nS);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
244
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
245 // save any regions with >= 4 points and score >= cutoff
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
246 for (i = 0; i <= top; ++i) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
247 if (nW >= MAX_WINDOW)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
248 fatalf("too many windows");
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
249
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
250 // get indices of the first and last SNP in the interval
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
251 b = R[i].Lpos + 1;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
252 e = R[i].Rpos;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
253
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
254 // remove unmapped SNP position from intervals' ends
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
255 while (b < e && S[b].pos == -1)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
256 ++b;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
257 while (e > b && S[e].pos == -1)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
258 --e;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
259
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
260 // record intervals
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
261 if (e - b < 3 || R[i].Score < cutoff)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
262 continue;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
263 W[nW].score = R[i].Score;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
264 W[nW].chr = copy_string(chr);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
265 W[nW].b = S[b].pos;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
266 W[nW].e = S[e].pos+1; // Ws are half-open
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
267 if (snps)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
268 W[nW].snps = add_snps(b, e);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
269 ++nW;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
270 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
271 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
272
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
273 // sort by decreasing score
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
274 qsort((void *)W, (size_t)nW, sizeof(W[0]), (const void *)Wcompar);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
275
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
276 for (i = 0; i < nW; ++i) {
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
277 printf("%s\t%d\t%d\t%4.4f\n",
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
278 W[i].chr, W[i].b, W[i].e, W[i].score);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
279 for (s = W[i].snps; s; s = s->next)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
280 printf(" %d %3.2f\n", s->pos, s->x);
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
281 }
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
282 return 0;
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
283 }