Mercurial > repos > xuebing > sharplabtool
comparison tools/human_genome_variation/pagetag.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 This accepts as input a file of the following format: | |
5 | |
6 Site Sample Allele1 Allele2 | |
7 | |
8 for example: | |
9 | |
10 000834 D001 G G | |
11 000834 D002 G G | |
12 000834 D003 G G | |
13 000834 D004 G G | |
14 000834 D005 N N | |
15 000834 E001 G G | |
16 000834 E002 G G | |
17 000834 E003 G G | |
18 000834 E004 G G | |
19 000834 E005 G G | |
20 000963 D001 T T | |
21 000963 D002 T T | |
22 000963 D003 T T | |
23 000963 D004 T T | |
24 000963 D005 N N | |
25 000963 E001 T T | |
26 000963 E002 N N | |
27 000963 E003 G T | |
28 000963 E004 G G | |
29 000963 E005 G T | |
30 | |
31 and a rsquare threshold and outputs two files: | |
32 | |
33 a) a file of input snps (one on each line). A SNP is identified by the "Site" | |
34 column in the input file | |
35 | |
36 b) a file where each line has the following: | |
37 SNP list | |
38 where SNP is one of the SNPs and the "list" is a comma separated list of SNPs | |
39 that exceed the rsquare threshold with the first SNP. | |
40 """ | |
41 | |
42 from sys import argv, stderr, exit | |
43 from getopt import getopt, GetoptError | |
44 | |
45 __author__ = "Aakrosh Ratan" | |
46 __email__ = "ratan@bx.psu.edu" | |
47 | |
48 # do we want the debug information to be printed? | |
49 debug_flag = False | |
50 | |
51 # denote different combos of alleles in code | |
52 HOMC = str(1) | |
53 HOMR = str(2) | |
54 HETE = str(3) | |
55 OTHER = str(4) | |
56 | |
57 indexcalculator = {(HOMC,HOMC) : 0, | |
58 (HOMC,HOMR) : 1, | |
59 (HOMC,HETE) : 2, | |
60 (HOMR,HOMC) : 3, | |
61 (HOMR,HOMR) : 4, | |
62 (HOMR,HETE) : 5, | |
63 (HETE,HOMC) : 6, | |
64 (HETE,HOMR) : 7, | |
65 (HETE,HETE) : 8} | |
66 | |
67 def read_inputfile(filename, samples): | |
68 input = {} | |
69 | |
70 file = open(filename, "r") | |
71 | |
72 for line in file: | |
73 position,sample,allele1,allele2 = line.split() | |
74 | |
75 # if the user specified a list of samples, then only use those samples | |
76 if samples != None and sample not in samples: continue | |
77 | |
78 if position in input: | |
79 v = input[position] | |
80 v[sample] = (allele1,allele2) | |
81 else: | |
82 v = {sample : (allele1, allele2)} | |
83 input[position] = v | |
84 | |
85 file.close() | |
86 return input | |
87 | |
88 def annotate_locus(input, minorallelefrequency, snpsfile): | |
89 locus = {} | |
90 for k,v in input.items(): | |
91 genotypes = [x for x in v.values()] | |
92 alleles = [y for x in genotypes for y in x] | |
93 alleleset = list(set(alleles)) | |
94 alleleset = list(set(alleles) - set(["N","X"])) | |
95 | |
96 if len(alleleset) == 2: | |
97 genotypevec = "" | |
98 num1 = len([x for x in alleles if x == alleleset[0]]) | |
99 num2 = len([x for x in alleles if x == alleleset[1]]) | |
100 | |
101 if num1 > num2: | |
102 major = alleleset[0] | |
103 minor = alleleset[1] | |
104 minorfreq = (num2 * 1.0)/(num1 + num2) | |
105 else: | |
106 major = alleleset[1] | |
107 minor = alleleset[0] | |
108 minorfreq = (num1 * 1.0)/(num1 + num2) | |
109 | |
110 if minorfreq < minorallelefrequency: continue | |
111 | |
112 for gen in genotypes: | |
113 if gen == (major,major): | |
114 genotypevec += HOMC | |
115 elif gen == (minor,minor): | |
116 genotypevec += HOMR | |
117 elif gen == (major, minor) or gen == (minor, major): | |
118 genotypevec += HETE | |
119 else: | |
120 genotypevec += OTHER | |
121 | |
122 locus[k] = genotypevec,minorfreq | |
123 elif len(alleleset) > 2: | |
124 print >> snpsfile, k | |
125 return locus | |
126 | |
127 def calculateLD(loci, rsqthreshold): | |
128 snps = list(loci) | |
129 rsquare = {} | |
130 | |
131 for index,loc1 in enumerate(snps): | |
132 for loc2 in snps[index + 1:]: | |
133 matrix = [0]*9 | |
134 | |
135 vec1 = loci[loc1][0] | |
136 vec2 = loci[loc2][0] | |
137 | |
138 for gen in zip(vec1,vec2): | |
139 if gen[0] == OTHER or gen[1] == OTHER: continue | |
140 matrix[indexcalculator[gen]] += 1 | |
141 | |
142 n = sum(matrix) | |
143 x11 = 2*matrix[0] + matrix[2] + matrix[6] | |
144 x12 = 2*matrix[1] + matrix[2] + matrix[7] | |
145 x21 = 2*matrix[3] + matrix[6] + matrix[5] | |
146 x22 = 2*matrix[4] + matrix[6] + matrix[5] | |
147 | |
148 p = (x11 + x12 + matrix[8] * 1.0) / (2 * n) | |
149 q = (x11 + x21 + matrix[8] * 1.0) / (2 * n) | |
150 | |
151 p11 = p * q | |
152 | |
153 oldp11 = p11 | |
154 range = 0.0 | |
155 converged = False | |
156 convergentcounter = 0 | |
157 if p11 > 0.0: | |
158 while converged == False and convergentcounter < 100: | |
159 if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0: | |
160 num = matrix[8] * p11 * (1.0 - p - q + p11) | |
161 den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11) | |
162 p11 = (x11 + (num/den))/(2.0*n) | |
163 range = p11/oldp11 | |
164 if range >= 0.9999 and range <= 1.001: | |
165 converged = True | |
166 oldp11 = p11 | |
167 convergentcounter += 1 | |
168 else: | |
169 converged = True | |
170 | |
171 dvalue = 0.0 | |
172 if converged == True: | |
173 dvalue = p11 - (p * q) | |
174 | |
175 if dvalue != 0.0: | |
176 rsq = (dvalue**2)/(p*q*(1-p)*(1-q)) | |
177 if rsq >= rsqthreshold: | |
178 rsquare["%s %s" % (loc1,loc2)] = rsq | |
179 | |
180 return rsquare | |
181 | |
182 def main(inputfile, snpsfile, neigborhoodfile, \ | |
183 rsquare, minorallelefrequency, samples): | |
184 # read the input file | |
185 input = read_inputfile(inputfile, samples) | |
186 print >> stderr, "Read %d locations" % len(input) | |
187 | |
188 # open the snpsfile to print | |
189 file = open(snpsfile, "w") | |
190 | |
191 # annotate the inputs, remove the abnormal loci (which do not have 2 alleles | |
192 # and add the major and minor allele to each loci | |
193 loci = annotate_locus(input, minorallelefrequency, file) | |
194 print >> stderr, "Read %d interesting locations" % len(loci) | |
195 | |
196 # print all the interesting loci as candidate snps | |
197 for k in loci.keys(): print >> file, k | |
198 file.close() | |
199 print >> stderr, "Finished creating the snpsfile" | |
200 | |
201 # calculate the LD values and store it if it exceeds the threshold | |
202 lds = calculateLD(loci, rsquare) | |
203 print >> stderr, "Calculated all the LD values" | |
204 | |
205 # create a list of SNPs | |
206 snps = {} | |
207 ldvals = {} | |
208 for k,v in lds.items(): | |
209 s1,s2 = k.split() | |
210 if s1 in snps: snps[s1].append(s2) | |
211 else : snps[s1] = [s2] | |
212 if s2 in snps: snps[s2].append(s1) | |
213 else : snps[s2] = [s1] | |
214 | |
215 if s1 in ldvals: ldvals[s1].append(str(v)) | |
216 else : ldvals[s1] = [str(v)] | |
217 if s2 in ldvals: ldvals[s2].append(str(v)) | |
218 else : ldvals[s2] = [str(v)] | |
219 | |
220 # print the snps to the output file | |
221 file = open(neigborhoodfile, "w") | |
222 | |
223 for k,v in snps.items(): | |
224 ldv = ldvals[k] | |
225 if debug_flag == True: | |
226 print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv)) | |
227 else: | |
228 print >> file, "%s\t%s" % (k, ",".join(v)) | |
229 | |
230 file.close() | |
231 | |
232 | |
233 def read_list(filename): | |
234 file = open(filename, "r") | |
235 list = {} | |
236 | |
237 for line in file: | |
238 list[line.strip()] = 1 | |
239 | |
240 file.close() | |
241 return list | |
242 | |
243 def usage(): | |
244 f = stderr | |
245 print >> f, "usage:" | |
246 print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt" | |
247 print >> f, "where input.txt is the prettybase file" | |
248 print >> f, "where snps.txt is the first output file with the snps" | |
249 print >> f, "where neighborhood.txt is the output neighborhood file" | |
250 print >> f, "where the options are:" | |
251 print >> f, "-h,--help : print usage and quit" | |
252 print >> f, "-d,--debug: print debug information" | |
253 print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)" | |
254 print >> f, "-f,--freq : the minimum MAF required (default: 0.0)" | |
255 print >> f, "-s,--sample : a list of samples to be clustered" | |
256 | |
257 if __name__ == "__main__": | |
258 try: | |
259 opts, args = getopt(argv[1:], "hds:r:f:",\ | |
260 ["help", "debug", "rsquare=","freq=", "sample="]) | |
261 except GetoptError, err: | |
262 print str(err) | |
263 usage() | |
264 exit(2) | |
265 | |
266 rsquare = 0.64 | |
267 minorallelefrequency = 0.0 | |
268 samples = None | |
269 | |
270 for o, a in opts: | |
271 if o in ("-h", "--help"): | |
272 usage() | |
273 exit() | |
274 elif o in ("-d", "--debug"): | |
275 debug_flag = True | |
276 elif o in ("-r", "--rsquare"): | |
277 rsquare = float(a) | |
278 elif o in ("-f", "--freq"): | |
279 minorallelefrequency = float(a) | |
280 elif o in ("-s", "--sample"): | |
281 samples = read_list(a) | |
282 else: | |
283 assert False, "unhandled option" | |
284 | |
285 if rsquare < 0.00 or rsquare > 1.00: | |
286 print >> stderr, "input value of rsquare should be in [0.00, 1.00]" | |
287 exit(3) | |
288 | |
289 if minorallelefrequency < 0.0 or minorallelefrequency > 0.5: | |
290 print >> stderr, "input value of MAF should be (0.00,0.50]" | |
291 exit(4) | |
292 | |
293 if len(args) != 3: | |
294 usage() | |
295 exit(5) | |
296 | |
297 main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples) |