comparison tools/human_genome_variation/pagetag.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2
3 """
4 This accepts as input a file of the following format:
5
6 Site Sample Allele1 Allele2
7
8 for example:
9
10 000834 D001 G G
11 000834 D002 G G
12 000834 D003 G G
13 000834 D004 G G
14 000834 D005 N N
15 000834 E001 G G
16 000834 E002 G G
17 000834 E003 G G
18 000834 E004 G G
19 000834 E005 G G
20 000963 D001 T T
21 000963 D002 T T
22 000963 D003 T T
23 000963 D004 T T
24 000963 D005 N N
25 000963 E001 T T
26 000963 E002 N N
27 000963 E003 G T
28 000963 E004 G G
29 000963 E005 G T
30
31 and a rsquare threshold and outputs two files:
32
33 a) a file of input snps (one on each line). A SNP is identified by the "Site"
34 column in the input file
35
36 b) a file where each line has the following:
37 SNP list
38 where SNP is one of the SNPs and the "list" is a comma separated list of SNPs
39 that exceed the rsquare threshold with the first SNP.
40 """
41
42 from sys import argv, stderr, exit
43 from getopt import getopt, GetoptError
44
45 __author__ = "Aakrosh Ratan"
46 __email__ = "ratan@bx.psu.edu"
47
48 # do we want the debug information to be printed?
49 debug_flag = False
50
51 # denote different combos of alleles in code
52 HOMC = str(1)
53 HOMR = str(2)
54 HETE = str(3)
55 OTHER = str(4)
56
57 indexcalculator = {(HOMC,HOMC) : 0,
58 (HOMC,HOMR) : 1,
59 (HOMC,HETE) : 2,
60 (HOMR,HOMC) : 3,
61 (HOMR,HOMR) : 4,
62 (HOMR,HETE) : 5,
63 (HETE,HOMC) : 6,
64 (HETE,HOMR) : 7,
65 (HETE,HETE) : 8}
66
67 def read_inputfile(filename, samples):
68 input = {}
69
70 file = open(filename, "r")
71
72 for line in file:
73 position,sample,allele1,allele2 = line.split()
74
75 # if the user specified a list of samples, then only use those samples
76 if samples != None and sample not in samples: continue
77
78 if position in input:
79 v = input[position]
80 v[sample] = (allele1,allele2)
81 else:
82 v = {sample : (allele1, allele2)}
83 input[position] = v
84
85 file.close()
86 return input
87
88 def annotate_locus(input, minorallelefrequency, snpsfile):
89 locus = {}
90 for k,v in input.items():
91 genotypes = [x for x in v.values()]
92 alleles = [y for x in genotypes for y in x]
93 alleleset = list(set(alleles))
94 alleleset = list(set(alleles) - set(["N","X"]))
95
96 if len(alleleset) == 2:
97 genotypevec = ""
98 num1 = len([x for x in alleles if x == alleleset[0]])
99 num2 = len([x for x in alleles if x == alleleset[1]])
100
101 if num1 > num2:
102 major = alleleset[0]
103 minor = alleleset[1]
104 minorfreq = (num2 * 1.0)/(num1 + num2)
105 else:
106 major = alleleset[1]
107 minor = alleleset[0]
108 minorfreq = (num1 * 1.0)/(num1 + num2)
109
110 if minorfreq < minorallelefrequency: continue
111
112 for gen in genotypes:
113 if gen == (major,major):
114 genotypevec += HOMC
115 elif gen == (minor,minor):
116 genotypevec += HOMR
117 elif gen == (major, minor) or gen == (minor, major):
118 genotypevec += HETE
119 else:
120 genotypevec += OTHER
121
122 locus[k] = genotypevec,minorfreq
123 elif len(alleleset) > 2:
124 print >> snpsfile, k
125 return locus
126
127 def calculateLD(loci, rsqthreshold):
128 snps = list(loci)
129 rsquare = {}
130
131 for index,loc1 in enumerate(snps):
132 for loc2 in snps[index + 1:]:
133 matrix = [0]*9
134
135 vec1 = loci[loc1][0]
136 vec2 = loci[loc2][0]
137
138 for gen in zip(vec1,vec2):
139 if gen[0] == OTHER or gen[1] == OTHER: continue
140 matrix[indexcalculator[gen]] += 1
141
142 n = sum(matrix)
143 x11 = 2*matrix[0] + matrix[2] + matrix[6]
144 x12 = 2*matrix[1] + matrix[2] + matrix[7]
145 x21 = 2*matrix[3] + matrix[6] + matrix[5]
146 x22 = 2*matrix[4] + matrix[6] + matrix[5]
147
148 p = (x11 + x12 + matrix[8] * 1.0) / (2 * n)
149 q = (x11 + x21 + matrix[8] * 1.0) / (2 * n)
150
151 p11 = p * q
152
153 oldp11 = p11
154 range = 0.0
155 converged = False
156 convergentcounter = 0
157 if p11 > 0.0:
158 while converged == False and convergentcounter < 100:
159 if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0:
160 num = matrix[8] * p11 * (1.0 - p - q + p11)
161 den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11)
162 p11 = (x11 + (num/den))/(2.0*n)
163 range = p11/oldp11
164 if range >= 0.9999 and range <= 1.001:
165 converged = True
166 oldp11 = p11
167 convergentcounter += 1
168 else:
169 converged = True
170
171 dvalue = 0.0
172 if converged == True:
173 dvalue = p11 - (p * q)
174
175 if dvalue != 0.0:
176 rsq = (dvalue**2)/(p*q*(1-p)*(1-q))
177 if rsq >= rsqthreshold:
178 rsquare["%s %s" % (loc1,loc2)] = rsq
179
180 return rsquare
181
182 def main(inputfile, snpsfile, neigborhoodfile, \
183 rsquare, minorallelefrequency, samples):
184 # read the input file
185 input = read_inputfile(inputfile, samples)
186 print >> stderr, "Read %d locations" % len(input)
187
188 # open the snpsfile to print
189 file = open(snpsfile, "w")
190
191 # annotate the inputs, remove the abnormal loci (which do not have 2 alleles
192 # and add the major and minor allele to each loci
193 loci = annotate_locus(input, minorallelefrequency, file)
194 print >> stderr, "Read %d interesting locations" % len(loci)
195
196 # print all the interesting loci as candidate snps
197 for k in loci.keys(): print >> file, k
198 file.close()
199 print >> stderr, "Finished creating the snpsfile"
200
201 # calculate the LD values and store it if it exceeds the threshold
202 lds = calculateLD(loci, rsquare)
203 print >> stderr, "Calculated all the LD values"
204
205 # create a list of SNPs
206 snps = {}
207 ldvals = {}
208 for k,v in lds.items():
209 s1,s2 = k.split()
210 if s1 in snps: snps[s1].append(s2)
211 else : snps[s1] = [s2]
212 if s2 in snps: snps[s2].append(s1)
213 else : snps[s2] = [s1]
214
215 if s1 in ldvals: ldvals[s1].append(str(v))
216 else : ldvals[s1] = [str(v)]
217 if s2 in ldvals: ldvals[s2].append(str(v))
218 else : ldvals[s2] = [str(v)]
219
220 # print the snps to the output file
221 file = open(neigborhoodfile, "w")
222
223 for k,v in snps.items():
224 ldv = ldvals[k]
225 if debug_flag == True:
226 print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv))
227 else:
228 print >> file, "%s\t%s" % (k, ",".join(v))
229
230 file.close()
231
232
233 def read_list(filename):
234 file = open(filename, "r")
235 list = {}
236
237 for line in file:
238 list[line.strip()] = 1
239
240 file.close()
241 return list
242
243 def usage():
244 f = stderr
245 print >> f, "usage:"
246 print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt"
247 print >> f, "where input.txt is the prettybase file"
248 print >> f, "where snps.txt is the first output file with the snps"
249 print >> f, "where neighborhood.txt is the output neighborhood file"
250 print >> f, "where the options are:"
251 print >> f, "-h,--help : print usage and quit"
252 print >> f, "-d,--debug: print debug information"
253 print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)"
254 print >> f, "-f,--freq : the minimum MAF required (default: 0.0)"
255 print >> f, "-s,--sample : a list of samples to be clustered"
256
257 if __name__ == "__main__":
258 try:
259 opts, args = getopt(argv[1:], "hds:r:f:",\
260 ["help", "debug", "rsquare=","freq=", "sample="])
261 except GetoptError, err:
262 print str(err)
263 usage()
264 exit(2)
265
266 rsquare = 0.64
267 minorallelefrequency = 0.0
268 samples = None
269
270 for o, a in opts:
271 if o in ("-h", "--help"):
272 usage()
273 exit()
274 elif o in ("-d", "--debug"):
275 debug_flag = True
276 elif o in ("-r", "--rsquare"):
277 rsquare = float(a)
278 elif o in ("-f", "--freq"):
279 minorallelefrequency = float(a)
280 elif o in ("-s", "--sample"):
281 samples = read_list(a)
282 else:
283 assert False, "unhandled option"
284
285 if rsquare < 0.00 or rsquare > 1.00:
286 print >> stderr, "input value of rsquare should be in [0.00, 1.00]"
287 exit(3)
288
289 if minorallelefrequency < 0.0 or minorallelefrequency > 0.5:
290 print >> stderr, "input value of MAF should be (0.00,0.50]"
291 exit(4)
292
293 if len(args) != 3:
294 usage()
295 exit(5)
296
297 main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples)