0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 This tool takes the following file pairs as input:
|
|
5 a) input_snp : A file with identifiers for SNPs (one on each line)
|
|
6 b) ldfile : A file where each line has the following
|
|
7 snp list
|
|
8 where "snp" is an identifier for one SNP and the "list" is a
|
|
9 comma separated list of all the other snps that are in LD with
|
|
10 it (as per some threshold of rsquare)
|
|
11
|
|
12 The output is a set of tag SNPs for the given datasets
|
|
13
|
|
14 The algorithm is as follows:
|
|
15
|
|
16 a) Construct a graph for each population, where each node is a SNP and two nodes
|
|
17 are connected using an edge iff they are in LD.
|
|
18 b) For each SNP, count the total number of connected nodes, which have not yet
|
|
19 been visited.
|
|
20 c) Find the SNP with the highest count and assign it to be a tag SNP.
|
|
21 d) Mark that SNP and all the snps connected to it as "visited". This should be
|
|
22 done for each population.
|
|
23 e) Continue steps b-e until all SNPs, in all populations have been visited.
|
|
24 """
|
|
25
|
|
26 from sys import argv, stderr, exit
|
|
27 from getopt import getopt, GetoptError
|
|
28
|
|
29 import os
|
|
30 import heapq
|
|
31
|
|
32 __author__ = "Aakrosh Ratan"
|
|
33 __email__ = "ratan@bx.psu.edu"
|
|
34
|
|
35 # do we want the debug information to be printed?
|
|
36 debug_flag = False
|
|
37
|
|
38 class node:
|
|
39 def __init__(self, name):
|
|
40 self.name = name
|
|
41 self.edges = []
|
|
42 self.visited = False
|
|
43
|
|
44 # return the number of nodes connected to this node, that have yet to be
|
|
45 # visited
|
|
46 def num_not_visited(self):
|
|
47 num = 0
|
|
48 for n in self.edges:
|
|
49 if n.visited == False: num += 1
|
|
50 return num
|
|
51
|
|
52 def __cmp__(self, other):
|
|
53 return other.num_not_visited() - self.num_not_visited()
|
|
54
|
|
55 def __str__(self):
|
|
56 return self.name
|
|
57
|
|
58 class graph:
|
|
59 def __init__(self):
|
|
60 self.nodes = {}
|
|
61
|
|
62 def __str__(self):
|
|
63 string = ""
|
|
64 for n1 in self.nodes.values():
|
|
65 n2s = [x.name for x in n1.edges]
|
|
66 string += "%s %s\n" % (n1.name, ",".join(n2s))
|
|
67 return string[:-1]
|
|
68
|
|
69 def add_node(self, n):
|
|
70 self.nodes[n.name] = n
|
|
71
|
|
72 def add_edges(self, n1, n2):
|
|
73 assert n1.name in self.nodes
|
|
74 assert n2.name in self.nodes
|
|
75 n1.edges.append(n2)
|
|
76 n2.edges.append(n1)
|
|
77
|
|
78 def check_graph(self):
|
|
79 for n in self.nodes.values():
|
|
80 ms = [x for x in n.edges]
|
|
81 for m in ms:
|
|
82 if n not in m.edges:
|
|
83 print >> stderr, "check : %s - %s" % (n,m)
|
|
84
|
|
85 def construct_graph(ldfile, snpfile):
|
|
86 # construct the initial graph. add all the SNPs as nodes
|
|
87 g = graph()
|
|
88 file = open(snpfile, "r")
|
|
89
|
|
90 for line in file:
|
|
91 # ignore empty lines and add the remainder to the graph
|
|
92 if len(line.strip()) == 0: continue
|
|
93 n = node(line.strip())
|
|
94 g.add_node(n)
|
|
95
|
|
96 file.close()
|
|
97 print >> stderr, "Added %d nodes to a graph" % len(g.nodes)
|
|
98
|
|
99 # now add all the edges
|
|
100 file = open(ldfile, "r")
|
|
101
|
|
102 for line in file:
|
|
103 tokens = line.split()
|
|
104 assert len(tokens) == 2
|
|
105
|
|
106 # if this node is in the graph, then we need to construct an edge from
|
|
107 # this node to all the nodes which are highly related to it
|
|
108 if tokens[0] in g.nodes:
|
|
109 n1 = g.nodes[tokens[0]]
|
|
110 n2s = [g.nodes[x] for x in tokens[1].split(",")]
|
|
111
|
|
112 for n2 in n2s:
|
|
113 g.add_edges(n1, n2)
|
|
114
|
|
115 file.close()
|
|
116 print >> stderr, "Added all edges to the graph"
|
|
117
|
|
118 return g
|
|
119
|
|
120 def check_output(g, tagsnps):
|
|
121 # find all the nodes in the graph
|
|
122 allsnps = [x.name for x in g.nodes.values()]
|
|
123
|
|
124 # find the nodes that are covered by our tagsnps
|
|
125 mysnps = [x.name for x in tagsnps]
|
|
126
|
|
127 for n in tagsnps:
|
|
128 for m in n.edges:
|
|
129 mysnps.append(m.name)
|
|
130
|
|
131 mysnps = list(set(mysnps))
|
|
132
|
|
133 if set(allsnps) != set(mysnps):
|
|
134 diff = list(set(allsnps) - set(mysnps))
|
|
135 print >> stderr, "%s are not covered" % ",".join(diff)
|
|
136
|
|
137 def main(ldfile, snpsfile, required, excluded):
|
|
138 # construct the graph
|
|
139 g = construct_graph(ldfile, snpsfile)
|
|
140 if debug_flag == True: g.check_graph()
|
|
141
|
|
142 tagsnps = []
|
|
143 neighbors = {}
|
|
144
|
|
145 # take care of the SNPs that are required to be TagSNPs
|
|
146 for s in required:
|
|
147 t = g.nodes[s]
|
|
148
|
|
149 t.visited = True
|
|
150 ns = []
|
|
151
|
|
152 for n in t.edges:
|
|
153 if n.visited == False: ns.append(n.name)
|
|
154 n.visited = True
|
|
155
|
|
156 tagsnps.append(t)
|
|
157 neighbors[t.name] = list(set(ns))
|
|
158
|
|
159 # find the tag SNPs for this graph
|
|
160 data = [x for x in g.nodes.values()]
|
|
161 heapq.heapify(data)
|
|
162
|
|
163 while data:
|
|
164 s = heapq.heappop(data)
|
|
165
|
|
166 if s.visited == True or s.name in excluded: continue
|
|
167
|
|
168 s.visited = True
|
|
169 ns = []
|
|
170
|
|
171 for n in s.edges:
|
|
172 if n.visited == False: ns.append(n.name)
|
|
173 n.visited = True
|
|
174
|
|
175 tagsnps.append(s)
|
|
176 neighbors[s.name] = list(set(ns))
|
|
177
|
|
178 heapq.heapify(data)
|
|
179
|
|
180 for s in tagsnps:
|
|
181 if len(neighbors[s.name]) > 0:
|
|
182 print "%s\t%s" % (s, ",".join(neighbors[s.name]))
|
|
183 continue
|
|
184 print s
|
|
185
|
|
186 if debug_flag == True: check_output(g, tagsnps)
|
|
187
|
|
188 def read_list(filename):
|
|
189 assert os.path.exists(filename) == True
|
|
190 file = open(filename, "r")
|
|
191 list = {}
|
|
192
|
|
193 for line in file:
|
|
194 list[line.strip()] = 1
|
|
195
|
|
196 file.close()
|
|
197 return list
|
|
198
|
|
199 def usage():
|
|
200 f = stderr
|
|
201 print >> f, "usage:"
|
|
202 print >> f, "senatag [options] neighborhood.txt inputsnps.txt"
|
|
203 print >> f, "where inputsnps.txt is a file of snps from one population"
|
|
204 print >> f, "where neighborhood.txt is neighborhood details for the pop."
|
|
205 print >> f, "where the options are:"
|
|
206 print >> f, "-h,--help : print usage and quit"
|
|
207 print >> f, "-d,--debug: print debug information"
|
|
208 print >> f, "-e,--excluded : file with names of SNPs that cannot be TagSNPs"
|
|
209 print >> f, "-r,--required : file with names of SNPs that should be TagSNPs"
|
|
210
|
|
211 if __name__ == "__main__":
|
|
212 try:
|
|
213 opts, args = getopt(argv[1:], "hdr:e:",\
|
|
214 ["help", "debug", "required=", "excluded="])
|
|
215 except GetoptError, err:
|
|
216 print str(err)
|
|
217 usage()
|
|
218 exit(2)
|
|
219
|
|
220 required = {}
|
|
221 excluded = {}
|
|
222
|
|
223 for o, a in opts:
|
|
224 if o in ("-h", "--help"):
|
|
225 usage()
|
|
226 exit()
|
|
227 elif o in ("-d", "--debug"):
|
|
228 debug_flag = True
|
|
229 elif o in ("-r", "--required"):
|
|
230 required = read_list(a)
|
|
231 elif o in ("-e", "--excluded"):
|
|
232 excluded = read_list(a)
|
|
233 else:
|
|
234 assert False, "unhandled option"
|
|
235
|
|
236 if len(args) != 2:
|
|
237 usage()
|
|
238 exit(3)
|
|
239
|
|
240 assert os.path.exists(args[0]) == True
|
|
241 assert os.path.exists(args[1]) == True
|
|
242
|
|
243 main(args[0], args[1], required, excluded)
|