0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import errno
|
|
4 import os
|
|
5 import shutil
|
|
6 import subprocess
|
|
7 import sys
|
|
8 from BeautifulSoup import BeautifulSoup
|
|
9 import gd_composite
|
|
10
|
|
11 ################################################################################
|
|
12
|
|
13 def mkdir_p(path):
|
|
14 try:
|
|
15 os.makedirs(path)
|
|
16 except OSError, e:
|
|
17 if e.errno <> errno.EEXIST:
|
|
18 raise
|
|
19
|
|
20 ################################################################################
|
|
21
|
|
22 def run_program(prog, args, stdout_file=None):
|
|
23 #print "args: ", ' '.join(args)
|
|
24 p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
25 (stdoutdata, stderrdata) = p.communicate()
|
|
26 rc = p.returncode
|
|
27
|
|
28 if stdout_file is not None:
|
|
29 with open(stdout_file, 'w') as ofh:
|
|
30 print >> ofh, stdoutdata
|
|
31
|
|
32 if rc != 0:
|
|
33 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args))
|
|
34 print >> sys.stderr, stderrdata
|
|
35 sys.exit(1)
|
|
36
|
|
37 ################################################################################
|
|
38
|
|
39 def do_ped2geno(input, output):
|
|
40 lines = []
|
|
41 with open(input) as fh:
|
|
42 for line in fh:
|
|
43 line = line.rstrip('\r\n')
|
|
44 lines.append(line.split())
|
|
45
|
|
46 pair_map = {
|
|
47 '0':{ '0':'9', '1':'9', '2':'9' },
|
|
48 '1':{ '0':'1', '1':'2', '2':'1' },
|
|
49 '2':{ '0':'1', '1':'1', '2':'0' }
|
|
50 }
|
|
51 with open(output, 'w') as ofh:
|
|
52 for a_idx in xrange(6, len(lines[0]), 2):
|
|
53 b_idx = a_idx + 1
|
|
54 print >> ofh, ''.join(map(lambda line: pair_map[line[a_idx]][line[b_idx]], lines))
|
|
55
|
|
56 def do_map2snp(input, output):
|
|
57 with open(output, 'w') as ofh:
|
|
58 with open(input) as fh:
|
|
59 for line in fh:
|
|
60 elems = line.split()
|
|
61 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1])
|
|
62
|
|
63 def make_ind_file(ind_file, input):
|
|
64 pops = []
|
|
65
|
|
66 ofh = open(ind_file, 'w')
|
|
67
|
|
68 with open(input) as fh:
|
|
69 soup = BeautifulSoup(fh)
|
|
70 misc = soup.find('div', {'id': 'gd_misc'})
|
|
71 populations = misc('ul')[0]
|
|
72
|
|
73 i = 0
|
|
74 for entry in populations:
|
|
75 if i % 2 == 1:
|
|
76 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_')
|
|
77 pops.append(population_name)
|
|
78 individuals = entry.ol('li')
|
|
79 for individual in individuals:
|
|
80 individual_name = individual.string.encode('utf8').strip()
|
|
81 print >> ofh, individual_name, 'M', population_name
|
|
82 i += 1
|
|
83
|
|
84 ofh.close()
|
|
85 return pops
|
|
86
|
|
87 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file):
|
|
88 with open(par_file, 'w') as fh:
|
|
89 print >> fh, 'genotypename: {0}'.format(geno_file)
|
|
90 print >> fh, 'snpname: {0}'.format(snp_file)
|
|
91 print >> fh, 'indivname: {0}'.format(ind_file)
|
|
92 print >> fh, 'evecoutname: {0}'.format(evec_file)
|
|
93 print >> fh, 'evaloutname: {0}'.format(eval_file)
|
|
94 print >> fh, 'altnormstyle: NO'
|
|
95 print >> fh, 'numoutevec: 2'
|
|
96
|
|
97 def do_smartpca(par_file):
|
|
98 prog = 'smartpca'
|
|
99
|
|
100 args = [ prog ]
|
|
101 args.append('-p')
|
|
102 args.append(par_file)
|
|
103
|
|
104 #print "args: ", ' '.join(args)
|
|
105 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=subprocess.PIPE, stderr=sys.stderr)
|
|
106 (stdoutdata, stderrdata) = p.communicate()
|
|
107 rc = p.returncode
|
|
108
|
|
109 if rc != 0:
|
|
110 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args))
|
|
111 print >> sys.stderr, stderrdata
|
|
112 sys.exit(1)
|
|
113
|
|
114 stats = []
|
|
115
|
|
116 save_line = False
|
|
117 for line in stdoutdata.split('\n'):
|
|
118 if line.startswith(('## Average divergence', '## Anova statistics', '## Statistical significance')):
|
|
119 stats.append('')
|
|
120 save_line = True
|
|
121 if line.strip() == '':
|
|
122 save_line = False
|
|
123 if save_line:
|
|
124 stats.append(line)
|
|
125
|
|
126 return '\n'.join(stats[1:])
|
|
127
|
|
128 def do_ploteig(evec_file, population_names):
|
|
129 prog = 'gd_ploteig'
|
|
130
|
|
131 args = [ prog ]
|
|
132 args.append('-i')
|
|
133 args.append(evec_file)
|
|
134 args.append('-c')
|
|
135 args.append('1:2')
|
|
136 args.append('-p')
|
|
137 args.append(':'.join(population_names))
|
|
138 args.append('-x')
|
|
139
|
|
140 run_program(None, args)
|
|
141
|
|
142 def do_eval2pct(eval_file, explained_file):
|
|
143 prog = 'eval2pct'
|
|
144
|
|
145 args = [ prog ]
|
|
146 args.append(eval_file)
|
|
147
|
|
148 with open(explained_file, 'w') as ofh:
|
|
149 #print "args:", ' '.join(args)
|
|
150 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE)
|
|
151 (stdoutdata, stderrdata) = p.communicate()
|
|
152 rc = p.returncode
|
|
153
|
|
154 if rc != 0:
|
|
155 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args))
|
|
156 print >> sys.stderr, stderrdata
|
|
157 sys.exit(1)
|
|
158
|
|
159 def do_coords2admix(coords_file):
|
|
160 prog = 'coords2admix'
|
|
161
|
|
162 args = [ prog ]
|
|
163 args.append(coords_file)
|
|
164
|
|
165 with open('fake', 'w') as ofh:
|
|
166 #print "args:", ' '.join(args)
|
|
167 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE)
|
|
168 (stdoutdata, stderrdata) = p.communicate()
|
|
169 rc = p.returncode
|
|
170
|
|
171 if rc != 0:
|
|
172 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args))
|
|
173 print >> sys.stderr, stderrdata
|
|
174 sys.exit(1)
|
|
175
|
|
176 shutil.copy2('fake', coords_file)
|
|
177
|
|
178 ################################################################################
|
|
179
|
|
180 if len(sys.argv) != 5:
|
|
181 print "usage"
|
|
182 sys.exit(1)
|
|
183
|
|
184 input, input_files_path, output, output_files_path = sys.argv[1:5]
|
|
185
|
|
186 mkdir_p(output_files_path)
|
|
187
|
|
188 ped_file = os.path.join(input_files_path, 'admix.ped')
|
|
189 geno_file = os.path.join(output_files_path, 'admix.geno')
|
|
190 do_ped2geno(ped_file, geno_file)
|
|
191
|
|
192 map_file = os.path.join(input_files_path, 'admix.map')
|
|
193 snp_file = os.path.join(output_files_path, 'admix.snp')
|
|
194 do_map2snp(map_file, snp_file)
|
|
195
|
|
196 ind_file = os.path.join(output_files_path, 'admix.ind')
|
|
197 population_names = make_ind_file(ind_file, input)
|
|
198
|
|
199 par_file = os.path.join(output_files_path, 'par.admix')
|
|
200 evec_file = os.path.join(output_files_path, 'coordinates.txt')
|
|
201 eval_file = os.path.join(output_files_path, 'admix.eval')
|
|
202 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file)
|
|
203
|
|
204 smartpca_stats = do_smartpca(par_file)
|
|
205
|
|
206 do_ploteig(evec_file, population_names)
|
|
207 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names))
|
|
208 output_plot_file = os.path.join(output_files_path, 'PCA.pdf')
|
|
209 shutil.copy2(plot_file, output_plot_file)
|
|
210 os.unlink(plot_file)
|
|
211
|
|
212 do_eval2pct(eval_file, os.path.join(output_files_path, 'explained.txt'))
|
|
213 os.unlink(eval_file)
|
|
214
|
|
215 do_coords2admix(evec_file)
|
|
216
|
|
217 ################################################################################
|
|
218
|
|
219 info_page = gd_composite.InfoPage()
|
|
220 info_page.set_title('PCA Galaxy Composite Dataset')
|
|
221
|
|
222 display_file = gd_composite.DisplayFile()
|
|
223 display_value = gd_composite.DisplayValue()
|
|
224
|
|
225 out_pdf = gd_composite.Parameter(name='PCA.pdf', value='PCA.pdf', display_type=display_file)
|
|
226 out_evec = gd_composite.Parameter(name='coordinates.txt', value='coordinates.txt', display_type=display_file)
|
|
227 out_explained = gd_composite.Parameter(name='explained.txt', value='explained.txt', display_type=display_file)
|
|
228
|
|
229 evec_prefix = 'coordinates.txt.1:2.{0}'.format(':'.join(population_names))
|
|
230 ps_file = '{0}.ps'.format(evec_prefix)
|
|
231 xtxt_file = '{0}.xtxt'.format(evec_prefix)
|
|
232
|
|
233 os.unlink(os.path.join(output_files_path, ps_file))
|
|
234 os.unlink(os.path.join(output_files_path, xtxt_file))
|
|
235
|
|
236 info_page.add_output_parameter(out_pdf)
|
|
237 info_page.add_output_parameter(out_evec)
|
|
238 info_page.add_output_parameter(out_explained)
|
|
239
|
|
240 in_admix = gd_composite.Parameter(name='par.admix', value='par.admix', display_type=display_file)
|
|
241 in_geno = gd_composite.Parameter(name='admix.geno', value='admix.geno', display_type=display_file)
|
|
242 in_snp = gd_composite.Parameter(name='admix.snp', value='admix.snp', display_type=display_file)
|
|
243 in_ind = gd_composite.Parameter(name='admix.ind', value='admix.ind', display_type=display_file)
|
|
244
|
|
245 info_page.add_input_parameter(in_admix)
|
|
246 info_page.add_input_parameter(in_geno)
|
|
247 info_page.add_input_parameter(in_snp)
|
|
248 info_page.add_input_parameter(in_ind)
|
|
249
|
|
250 misc_stats = gd_composite.Parameter(description='Stats<p/><pre>\n{0}\n</pre>'.format(smartpca_stats), display_type=display_value)
|
|
251
|
|
252 info_page.add_misc(misc_stats)
|
|
253
|
|
254 with open (output, 'w') as ofh:
|
|
255 print >> ofh, info_page.render()
|
|
256
|
|
257 sys.exit(0)
|
|
258
|