Mercurial > repos > miller-lab > genome_diversity
annotate pca.py @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:25:27 -0400 |
parents | 8997f2ca8c7a |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
3 import gd_util |
0 | 4 import os |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
5 import re |
0 | 6 import shutil |
7 import sys | |
8 from BeautifulSoup import BeautifulSoup | |
9 import gd_composite | |
10 | |
11 ################################################################################ | |
12 | |
13 def do_ped2geno(input, output): | |
14 lines = [] | |
15 with open(input) as fh: | |
16 for line in fh: | |
17 line = line.rstrip('\r\n') | |
18 lines.append(line.split()) | |
19 | |
20 pair_map = { | |
21 '0':{ '0':'9', '1':'9', '2':'9' }, | |
22 '1':{ '0':'1', '1':'2', '2':'1' }, | |
23 '2':{ '0':'1', '1':'1', '2':'0' } | |
24 } | |
25 with open(output, 'w') as ofh: | |
26 for a_idx in xrange(6, len(lines[0]), 2): | |
27 b_idx = a_idx + 1 | |
28 print >> ofh, ''.join(map(lambda line: pair_map[line[a_idx]][line[b_idx]], lines)) | |
29 | |
30 def do_map2snp(input, output): | |
31 with open(output, 'w') as ofh: | |
32 with open(input) as fh: | |
33 for line in fh: | |
34 elems = line.split() | |
35 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1]) | |
36 | |
37 def make_ind_file(ind_file, input): | |
38 pops = [] | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
39 name_map = [] |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
40 name_idx = 0 |
0 | 41 |
42 ofh = open(ind_file, 'w') | |
43 | |
44 with open(input) as fh: | |
45 soup = BeautifulSoup(fh) | |
46 misc = soup.find('div', {'id': 'gd_misc'}) | |
47 populations = misc('ul')[0] | |
48 | |
49 i = 0 | |
50 for entry in populations: | |
51 if i % 2 == 1: | |
52 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_') | |
53 pops.append(population_name) | |
54 individuals = entry.ol('li') | |
55 for individual in individuals: | |
56 individual_name = individual.string.encode('utf8').strip() | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
57 name_map.append(individual_name) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
58 print >> ofh, 'ind_%s' % name_idx, 'M', population_name |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
59 name_idx += 1 |
0 | 60 i += 1 |
61 | |
62 ofh.close() | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
63 return pops, name_map |
0 | 64 |
65 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): | |
66 with open(par_file, 'w') as fh: | |
67 print >> fh, 'genotypename: {0}'.format(geno_file) | |
68 print >> fh, 'snpname: {0}'.format(snp_file) | |
69 print >> fh, 'indivname: {0}'.format(ind_file) | |
70 print >> fh, 'evecoutname: {0}'.format(evec_file) | |
71 print >> fh, 'evaloutname: {0}'.format(eval_file) | |
72 print >> fh, 'altnormstyle: NO' | |
73 print >> fh, 'numoutevec: 2' | |
74 | |
75 def do_smartpca(par_file): | |
76 prog = 'smartpca' | |
77 | |
78 args = [ prog ] | |
79 args.append('-p') | |
80 args.append(par_file) | |
81 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
82 stdoutdata, stderrdata = gd_util.run_program(prog, args) |
0 | 83 |
84 stats = [] | |
85 | |
86 save_line = False | |
87 for line in stdoutdata.split('\n'): | |
88 if line.startswith(('## Average divergence', '## Anova statistics', '## Statistical significance')): | |
89 stats.append('') | |
90 save_line = True | |
91 if line.strip() == '': | |
92 save_line = False | |
93 if save_line: | |
94 stats.append(line) | |
95 | |
96 return '\n'.join(stats[1:]) | |
97 | |
98 def do_ploteig(evec_file, population_names): | |
99 prog = 'gd_ploteig' | |
100 | |
101 args = [ prog ] | |
102 args.append('-i') | |
103 args.append(evec_file) | |
104 args.append('-c') | |
105 args.append('1:2') | |
106 args.append('-p') | |
107 args.append(':'.join(population_names)) | |
108 args.append('-x') | |
109 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
110 gd_util.run_program(prog, args) |
0 | 111 |
112 def do_eval2pct(eval_file, explained_file): | |
113 prog = 'eval2pct' | |
114 | |
115 args = [ prog ] | |
116 args.append(eval_file) | |
117 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
118 with open(explained_file, 'w') as fh: |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
119 gd_util.run_program(prog, args, stdout=fh) |
0 | 120 |
121 def do_coords2admix(coords_file): | |
122 prog = 'coords2admix' | |
123 | |
124 args = [ prog ] | |
125 args.append(coords_file) | |
126 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
127 with open('fake', 'w') as fh: |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
128 gd_util.run_program(prog, args, stdout=fh) |
0 | 129 |
130 shutil.copy2('fake', coords_file) | |
131 | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
132 ind_regex = re.compile('ind_([0-9]+)') |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
133 |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
134 def fix_names(name_map, files): |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
135 for file in files: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
136 tmp_filename = '%s.tmp' % file |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
137 with open(tmp_filename, 'w') as ofh: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
138 with open(file) as fh: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
139 for line in fh: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
140 line = line.rstrip('\r\n') |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
141 match = ind_regex.search(line) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
142 if match: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
143 idx = int(match.group(1)) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
144 old = 'ind_%s' % idx |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
145 new = name_map[idx].replace(' ', '_') |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
146 line = line.replace(old, new) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
147 print >> ofh, line |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
148 |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
149 shutil.copy2(tmp_filename, file) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
150 os.unlink(tmp_filename) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
151 |
0 | 152 ################################################################################ |
153 | |
154 if len(sys.argv) != 5: | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
155 gd_util.die('Usage') |
0 | 156 |
157 input, input_files_path, output, output_files_path = sys.argv[1:5] | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
158 gd_util.mkdir_p(output_files_path) |
0 | 159 |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
160 ################################################################################ |
0 | 161 |
162 ped_file = os.path.join(input_files_path, 'admix.ped') | |
163 geno_file = os.path.join(output_files_path, 'admix.geno') | |
164 do_ped2geno(ped_file, geno_file) | |
165 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
166 ################################################################################ |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
167 |
0 | 168 map_file = os.path.join(input_files_path, 'admix.map') |
169 snp_file = os.path.join(output_files_path, 'admix.snp') | |
170 do_map2snp(map_file, snp_file) | |
171 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
172 ################################################################################ |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
173 |
0 | 174 ind_file = os.path.join(output_files_path, 'admix.ind') |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
175 population_names, name_map = make_ind_file(ind_file, input) |
0 | 176 |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
177 ################################################################################ |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
178 |
0 | 179 par_file = os.path.join(output_files_path, 'par.admix') |
180 evec_file = os.path.join(output_files_path, 'coordinates.txt') | |
181 eval_file = os.path.join(output_files_path, 'admix.eval') | |
182 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) | |
183 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
184 ################################################################################ |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
185 |
0 | 186 smartpca_stats = do_smartpca(par_file) |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
187 fix_names(name_map, [ind_file, evec_file]) |
0 | 188 |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
189 ################################################################################ |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
190 |
0 | 191 do_ploteig(evec_file, population_names) |
192 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names)) | |
193 output_plot_file = os.path.join(output_files_path, 'PCA.pdf') | |
194 shutil.copy2(plot_file, output_plot_file) | |
195 os.unlink(plot_file) | |
196 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
197 ################################################################################ |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
198 |
0 | 199 do_eval2pct(eval_file, os.path.join(output_files_path, 'explained.txt')) |
200 os.unlink(eval_file) | |
201 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
202 ################################################################################ |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
203 |
0 | 204 do_coords2admix(evec_file) |
205 | |
206 ################################################################################ | |
207 | |
208 info_page = gd_composite.InfoPage() | |
209 info_page.set_title('PCA Galaxy Composite Dataset') | |
210 | |
211 display_file = gd_composite.DisplayFile() | |
212 display_value = gd_composite.DisplayValue() | |
213 | |
214 out_pdf = gd_composite.Parameter(name='PCA.pdf', value='PCA.pdf', display_type=display_file) | |
215 out_evec = gd_composite.Parameter(name='coordinates.txt', value='coordinates.txt', display_type=display_file) | |
216 out_explained = gd_composite.Parameter(name='explained.txt', value='explained.txt', display_type=display_file) | |
217 | |
218 evec_prefix = 'coordinates.txt.1:2.{0}'.format(':'.join(population_names)) | |
219 ps_file = '{0}.ps'.format(evec_prefix) | |
220 xtxt_file = '{0}.xtxt'.format(evec_prefix) | |
221 | |
222 os.unlink(os.path.join(output_files_path, ps_file)) | |
223 os.unlink(os.path.join(output_files_path, xtxt_file)) | |
224 | |
225 info_page.add_output_parameter(out_pdf) | |
226 info_page.add_output_parameter(out_evec) | |
227 info_page.add_output_parameter(out_explained) | |
228 | |
229 in_admix = gd_composite.Parameter(name='par.admix', value='par.admix', display_type=display_file) | |
230 in_geno = gd_composite.Parameter(name='admix.geno', value='admix.geno', display_type=display_file) | |
231 in_snp = gd_composite.Parameter(name='admix.snp', value='admix.snp', display_type=display_file) | |
232 in_ind = gd_composite.Parameter(name='admix.ind', value='admix.ind', display_type=display_file) | |
233 | |
234 info_page.add_input_parameter(in_admix) | |
235 info_page.add_input_parameter(in_geno) | |
236 info_page.add_input_parameter(in_snp) | |
237 info_page.add_input_parameter(in_ind) | |
238 | |
239 misc_stats = gd_composite.Parameter(description='Stats<p/><pre>\n{0}\n</pre>'.format(smartpca_stats), display_type=display_value) | |
240 | |
241 info_page.add_misc(misc_stats) | |
242 | |
243 with open (output, 'w') as ofh: | |
244 print >> ofh, info_page.render() | |
245 | |
246 sys.exit(0) | |
247 |