annotate prepare_population_structure.py @ 30:4188853b940b

Update to Miller Lab devshed revision eb4e61d024db
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 26 Jul 2013 12:51:13 -0400
parents 8997f2ca8c7a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
1 #!/usr/bin/env python
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
2
27
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 24
diff changeset
3 import gd_util
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
4 import os
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
5 import shutil
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
6 import sys
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
7 from Population import Population
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
8 import gd_composite
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
9
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
10 ################################################################################
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
11
27
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 24
diff changeset
def do_import(filename, files_path, min_reads, min_qual, min_spacing, using_info, population_list):
    """Render the composite-dataset info page for this run and write it to *filename*.

    The page is built with gd_composite.InfoPage and records the two output
    files ('admix.ped' and 'admix.map'), the *using_info* text, the three
    filtering thresholds and the populations that were used.

    filename        -- path of the file that receives the rendered page
    files_path      -- kept for interface compatibility; not read by this function
    min_reads       -- value shown for 'Minimum reads covering a SNP, per individual'
    min_qual        -- value shown for 'Minimum quality value, per individual'
    min_spacing     -- value shown for 'Minimum spacing between SNPs on the same scaffold'
    using_info      -- text shown as an output value (passed as the parameter description)
    population_list -- value handed to the 'Populations' parameter
    """
    info_page = gd_composite.InfoPage()
    info_page.set_title('Prepare to look for population structure Galaxy Composite Dataset')

    display_file = gd_composite.DisplayFile()
    display_value = gd_composite.DisplayValue()

    # Outputs: the two generated files plus the summary text.
    out_ped = gd_composite.Parameter(name='admix.ped', value='admix.ped', display_type=display_file)
    out_map = gd_composite.Parameter(name='admix.map', value='admix.map', display_type=display_file)
    out_use = gd_composite.Parameter(description=using_info, display_type=display_value)

    info_page.add_output_parameter(out_ped)
    info_page.add_output_parameter(out_map)
    info_page.add_output_parameter(out_use)

    # Inputs: the thresholds the run was invoked with.
    in_min_reads = gd_composite.Parameter(description='Minimum reads covering a SNP, per individual', value=min_reads, display_type=display_value)
    in_min_qual = gd_composite.Parameter(description='Minimum quality value, per individual', value=min_qual, display_type=display_value)
    in_min_spacing = gd_composite.Parameter(description='Minimum spacing between SNPs on the same scaffold', value=min_spacing, display_type=display_value)

    info_page.add_input_parameter(in_min_reads)
    info_page.add_input_parameter(in_min_qual)
    info_page.add_input_parameter(in_min_spacing)

    misc_populations = gd_composite.Parameter(name='Populations', value=population_list, display_type=gd_composite.DisplayPopulationList())
    info_page.add_misc(misc_populations)

    with open(filename, 'w') as ofh:
        # 'print >> ofh' only works on Python 2; an explicit write emits the
        # same text followed by a newline on both Python 2 and Python 3.
        ofh.write('{0}\n'.format(info_page.render()))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
40
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
41 ################################################################################
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
42
24
248b06e86022 Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
# Eight fixed positional arguments plus at least one selector are required.
if len(sys.argv) < 10:
    gd_util.die('Usage')

# parse command line
(input_snp_filename, input_type, min_reads, min_qual, min_spacing,
 output_filename, output_files_path, ind_arg) = sys.argv[1:9]
args = sys.argv[9:]

population_files = []
all_individuals = False

# Each selector is either the literal 'all_individuals' or
# 'population:<file>:<name>'; anything else is ignored.
for arg in args:
    if arg == 'all_individuals':
        all_individuals = True
    elif len(arg) > 11 and arg.startswith('population:'):
        pieces = arg[11:].split(':', 1)
        if len(pieces) == 2:
            population_files.append((pieces[0], pieces[1]))
        else:
            # Without the ':<name>' part the original tuple unpack raised a
            # bare ValueError; report it through the script's error path.
            gd_util.die('Malformed population argument: {0}'.format(arg))

# Population built from the individuals argument; used below as the
# reference set for the superset check.
p_total = Population()
p_total.from_wrapped_dict(ind_arg)

# Maps each individual tag to the first population name it was seen in.
# NOTE(review): nothing in the visible script reads this mapping afterwards.
individual_population = {}
population_list = []

if all_individuals:
    p1 = p_total
    p1.name = 'All Individuals'
    population_list.append(p1)
else:
    p1 = Population()
    for pop_file, pop_name in population_files:
        this_pop = Population(pop_name)
        this_pop.from_population_file(pop_file)
        population_list.append(this_pop)

        for tag in this_pop.tag_list():
            if tag not in individual_population:
                individual_population[tag] = pop_name

        # add individuals from this file to p1
        p1.from_population_file(pop_file)


if not p_total.is_superset(p1):
    gd_util.die('There is an individual in the population that is not in the SNP table')
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
87
27
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 24
diff changeset
88 ################################################################################
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 24
diff changeset
89
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
prog = 'admix_prep'

# Command line: program name, SNP table, the three thresholds, then one
# argument per selected individual.
args = [prog, input_snp_filename, min_reads, min_qual, min_spacing]

is_genotype_input = input_type == 'gd_genotype'
for individual_tag in p1.tag_list():
    if is_genotype_input:
        # Tags look like '<column>:<name>'; for gd_genotype input the column
        # number is lowered by 2 before being passed on.
        # NOTE(review): the reason for the offset of 2 is not visible here -- confirm.
        column, label = individual_tag.split(':', 1)
        individual_tag = '{0}:{1}'.format(int(column) - 2, label)
    args.append(individual_tag)

stdoutdata, stderrdata = gd_util.run_program(prog, args)
# The program's standard output, minus trailing line breaks, becomes the
# summary text shown on the info page.
using_info = stdoutdata.rstrip('\r\n')
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
106
27
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 24
diff changeset
107 ################################################################################
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
108
27
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 24
diff changeset
gd_util.mkdir_p(output_files_path)

# Copy the two result files from the working directory into the output
# directory, keeping their names.
for produced_name in ('admix.ped', 'admix.map'):
    shutil.copy2(produced_name, os.path.join(output_files_path, produced_name))

# Write the info page describing this run, then finish with exit status 0.
do_import(output_filename, output_files_path, min_reads, min_qual, min_spacing, using_info, population_list)
sys.exit(0)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
118