Mercurial > repos > miller-lab > genome_diversity
comparison specify.py @ 27:8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Mon, 15 Jul 2013 10:47:35 -0400 |
parents | 248b06e86022 |
children |
comparison
equal
deleted
inserted
replaced
26:91e835060ad2 | 27:8997f2ca8c7a |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
| | 3 import gd_util |
3 import sys | 4 import sys |
4 import base64 | 5 from Population import Population |
5 | 6 |
6 def parse_args(args): | 7 ################################################################################ |
7 if len(args) < 3: | |
8 usage() | |
9 | 8 |
10 input_file, output_file = args[1:3] | 9 def parse_string(str_arg, ind_token2col): |
11 | |
12 individuals = [] | |
13 checkboxes = [] | |
14 strings = [] | |
15 | |
16 for arg in args[3:]: | |
17 if ':' in arg: | |
18 arg_type, arg = arg.split(':', 1) | |
19 else: | |
20 print >> sys.stderr, "unknown argument:", arg | |
21 usage() | |
22 | |
23 if arg_type == 'individual': | |
24 individuals.append(arg) | |
25 elif arg_type == 'checkbox': | |
26 checkboxes.append(arg) | |
27 elif arg_type == 'string': | |
28 strings.append(arg) | |
29 else: | |
30 print >> sys.stderr, "unknown argument:", arg | |
31 usage() | |
32 | |
33 return input_file, output_file, individuals, checkboxes, strings | |
34 | |
35 def usage(): | |
36 print >> sys.stderr, "Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0]) | |
37 sys.exit(1) | |
38 | |
39 def parse_individuals(individuals): | |
40 ind_col2name = {} | |
41 ind_name2col = {} | |
42 | |
43 for individual in individuals: | |
44 if ':' in individual: | |
45 column, name = individual.split(':', 1) | |
46 else: | |
47 print >> sys.stderr, "invalid individual specification:", individual | |
48 usage() | |
49 | |
50 try: | |
51 column = int(column) | |
52 except: | |
53 print "individual column is not an integer:", individual | |
54 usage() | |
55 | |
56 if column not in ind_col2name: | |
57 ind_col2name[column] = name | |
58 else: | |
59 if ind_col2name[column] != name: | |
60 print "duplicate individual column:", name, column, ind_col2name[column] | |
61 usage() | |
62 | |
63 if name not in ind_name2col: | |
64 ind_name2col[name] = [column] | |
65 elif column not in ind_name2col[name]: | |
66 ind_name2col[name].append(column) | |
67 | |
68 return ind_col2name, ind_name2col | |
69 | |
70 def parse_checkboxes(checkboxes, ind_col2name): | |
71 columns = [] | 10 columns = [] |
72 | 11 |
73 for checkbox in checkboxes: | 12 string = gd_util.unwrap_string(str_arg) |
74 if ':' in checkbox: | 13 tokens = find_tokens(string, ind_token2col) |
75 column, name = checkbox.split(':', 1) | |
76 else: | |
77 print >> sys.stderr, "invalid checkbox specification:", checkbox | |
78 usage() | |
79 | 14 |
80 try: | 15 for token in tokens: |
81 column = int(column) | 16 col = ind_token2col[token] |
82 except: | 17 if col not in columns: |
83 print "checkbox column is not an integer:", checkbox | 18 columns.append(col) |
84 usage() | |
85 | |
86 if column not in ind_col2name: | |
87 print "individual not in SNP table:", name | |
88 usage() | |
89 | |
90 if column not in columns: | |
91 columns.append(column) | |
92 | 19 |
93 return columns | 20 return columns |
94 | 21 |
95 def parse_strings(strings, ind_col2name, ind_name2col): | 22 def find_tokens(string, tokens): |
96 columns = [] | |
97 | |
98 for string in strings: | |
99 try: | |
100 decoded = base64.b64decode(string) | |
101 except: | |
102 print >> sys.stderr, "invalid base64 string:", string | |
103 usage() | |
104 | |
105 names = find_names(decoded, ind_name2col.keys()) | |
106 for name in names: | |
107 cols = ind_name2col[name] | |
108 if len(cols) == 1: | |
109 col = cols[0] | |
110 if col not in columns: | |
111 columns.append(col) | |
112 else: | |
113 print >> sys.stderr, "name with multiple columns:", name | |
114 usage() | |
115 | |
116 return columns | |
117 | |
118 def find_names(string, names): | |
119 rv = [] | 23 rv = [] |
120 for name in names: | 24 for token in tokens: |
121 if name in string: | 25 if token in string: |
122 if name not in rv: | 26 if token not in rv: |
123 rv.append(name) | 27 rv.append(token) |
124 return rv | 28 return rv |
125 | 29 |
| | 30 ################################################################################ |
126 | 31 |
| | 32 if len(sys.argv) != 6: |
| | 33 gd_util.die('Usage') |
127 | 34 |
| | 35 input, output, ind_arg, cb_arg, str_arg = sys.argv[1:] |
128 | 36 |
129 input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv) | 37 p_total = Population() |
130 ind_col2name, ind_name2col = parse_individuals(individuals) | 38 p_total.from_wrapped_dict(ind_arg) |
131 cb_cols = parse_checkboxes(checkboxes, ind_col2name) | |
132 str_cols = parse_strings(strings, ind_col2name, ind_name2col) | |
133 | 39 |
134 out_cols = cb_cols | 40 p_cb = Population() |
135 for col in str_cols: | 41 p_cb.from_wrapped_dict(cb_arg) |
136 if col not in out_cols: | |
137 out_cols.append(col) | |
138 | 42 |
139 with open(output_file, 'w') as fh: | 43 if not p_total.is_superset(p_cb): |
140 for col in sorted(out_cols): | 44 gd_util.die('There is a checked individual that does not appear in the SNP table') |
141 print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']]) | 45 |
| | 46 ################################################################################ |
| | 47 |
| | 48 ind_col2name = {} |
| | 49 ind_token2col = {} |
| | 50 for col in p_total.column_list(): |
| | 51 individual = p_total.individual_with_column(col) |
| | 52 name = individual.name |
| | 53 ind_col2name[col] = name |
| | 54 first_token = name.split()[0] |
| | 55 if first_token not in ind_token2col: |
| | 56 ind_token2col[first_token] = col |
| | 57 else: |
| | 58 gd_util.die('duplicate first token: {0}'.format(first_token)) |
| | 59 |
| | 60 out_cols = p_cb.column_list() |
| | 61 str_cols = parse_string(str_arg, ind_token2col) |
| | 62 |
| | 63 with open(output, 'w') as fh: |
| | 64 for col in sorted(ind_col2name.keys()): |
| | 65 if col in out_cols or col in str_cols: |
| | 66 print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']]) |
142 | 67 |
143 sys.exit(0) | 68 sys.exit(0) |
144 | 69 |
145 | |
146 | |
147 |