comparison specify.py @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents 248b06e86022
children
comparison
equal deleted inserted replaced
26:91e835060ad2 27:8997f2ca8c7a
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import gd_util
3 import sys 4 import sys
4 import base64 5 from Population import Population
5 6
6 def parse_args(args): 7 ################################################################################
7 if len(args) < 3:
8 usage()
9 8
10 input_file, output_file = args[1:3] 9 def parse_string(str_arg, ind_token2col):
11
12 individuals = []
13 checkboxes = []
14 strings = []
15
16 for arg in args[3:]:
17 if ':' in arg:
18 arg_type, arg = arg.split(':', 1)
19 else:
20 print >> sys.stderr, "unknown argument:", arg
21 usage()
22
23 if arg_type == 'individual':
24 individuals.append(arg)
25 elif arg_type == 'checkbox':
26 checkboxes.append(arg)
27 elif arg_type == 'string':
28 strings.append(arg)
29 else:
30 print >> sys.stderr, "unknown argument:", arg
31 usage()
32
33 return input_file, output_file, individuals, checkboxes, strings
34
35 def usage():
36 print >> sys.stderr, "Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0])
37 sys.exit(1)
38
39 def parse_individuals(individuals):
40 ind_col2name = {}
41 ind_name2col = {}
42
43 for individual in individuals:
44 if ':' in individual:
45 column, name = individual.split(':', 1)
46 else:
47 print >> sys.stderr, "invalid individual specification:", individual
48 usage()
49
50 try:
51 column = int(column)
52 except:
53 print "individual column is not an integer:", individual
54 usage()
55
56 if column not in ind_col2name:
57 ind_col2name[column] = name
58 else:
59 if ind_col2name[column] != name:
60 print "duplicate individual column:", name, column, ind_col2name[column]
61 usage()
62
63 if name not in ind_name2col:
64 ind_name2col[name] = [column]
65 elif column not in ind_name2col[name]:
66 ind_name2col[name].append(column)
67
68 return ind_col2name, ind_name2col
69
70 def parse_checkboxes(checkboxes, ind_col2name):
71 columns = [] 10 columns = []
72 11
73 for checkbox in checkboxes: 12 string = gd_util.unwrap_string(str_arg)
74 if ':' in checkbox: 13 tokens = find_tokens(string, ind_token2col)
75 column, name = checkbox.split(':', 1)
76 else:
77 print >> sys.stderr, "invalid checkbox specification:", checkbox
78 usage()
79 14
80 try: 15 for token in tokens:
81 column = int(column) 16 col = ind_token2col[token]
82 except: 17 if col not in columns:
83 print "checkbox column is not an integer:", checkbox 18 columns.append(col)
84 usage()
85
86 if column not in ind_col2name:
87 print "individual not in SNP table:", name
88 usage()
89
90 if column not in columns:
91 columns.append(column)
92 19
93 return columns 20 return columns
94 21
95 def parse_strings(strings, ind_col2name, ind_name2col): 22 def find_tokens(string, tokens):
96 columns = []
97
98 for string in strings:
99 try:
100 decoded = base64.b64decode(string)
101 except:
102 print >> sys.stderr, "invalid base64 string:", string
103 usage()
104
105 names = find_names(decoded, ind_name2col.keys())
106 for name in names:
107 cols = ind_name2col[name]
108 if len(cols) == 1:
109 col = cols[0]
110 if col not in columns:
111 columns.append(col)
112 else:
113 print >> sys.stderr, "name with multiple columns:", name
114 usage()
115
116 return columns
117
118 def find_names(string, names):
119 rv = [] 23 rv = []
120 for name in names: 24 for token in tokens:
121 if name in string: 25 if token in string:
122 if name not in rv: 26 if token not in rv:
123 rv.append(name) 27 rv.append(token)
124 return rv 28 return rv
125 29
30 ################################################################################
126 31
32 if len(sys.argv) != 6:
33 gd_util.die('Usage')
127 34
35 input, output, ind_arg, cb_arg, str_arg = sys.argv[1:]
128 36
129 input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv) 37 p_total = Population()
130 ind_col2name, ind_name2col = parse_individuals(individuals) 38 p_total.from_wrapped_dict(ind_arg)
131 cb_cols = parse_checkboxes(checkboxes, ind_col2name)
132 str_cols = parse_strings(strings, ind_col2name, ind_name2col)
133 39
134 out_cols = cb_cols 40 p_cb = Population()
135 for col in str_cols: 41 p_cb.from_wrapped_dict(cb_arg)
136 if col not in out_cols:
137 out_cols.append(col)
138 42
139 with open(output_file, 'w') as fh: 43 if not p_total.is_superset(p_cb):
140 for col in sorted(out_cols): 44 gd_util.die('There is a checked individual that does not appear in the SNP table')
141 print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']]) 45
46 ################################################################################
47
48 ind_col2name = {}
49 ind_token2col = {}
50 for col in p_total.column_list():
51 individual = p_total.individual_with_column(col)
52 name = individual.name
53 ind_col2name[col] = name
54 first_token = name.split()[0]
55 if first_token not in ind_token2col:
56 ind_token2col[first_token] = col
57 else:
58 gd_util.die('duplicate first token: {0}'.format(first_token))
59
60 out_cols = p_cb.column_list()
61 str_cols = parse_string(str_arg, ind_token2col)
62
63 with open(output, 'w') as fh:
64 for col in sorted(ind_col2name.keys()):
65 if col in out_cols or col in str_cols:
66 print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']])
142 67
143 sys.exit(0) 68 sys.exit(0)
144 69
145
146
147