comparison specify.py @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents
children 8997f2ca8c7a
comparison
equal deleted inserted replaced
23:66a183c44dd5 24:248b06e86022
1 #!/usr/bin/env python
2
3 import sys
4 import base64
5
def parse_args(args):
    """Split the command line into file names and typed specifications.

    ``args`` is the full argument vector (``args[0]`` is the program name).
    The first two positional arguments are the input and output file names.
    Every remaining argument must look like ``<type>:<value>`` where
    ``<type>`` is ``individual``, ``checkbox`` or ``string``; only the first
    colon separates the type from the value, so the value may itself
    contain colons.

    Returns the tuple
    ``(input_file, output_file, individuals, checkboxes, strings)`` where the
    last three items are lists of the values (type prefix removed), in the
    order they appeared on the command line.

    Calls ``usage()`` when fewer than two file names are given, when an
    argument has no colon, or when its type prefix is not recognised.
    """
    if len(args) < 3:
        usage()

    input_file, output_file = args[1:3]

    # One list per recognised type prefix.
    collected = {'individual': [], 'checkbox': [], 'string': []}

    for arg in args[3:]:
        arg_type, sep, value = arg.partition(':')
        if not sep or arg_type not in collected:
            # Report the argument exactly as the user typed it, including
            # the unrecognised type prefix, so the message is actionable.
            sys.stderr.write("unknown argument: %s\n" % arg)
            usage()
        collected[arg_type].append(value)

    return (input_file, output_file,
            collected['individual'], collected['checkbox'],
            collected['string'])
34
def usage():
    """Write the command-line synopsis to stderr and exit with status 1."""
    synopsis = "Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0])
    sys.stderr.write(synopsis + "\n")
    sys.exit(1)
38
def parse_individuals(individuals):
    """Build column/name lookup tables from ``col:name`` specifications.

    ``individuals`` is a list of strings of the form ``<column>:<name>``;
    only the first colon is significant, so names may contain colons.

    Returns ``(ind_col2name, ind_name2col)``:

    * ``ind_col2name`` maps each integer column to its name.
    * ``ind_name2col`` maps each name to the list of distinct columns given
      for it, in first-seen order.

    Calls ``usage()`` when a specification has no colon, when the column is
    not an integer, or when one column is given two different names.
    Repeating an identical specification is accepted silently.
    """
    ind_col2name = {}
    ind_name2col = {}

    for individual in individuals:
        column, sep, name = individual.partition(':')
        if not sep:
            sys.stderr.write("invalid individual specification: %s\n" % individual)
            usage()

        try:
            column = int(column)
        except ValueError:
            # Diagnostics go to stderr so they never mix with tool output.
            sys.stderr.write("individual column is not an integer: %s\n" % individual)
            usage()

        # setdefault returns the previously registered name, if any.
        registered = ind_col2name.setdefault(column, name)
        if registered != name:
            sys.stderr.write("duplicate individual column: %s %s %s\n"
                             % (name, column, registered))
            usage()

        name_columns = ind_name2col.setdefault(name, [])
        if column not in name_columns:
            name_columns.append(column)

    return ind_col2name, ind_name2col
69
def parse_checkboxes(checkboxes, ind_col2name):
    """Return the distinct columns selected through checkbox arguments.

    ``checkboxes`` is a list of ``<column>:<name>`` strings (split on the
    first colon). ``ind_col2name`` is the column-to-name mapping of known
    individuals; every checkbox column must be a key of it.

    Returns the list of integer columns in first-seen order, without
    duplicates.

    Calls ``usage()`` when a specification has no colon, when the column is
    not an integer, or when the column is not a known individual.
    """
    columns = []

    for checkbox in checkboxes:
        column, sep, name = checkbox.partition(':')
        if not sep:
            sys.stderr.write("invalid checkbox specification: %s\n" % checkbox)
            usage()

        try:
            column = int(column)
        except ValueError:
            # Diagnostics go to stderr so they never mix with tool output.
            sys.stderr.write("checkbox column is not an integer: %s\n" % checkbox)
            usage()

        if column not in ind_col2name:
            sys.stderr.write("individual not in SNP table: %s\n" % name)
            usage()

        if column not in columns:
            columns.append(column)

    return columns
94
def parse_strings(strings, ind_col2name, ind_name2col):
    """Return the columns of individuals named inside base64-encoded text.

    Each item of ``strings`` is base64-decoded and searched (via
    ``find_names``) for every key of ``ind_name2col``. A name that is found
    contributes its single column to the result.

    ``ind_col2name`` is accepted for signature compatibility and is not
    used here.

    Returns the list of columns in first-seen order, without duplicates.

    Calls ``usage()`` when an item cannot be base64-decoded or when a
    matched name maps to more than one column.
    """
    columns = []

    for string in strings:
        try:
            decoded = base64.b64decode(string)
        except (TypeError, ValueError):
            # Python 2 raises TypeError for bad padding; Python 3 raises
            # binascii.Error, which is a ValueError subclass.
            sys.stderr.write("invalid base64 string: %s\n" % string)
            usage()

        # On Python 3 b64decode returns bytes, but the names are text;
        # normalise so the substring test compares like with like.
        # On Python 2 the result is already str and is left untouched.
        if not isinstance(decoded, str):
            decoded = decoded.decode('utf-8', 'replace')

        for name in find_names(decoded, ind_name2col.keys()):
            cols = ind_name2col[name]
            if len(cols) != 1:
                # A name shared by several columns cannot be resolved.
                sys.stderr.write("name with multiple columns: %s\n" % name)
                usage()

            col = cols[0]
            if col not in columns:
                columns.append(col)

    return columns
117
def find_names(string, names):
    """Return the entries of ``names`` that occur as substrings of ``string``.

    The result keeps the iteration order of ``names`` and lists each
    matching name once, even if ``names`` repeats it.
    """
    matches = []
    for candidate in names:
        if candidate not in string:
            continue
        if candidate in matches:
            continue
        matches.append(candidate)
    return matches
125
126
127
128
# Script body: parse the command line, gather the selected columns and
# write one "<column>\t<name>\t" line per column, sorted by column.
input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv)
ind_col2name, ind_name2col = parse_individuals(individuals)
cb_cols = parse_checkboxes(checkboxes, ind_col2name)
str_cols = parse_strings(strings, ind_col2name, ind_name2col)

# Checkbox columns come first; columns found in strings are added only
# when they are not already selected.
out_cols = cb_cols
for col in str_cols:
    if col in out_cols:
        continue
    out_cols.append(col)

with open(output_file, 'w') as fh:
    for col in sorted(out_cols):
        # The empty third field leaves a trailing tab on every line.
        fields = [str(col), str(ind_col2name[col]), '']
        fh.write('\t'.join(fields) + '\n')

sys.exit(0)
144
145
146
147