Mercurial > repos > miller-lab > genome_diversity

comparison: make_gd_file.py @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b

author  Richard Burhans <burhans@bx.psu.edu>
date    Mon, 15 Jul 2013 10:47:35 -0400
comparing 26:91e835060ad2 with 27:8997f2ca8c7a
#!/usr/bin/env python

import base64
import json
import math
import re
import sys

identifier_regex = re.compile('[0-9A-Z_a-z]+$')

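# Decode a base64-wrapped block of text and return one column name per line;
# every name must be a plain identifier, otherwise the script exits.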
def unwrap_column_names(string):
    column_names = []
    string = unwrap_string(string)
    for line in string.split('\n'):
        line = line.strip()
        if is_identifier(line):
            column_names.append(line)
        else:
            die('invalid column name format: {}'.format(line))
    return column_names

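# Decode a base64-wrapped command-line argument (presumably wrapped so that
# free-form text survives the tool command line); exit on malformed input.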
def unwrap_string(string):
    try:
        decoded = base64.b64decode(string)
    except:
        die('invalid base64 string: {}'.format(string))
    return decoded

def is_identifier(string):
    match = identifier_regex.match(string)
    if match:
        return True
    else:
        return False

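# Read the Names dataset: the first whitespace-delimited token on each line is
# the individual's identifier and must be unique; any remaining tokens are kept
# as keywords appended to the name.  Returns the names in file order.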
def read_individual_names(filename):
    tokens = []
    names = []
    with open(filename) as fh:
        for line in fh:
            line = line.rstrip('\r\n')
            elems = line.split()

            columns = len(elems)
            if columns == 0:
                continue

            first_token = elems[0]

            if columns == 1:
                name = first_token
            else:
                keywords = ' '.join(elems[1:])
                name = ' '.join([first_token, keywords])

            if first_token not in tokens:
                tokens.append(first_token)
                names.append(name)
            else:
                die('duplicate first column entry in Names dataset: {}'.format(first_token))
    return names

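# Fold a long line into pieces of roughly maxlen characters, each prepended
# with prefix.  A small state machine tracks double-quoted strings and
# backslash escapes so that breaks happen only at JSON punctuation occurring
# outside quoted strings.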
def fold_line(line, maxlen, prefix):
    prefix_len = len(prefix)

    lines = []

    while len(line) > maxlen:
        split_points = []
        state = 0
        for i in range(maxlen - prefix_len):
            c = line[i]
            if state == 0:
                if c == '"':
                    state = 1
                elif c in [ '{', ':', ',', '}', '[', ']' ]:
                    split_points.append(i)
            elif state == 1:
                if c == '"':
                    state = 0
                elif c == '\\':
                    state = 2
            elif state == 2:
                state = 1
        idx = split_points[-1]
        lines.append('{0}{1}'.format(prefix, line[:idx+1]))
        line = line[idx+1:]

    lines.append('{0}{1}'.format(prefix, line))

    return lines

def die(message):
    print >> sys.stderr, message
    sys.exit(1)

################################################################################

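# Number of per-individual columns appended for each supported output datatype:
# gd_snp carries four columns per individual (suffixes A, B, G, Q), gd_genotype
# carries one (suffix G).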
type_to_columns = {
    'gd_snp':4,
    'gd_genotype':1
}

if len(sys.argv) != 12:
    print >> sys.stderr, 'Usage'
    sys.exit(1)

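# Eleven positional arguments: the input tabular dataset, the 1-based scaffold,
# pos, ref and rPos column numbers, a base64-wrapped list of preamble column
# names, the Names dataset, a base64-wrapped species identifier, the dbkey,
# the output datatype (gd_snp or gd_genotype), and the output path.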
input, scaffold_col, pos_col, ref_col, rPos_col, preamble_arg, names, species_arg, dbkey, output_type, output = sys.argv[1:12]

preamble_column_names = unwrap_column_names(preamble_arg)
first_individual_column = len(preamble_column_names) + 1

individual_names = read_individual_names(names)

species = unwrap_string(species_arg)
if not is_identifier(species):
    die('invalid species format: {}'.format(species))

if not output_type in type_to_columns:
    die('unknown output type: {}'.format(output_type))
columns_per_individual = type_to_columns[output_type]

jdict = {}

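# Build the full column-name list: the preamble columns first, then the
# per-individual columns numbered 1..N with the datatype-specific suffixes.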
column_names = preamble_column_names[:]
for i in range(1, len(individual_names) + 1):
    if output_type == 'gd_snp':
        column_names.append('{}A'.format(i))
        column_names.append('{}B'.format(i))
        column_names.append('{}G'.format(i))
        column_names.append('{}Q'.format(i))
    elif output_type == 'gd_genotype':
        column_names.append('{}G'.format(i))
    else:
        die('unknown output type: {}'.format(output_type))

jdict['column_names'] = column_names

individuals = []

for pos, individual in enumerate(individual_names):
    col = first_individual_column + pos * columns_per_individual
    individuals.append([individual, col])

jdict['individuals'] = individuals

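# Record the 1-based scaffold, pos, ref and rPos column numbers, plus the
# species identifier and dbkey, in the metadata.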
jdict['scaffold'] = int(scaffold_col)
jdict['pos'] = int(pos_col)
jdict['ref'] = int(ref_col)
jdict['rPos'] = int(rPos_col)

jdict['species'] = species
jdict['dbkey'] = dbkey

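# Serialize the metadata as compact JSON; it is written below as '#'-prefixed
# comment lines at the top of the output, folded to roughly 200 characters per
# line.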
json_string = json.dumps(jdict, separators=(',',':'), sort_keys=True)

min_cols = len(column_names)
pos_col = int(pos_col) - 1
rPos_col = int(rPos_col) - 1

def is_int(string):
    try:
        int(string)
        return True
    except ValueError:
        return False

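# Write the folded metadata header, then copy the input dataset line by line,
# skipping existing comment lines and checking that each data line has enough
# columns and integer pos / rPos values.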
with open(output, 'w') as ofh:
    lines = fold_line(json_string, 200, '#')
    for line in lines:
        print >> ofh, line

    with open(input) as fh:
        line_number = 0
        for line in fh:
            line_number += 1
            if line[0] == '#':
                continue
            line = line.rstrip('\r\n')
            elems = line.split('\t')
            if len(elems) < min_cols:
                die('Too few columns on line {0} of input file. Expecting {1}, saw {2}.'.format(line_number, min_cols, len(elems)))
            if not is_int(elems[pos_col]):
                die('bad pos on line {0} column {1} of input file: {2}'.format(line_number, pos_col+1, elems[pos_col]))
            if not is_int(elems[rPos_col]):
                die('bad rPos on line {0} column {1} of input file: {2}'.format(line_number, rPos_col+1, elems[rPos_col]))
            print >> ofh, line
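The script is presumably driven by a Galaxy tool wrapper rather than run by
hand. For reference, the Python 2 sketch below shows an equivalent direct
invocation; the file names, column numbers and values are invented for
illustration, and the base64 wrapping mirrors what unwrap_column_names and
unwrap_string expect.

    import base64
    import subprocess

    # Hypothetical inputs: four preamble columns followed by per-individual columns.
    preamble = base64.b64encode('\n'.join(['scaffold', 'pos', 'ref', 'rPos']))
    species = base64.b64encode('homo_sapiens')

    subprocess.check_call(['python', 'make_gd_file.py',
        'snps.tabular',      # input dataset (tab-separated)
        '1', '2', '3', '4',  # 1-based scaffold, pos, ref, rPos column numbers
        preamble,            # base64-wrapped preamble column names, one per line
        'names.txt',         # Names dataset: unique identifier plus optional keywords
        species,             # base64-wrapped species identifier
        'hg19',              # dbkey
        'gd_snp',            # output datatype (gd_snp or gd_genotype)
        'out.gd_snp'])       # output path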