annotate make_gd_file.py @ 32:03c22b722882

remove BeautifulSoup dependency
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:54:23 -0400
parents 8997f2ca8c7a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
27
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
1 #!/usr/bin/env python
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
2
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
3 import base64
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
4 import json
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
5 import math
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
6 import re
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
7 import sys
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
8
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
9 identifier_regex = re.compile('[0-9A-Z_a-z]+$')
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
10
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
11 def unwrap_column_names(string):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
12 column_names = []
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
13 string = unwrap_string(string)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
14 for line in string.split('\n'):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
15 line = line.strip()
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
16 if is_identifier(line):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
17 column_names.append(line)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
18 else:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
19 die('invalid column name format: {}'.format(line))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
20 return column_names
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
21
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
22 def unwrap_string(string):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
23 try:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
24 decoded = base64.b64decode(string)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
25 except:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
26 die('invalid base64 string: {}'.format(string))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
27 return decoded
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
28
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
29 def is_identifier(string):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
30 match = identifier_regex.match(string)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
31 if match:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
32 return True
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
33 else:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
34 return False
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
35
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
36 def read_individual_names(filename):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
37 tokens = []
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
38 names = []
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
39 with open(filename) as fh:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
40 for line in fh:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
41 line = line.rstrip('\r\n')
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
42 elems = line.split()
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
43
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
44 columns = len(elems)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
45 if columns == 0:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
46 continue
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
47
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
48 first_token = elems[0]
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
49
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
50 if columns == 1:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
51 name = first_token
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
52 else:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
53 keywords = ' '.join(elems[1:])
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
54 name = ' '.join([first_token, keywords])
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
55
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
56 if first_token not in tokens:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
57 tokens.append(first_token)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
58 names.append(name)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
59 else:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
60 die('duplicate first column entry in Names dataset: {}'.format(first_token))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
61 return names
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
62
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
63 def fold_line(line, maxlen, prefix):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
64 prefix_len = len(prefix)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
65
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
66 lines = []
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
67
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
68 while len(line) > maxlen:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
69 split_points = []
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
70 state = 0
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
71 for i in range(maxlen - prefix_len):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
72 c = line[i]
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
73 if state == 0:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
74 if c == '"':
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
75 state = 1
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
76 elif c in [ '{', ':', ',', '}', '[', ']' ]:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
77 split_points.append(i)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
78 elif state == 1:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
79 if c == '"':
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
80 state = 0
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
81 elif c == '\\':
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
82 state = 2
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
83 elif state == 2:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
84 state = 1
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
85 idx = split_points[-1]
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
86 lines.append('{0}{1}'.format(prefix, line[:idx+1]))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
87 line = line[idx+1:]
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
88
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
89 lines.append('{0}{1}'.format(prefix, line))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
90
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
91 return lines
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
92
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
93 def die(message):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
94 print >> sys.stderr, message
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
95 sys.exit(1)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
96
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
97 ################################################################################
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
98
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
99 type_to_columns = {
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
100 'gd_snp':4,
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
101 'gd_genotype':1
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
102 }
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
103
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
104 if len(sys.argv) != 12:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
105 print >> sys.stderr, 'Usage'
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
106 sys.exit(1)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
107
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
108 input, scaffold_col, pos_col, ref_col, rPos_col, preamble_arg, names, species_arg, dbkey, output_type, output = sys.argv[1:12]
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
109
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
110 preamble_column_names = unwrap_column_names(preamble_arg)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
111 first_individual_column = len(preamble_column_names) + 1
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
112
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
113 individual_names = read_individual_names(names)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
114
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
115 species = unwrap_string(species_arg)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
116 if not is_identifier(species):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
117 die('invalid species format: {}'.format(species))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
118
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
119 if not output_type in type_to_columns:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
120 die('unknown output type: {}'.format(output_type))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
121 columns_per_individual = type_to_columns[output_type]
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
122
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
123 jdict = {}
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
124
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
125 column_names = preamble_column_names[:]
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
126 for i in range(1, len(individual_names) + 1):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
127 if output_type == 'gd_snp':
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
128 column_names.append('{}A'.format(i))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
129 column_names.append('{}B'.format(i))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
130 column_names.append('{}G'.format(i))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
131 column_names.append('{}Q'.format(i))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
132 elif output_type == 'gd_genotype':
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
133 column_names.append('{}G'.format(i))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
134 else:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
135 die('unknown output type: {}'.format(output_type))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
136
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
137 jdict['column_names'] = column_names
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
138
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
139 individuals = []
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
140
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
141 for pos, individual in enumerate(individual_names):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
142 col = first_individual_column + pos * columns_per_individual
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
143 individuals.append([individual, col])
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
144
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
145 jdict['individuals'] = individuals
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
146
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
147 jdict['scaffold'] = int(scaffold_col)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
148 jdict['pos'] = int(pos_col)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
149 jdict['ref'] = int(ref_col)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
150 jdict['rPos'] = int(rPos_col)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
151
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
152 jdict['species'] = species
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
153 jdict['dbkey'] = dbkey
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
154
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
155 json_string = json.dumps(jdict, separators=(',',':'), sort_keys=True)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
156
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
157 min_cols = len(column_names)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
158 pos_col = int(pos_col) - 1
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
159 rPos_col = int(rPos_col) - 1
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
160
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
161 def is_int(string):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
162 try:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
163 int(string)
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
164 return True
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
165 except ValueError:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
166 return False
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
167
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
168 with open(output, 'w') as ofh:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
169 lines = fold_line(json_string, 200, '#')
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
170 for line in lines:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
171 print >> ofh, line
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
172
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
173 with open(input) as fh:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
174 line_number = 0
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
175 for line in fh:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
176 line_number += 1
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
177 if line[0] == '#':
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
178 continue
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
179 line = line.rstrip('\r\n')
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
180 elems = line.split('\t')
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
181 if len(elems) < min_cols:
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
182 die('Too few columns on line {0} of input file. Expecting {1}, saw {2}.'.format(line_number, min_cols, len(elems)))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
183 if not is_int(elems[pos_col]):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
184 die('bad pos on line {0} column {1} of input file: {2}'.format(line_number, pos_col+1, elems[pos_col]))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
185 if not is_int(elems[rPos_col]):
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
186 die('bad rPos on line {0} column {1} of input file: {2}'.format(line_number, rPos_col+1, elems[rPos_col]))
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
diff changeset
187 print >> ofh, line