comparison gff3_to_json.py @ 0:be6cec883b02 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
author earlhaminst
date Wed, 21 Dec 2016 10:02:59 -0500
parents
children befe6021e476
comparison
equal deleted inserted replaced
-1:000000000000 0:be6cec883b02
1 from __future__ import print_function
2
3 import json
4 import optparse
5 import sys
6
7 cds_parent_dict = dict()
8 exon_parent_dict = dict()
9 five_prime_utr_parent_dict = dict()
10 gene_count = 0
11 gene_dict = dict()
12 transcript_dict = dict()
13 three_prime_utr_parent_dict = dict()
14
15
16 def feature_to_json(cols):
17 d = {
18 'end': int(cols[4]),
19 'start': int(cols[3]),
20 }
21 for attr in cols[8].split(';'):
22 if '=' in attr:
23 (tag, value) = attr.split('=')
24 if tag == 'ID':
25 d['id'] = value
26 else:
27 d[tag] = value
28 if cols[6] == '+':
29 d['strand'] = 1
30 elif cols[6] == '-':
31 d['strand'] = -1
32 else:
33 raise Exception("Unrecognized strand '%s'" % cols[6])
34 return d
35
36
37 def gene_to_json(cols, species):
38 global gene_count
39 gene = feature_to_json(cols)
40 gene.update({
41 'member_id': gene_count,
42 'object_type': 'Gene',
43 'seq_region_name': cols[0],
44 'species': species,
45 'Transcript': [],
46 })
47 gene_dict[gene['id']] = gene
48 gene_count = gene_count + 1
49
50
51 def transcript_to_json(cols, species):
52 transcript = feature_to_json(cols)
53 transcript.update({
54 'object_type': 'Transcript',
55 'seq_region_name': cols[0],
56 'species': species,
57 })
58 transcript_dict[transcript['id']] = transcript
59
60
61 def exon_to_json(cols, species):
62 exon = feature_to_json(cols)
63 exon.update({
64 'length': int(cols[4]) - int(cols[3]) + 1,
65 'object_type': 'Exon',
66 'seq_region_name': cols[0],
67 'species': species,
68 })
69 if 'id' not in exon and 'Name' in exon:
70 exon['id'] = exon['Name']
71
72 if 'Parent' in exon:
73 for parent in exon['Parent'].split(','):
74 if parent not in exon_parent_dict:
75 exon_parent_dict[parent] = [exon]
76 else:
77 exon_parent_dict[parent].append(exon)
78
79
80 def five_prime_utr_to_json(cols):
81 five_prime_utr = feature_to_json(cols)
82 if 'Parent' in five_prime_utr:
83 for parent in five_prime_utr['Parent'].split(','):
84 # the 5' UTR can be split among multiple exons
85 if parent not in five_prime_utr_parent_dict:
86 five_prime_utr_parent_dict[parent] = [five_prime_utr]
87 else:
88 five_prime_utr_parent_dict[parent].append(five_prime_utr)
89
90
91 def three_prime_utr_to_json(cols):
92 three_prime_utr = feature_to_json(cols)
93 if 'Parent' in three_prime_utr:
94 for parent in three_prime_utr['Parent'].split(','):
95 # the 3' UTR can be split among multiple exons
96 if parent not in three_prime_utr_parent_dict:
97 three_prime_utr_parent_dict[parent] = [three_prime_utr]
98 else:
99 three_prime_utr_parent_dict[parent].append(three_prime_utr)
100
101
102 def cds_to_json(cols):
103 cds = feature_to_json(cols)
104 if 'id' not in cds:
105 if 'Name' in cds:
106 cds['id'] = cds['Name']
107 elif 'Parent' in cds:
108 cds['id'] = cds['Parent']
109 if 'Parent' in cds:
110 # At this point we are sure than 'id' is in cds
111 for parent in cds['Parent'].split(','):
112 if parent not in cds_parent_dict:
113 cds_parent_dict[parent] = [cds]
114 else:
115 cds_parent_dict[parent].append(cds)
116
117
118 def join_dicts():
119 for parent, exon_list in exon_parent_dict.items():
120 exon_list.sort(key=lambda _: _['start'])
121 if parent in transcript_dict:
122 transcript_dict[parent]['Exon'] = exon_list
123
124 for transcript_id, transcript in transcript_dict.items():
125 translation = {
126 'CDS': [],
127 'id': None,
128 'end': transcript['end'],
129 'object_type': 'Translation',
130 'species': transcript['species'],
131 'start': transcript['start'],
132 }
133 found_cds = False
134 derived_translation_start = None
135 derived_translation_end = None
136 if transcript_id in cds_parent_dict:
137 cds_list = cds_parent_dict[transcript_id]
138 cds_ids = set(_['id'] for _ in cds_list)
139 if len(cds_ids) > 1:
140 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent)
141 translation['id'] = cds_ids.pop()
142 cds_list.sort(key=lambda _: _['start'])
143 translation['CDS'] = cds_list
144 translation['start'] = cds_list[0]['start']
145 translation['end'] = cds_list[-1]['end']
146 found_cds = True
147 if transcript_id in five_prime_utr_parent_dict:
148 five_prime_utr_list = five_prime_utr_parent_dict[transcript_id]
149 five_prime_utr_list.sort(key=lambda _: _['start'])
150 if transcript['strand'] == 1:
151 derived_translation_start = five_prime_utr_list[-1]['end'] + 1
152 else:
153 derived_translation_end = five_prime_utr_list[0]['start'] - 1
154 if transcript_id in three_prime_utr_parent_dict:
155 three_prime_utr_list = three_prime_utr_parent_dict[transcript_id]
156 three_prime_utr_list.sort(key=lambda _: _['start'])
157 if transcript['strand'] == 1:
158 derived_translation_end = three_prime_utr_list[0]['start'] - 1
159 else:
160 derived_translation_start = three_prime_utr_list[-1]['end'] + 1
161 if derived_translation_start is not None:
162 if found_cds:
163 if derived_translation_start > translation['start']:
164 raise Exception("UTR overlaps with CDS")
165 else:
166 translation['start'] = derived_translation_start
167 if derived_translation_end is not None:
168 if found_cds:
169 if derived_translation_end < translation['end']:
170 raise Exception("UTR overlaps with CDS")
171 else:
172 translation['end'] = derived_translation_end
173 if found_cds or derived_translation_start is not None or derived_translation_end is not None:
174 transcript['Translation'] = translation
175
176 for transcript in transcript_dict.values():
177 if 'Parent' in transcript:
178 # A polycistronic transcript can have multiple parents
179 for parent in transcript['Parent'].split(','):
180 if parent in gene_dict:
181 gene_dict[parent]['Transcript'].append(transcript)
182
183
184 def merge_dicts(json_arg):
185 with open(json_arg) as f:
186 dict_from_json = json.load(f)
187 gene_intersection = set(gene_dict.keys()) & set(dict_from_json.keys())
188 if gene_intersection:
189 raise Exception("JSON file '%s' contains information for genes '%s', which are also present in other files" % (json_arg, ', '.join(gene_intersection)))
190 gene_dict.update(dict_from_json)
191
192
193 def write_json(outfile=None, sort_keys=False):
194 if outfile:
195 with open(outfile, 'w') as f:
196 json.dump(gene_dict, f, sort_keys=sort_keys)
197 else:
198 print(json.dumps(gene_dict, indent=3, sort_keys=sort_keys))
199
200
201 def __main__():
202 parser = optparse.OptionParser()
203 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
204 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
205 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output')
206 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output')
207 options, args = parser.parse_args()
208
209 if args:
210 raise Exception('Use options to provide inputs')
211 for gff3_arg in options.gff3:
212 try:
213 (species, filename) = gff3_arg.split(':')
214 except ValueError:
215 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg)
216 with open(filename) as f:
217 for i, line in enumerate(f):
218 line = line.strip()
219 if not line:
220 # skip empty lines
221 continue
222 if line[0] == '#':
223 # skip comment lines
224 continue
225 cols = line.split('\t')
226 if len(cols) != 9:
227 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line))
228 feature_type = cols[2]
229 try:
230 if feature_type == 'gene':
231 gene_to_json(cols, species)
232 elif feature_type in ('mRNA', 'transcript'):
233 transcript_to_json(cols, species)
234 elif feature_type == 'exon':
235 exon_to_json(cols, species)
236 elif feature_type == 'five_prime_UTR':
237 five_prime_utr_to_json(cols)
238 elif feature_type == 'three_prime_UTR':
239 three_prime_utr_to_json(cols)
240 elif feature_type == 'CDS':
241 cds_to_json(cols)
242 else:
243 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
244 except Exception as e:
245 raise Exception("Line %i in file '%s': %s" % (i, filename, e))
246 join_dicts()
247
248 for json_arg in options.json:
249 merge_dicts(json_arg)
250
251 write_json(options.output, options.sort)
252
253
254 if __name__ == '__main__':
255 __main__()