gff3_to_json.py @ 0:be6cec883b02 (draft)

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty

author:   earlhaminst
date:     Wed, 21 Dec 2016 10:02:59 -0500
parents:  (none)
children: befe6021e476
from __future__ import print_function

import json
import optparse
import sys

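# Example invocation (illustrative only; the species name and file names are
# placeholders):
#   python gff3_to_json.py --gff3 homo_sapiens:genes.gff3 --json more_genes.json -s -o genes.json

# Module-level accumulators: filled while the GFF3 files are parsed, then
# stitched together into a gene-centric structure by join_dicts()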
cds_parent_dict = dict()
exon_parent_dict = dict()
five_prime_utr_parent_dict = dict()
gene_count = 0
gene_dict = dict()
transcript_dict = dict()
three_prime_utr_parent_dict = dict()


def feature_to_json(cols):
    d = {
        'end': int(cols[4]),
        'start': int(cols[3]),
    }
    for attr in cols[8].split(';'):
        if '=' in attr:
            (tag, value) = attr.split('=', 1)
            if tag == 'ID':
                d['id'] = value
            else:
                d[tag] = value
    if cols[6] == '+':
        d['strand'] = 1
    elif cols[6] == '-':
        d['strand'] = -1
    else:
        raise Exception("Unrecognized strand '%s'" % cols[6])
    return d


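# One converter per supported GFF3 feature type: each builds on
# feature_to_json() and records the feature in the corresponding
# module-level dictionary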
def gene_to_json(cols, species):
    global gene_count
    gene = feature_to_json(cols)
    gene.update({
        'member_id': gene_count,
        'object_type': 'Gene',
        'seq_region_name': cols[0],
        'species': species,
        'Transcript': [],
    })
    gene_dict[gene['id']] = gene
    gene_count = gene_count + 1


def transcript_to_json(cols, species):
    transcript = feature_to_json(cols)
    transcript.update({
        'object_type': 'Transcript',
        'seq_region_name': cols[0],
        'species': species,
    })
    transcript_dict[transcript['id']] = transcript


def exon_to_json(cols, species):
    exon = feature_to_json(cols)
    exon.update({
        'length': int(cols[4]) - int(cols[3]) + 1,
        'object_type': 'Exon',
        'seq_region_name': cols[0],
        'species': species,
    })
    if 'id' not in exon and 'Name' in exon:
        exon['id'] = exon['Name']

    if 'Parent' in exon:
        for parent in exon['Parent'].split(','):
            if parent not in exon_parent_dict:
                exon_parent_dict[parent] = [exon]
            else:
                exon_parent_dict[parent].append(exon)


def five_prime_utr_to_json(cols):
    five_prime_utr = feature_to_json(cols)
    if 'Parent' in five_prime_utr:
        for parent in five_prime_utr['Parent'].split(','):
            # the 5' UTR can be split among multiple exons
            if parent not in five_prime_utr_parent_dict:
                five_prime_utr_parent_dict[parent] = [five_prime_utr]
            else:
                five_prime_utr_parent_dict[parent].append(five_prime_utr)


def three_prime_utr_to_json(cols):
    three_prime_utr = feature_to_json(cols)
    if 'Parent' in three_prime_utr:
        for parent in three_prime_utr['Parent'].split(','):
            # the 3' UTR can be split among multiple exons
            if parent not in three_prime_utr_parent_dict:
                three_prime_utr_parent_dict[parent] = [three_prime_utr]
            else:
                three_prime_utr_parent_dict[parent].append(three_prime_utr)


def cds_to_json(cols):
    cds = feature_to_json(cols)
    if 'id' not in cds:
        if 'Name' in cds:
            cds['id'] = cds['Name']
        elif 'Parent' in cds:
            cds['id'] = cds['Parent']
    if 'Parent' in cds:
        # At this point we are sure that 'id' is in cds
        for parent in cds['Parent'].split(','):
            if parent not in cds_parent_dict:
                cds_parent_dict[parent] = [cds]
            else:
                cds_parent_dict[parent].append(cds)


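# join_dicts() links the accumulated features together: exons, CDSs and UTRs
# are attached to their parent transcripts (building a Translation entry when
# coding information is available), and transcripts are attached to their
# parent genes via their 'Parent' attribute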
def join_dicts():
    for parent, exon_list in exon_parent_dict.items():
        exon_list.sort(key=lambda _: _['start'])
        if parent in transcript_dict:
            transcript_dict[parent]['Exon'] = exon_list

    for transcript_id, transcript in transcript_dict.items():
        translation = {
            'CDS': [],
            'id': None,
            'end': transcript['end'],
            'object_type': 'Translation',
            'species': transcript['species'],
            'start': transcript['start'],
        }
        found_cds = False
        derived_translation_start = None
        derived_translation_end = None
        if transcript_id in cds_parent_dict:
            cds_list = cds_parent_dict[transcript_id]
            cds_ids = set(_['id'] for _ in cds_list)
            if len(cds_ids) > 1:
                raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id)
            translation['id'] = cds_ids.pop()
            cds_list.sort(key=lambda _: _['start'])
            translation['CDS'] = cds_list
            translation['start'] = cds_list[0]['start']
            translation['end'] = cds_list[-1]['end']
            found_cds = True
        if transcript_id in five_prime_utr_parent_dict:
            five_prime_utr_list = five_prime_utr_parent_dict[transcript_id]
            five_prime_utr_list.sort(key=lambda _: _['start'])
            if transcript['strand'] == 1:
                derived_translation_start = five_prime_utr_list[-1]['end'] + 1
            else:
                derived_translation_end = five_prime_utr_list[0]['start'] - 1
        if transcript_id in three_prime_utr_parent_dict:
            three_prime_utr_list = three_prime_utr_parent_dict[transcript_id]
            three_prime_utr_list.sort(key=lambda _: _['start'])
            if transcript['strand'] == 1:
                derived_translation_end = three_prime_utr_list[0]['start'] - 1
            else:
                derived_translation_start = three_prime_utr_list[-1]['end'] + 1
        if derived_translation_start is not None:
            if found_cds:
                if derived_translation_start > translation['start']:
                    raise Exception("UTR overlaps with CDS")
            else:
                translation['start'] = derived_translation_start
        if derived_translation_end is not None:
            if found_cds:
                if derived_translation_end < translation['end']:
                    raise Exception("UTR overlaps with CDS")
            else:
                translation['end'] = derived_translation_end
        if found_cds or derived_translation_start is not None or derived_translation_end is not None:
            transcript['Translation'] = translation

    for transcript in transcript_dict.values():
        if 'Parent' in transcript:
            # A polycistronic transcript can have multiple parents
            for parent in transcript['Parent'].split(','):
                if parent in gene_dict:
                    gene_dict[parent]['Transcript'].append(transcript)


def merge_dicts(json_arg):
    with open(json_arg) as f:
        dict_from_json = json.load(f)
    gene_intersection = set(gene_dict.keys()) & set(dict_from_json.keys())
    if gene_intersection:
        raise Exception("JSON file '%s' contains information for genes '%s', which are also present in other files" % (json_arg, ', '.join(gene_intersection)))
    gene_dict.update(dict_from_json)


def write_json(outfile=None, sort_keys=False):
    if outfile:
        with open(outfile, 'w') as f:
            json.dump(gene_dict, f, sort_keys=sort_keys)
    else:
        print(json.dumps(gene_dict, indent=3, sort_keys=sort_keys))


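# Command-line entry point: parse the --gff3/--json/-s/-o options, convert each
# GFF3 file into the gene dictionary, merge any additional JSON files and write
# the combined result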
def __main__():
    parser = optparse.OptionParser()
    parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
    parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
    parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output')
    parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output')
    options, args = parser.parse_args()

    if args:
        raise Exception('Use options to provide inputs')
    for gff3_arg in options.gff3:
        try:
            (species, filename) = gff3_arg.split(':')
        except ValueError:
            raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg)
        with open(filename) as f:
            for i, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # skip empty lines
                    continue
                if line[0] == '#':
                    # skip comment lines
                    continue
                cols = line.split('\t')
                if len(cols) != 9:
                    raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line))
                feature_type = cols[2]
                try:
                    if feature_type == 'gene':
                        gene_to_json(cols, species)
                    elif feature_type in ('mRNA', 'transcript'):
                        transcript_to_json(cols, species)
                    elif feature_type == 'exon':
                        exon_to_json(cols, species)
                    elif feature_type == 'five_prime_UTR':
                        five_prime_utr_to_json(cols)
                    elif feature_type == 'three_prime_UTR':
                        three_prime_utr_to_json(cols)
                    elif feature_type == 'CDS':
                        cds_to_json(cols)
                    else:
                        print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
                except Exception as e:
                    raise Exception("Line %i in file '%s': %s" % (i, filename, e))
    join_dicts()

    for json_arg in options.json:
        merge_dicts(json_arg)

    write_json(options.output, options.sort)


if __name__ == '__main__':
    __main__()