Mercurial > repos > earlhaminst > gff3_to_json
comparison gff3_to_json.py @ 1:befe6021e476 draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
author | earlhaminst |
---|---|
date | Tue, 28 Feb 2017 12:06:04 -0500 |
parents | be6cec883b02 |
children |
comparison
equal
deleted
inserted
replaced
0:be6cec883b02 | 1:befe6021e476 |
---|---|
2 | 2 |
3 import json | 3 import json |
4 import optparse | 4 import optparse |
5 import sys | 5 import sys |
6 | 6 |
7 cds_parent_dict = dict() | |
8 exon_parent_dict = dict() | |
9 five_prime_utr_parent_dict = dict() | |
10 gene_count = 0 | 7 gene_count = 0 |
11 gene_dict = dict() | 8 |
12 transcript_dict = dict() | 9 |
13 three_prime_utr_parent_dict = dict() | 10 def remove_type_from_list_of_ids(l): |
14 | 11 return ','.join(remove_type_from_id(_) for _ in l.split(',')) |
15 | 12 |
16 def feature_to_json(cols): | 13 |
14 def remove_type_from_id(id_): | |
15 colon_index = id_.find(':') | |
16 if colon_index >= 0: | |
17 return id_[colon_index + 1:] | |
18 else: | |
19 return id_ | |
20 | |
21 | |
22 def feature_to_dict(cols, parent_dict=None): | |
17 d = { | 23 d = { |
18 'end': int(cols[4]), | 24 'end': int(cols[4]), |
19 'start': int(cols[3]), | 25 'start': int(cols[3]), |
20 } | 26 } |
21 for attr in cols[8].split(';'): | 27 for attr in cols[8].split(';'): |
22 if '=' in attr: | 28 if '=' in attr: |
23 (tag, value) = attr.split('=') | 29 (tag, value) = attr.split('=') |
24 if tag == 'ID': | 30 if tag == 'ID': |
25 d['id'] = value | 31 tag = 'id' |
26 else: | 32 value = remove_type_from_id(value) |
27 d[tag] = value | 33 elif tag == 'Parent': |
34 value = remove_type_from_list_of_ids(value) | |
35 d[tag] = value | |
28 if cols[6] == '+': | 36 if cols[6] == '+': |
29 d['strand'] = 1 | 37 d['strand'] = 1 |
30 elif cols[6] == '-': | 38 elif cols[6] == '-': |
31 d['strand'] = -1 | 39 d['strand'] = -1 |
32 else: | 40 else: |
33 raise Exception("Unrecognized strand '%s'" % cols[6]) | 41 raise Exception("Unrecognized strand '%s'" % cols[6]) |
42 if parent_dict is not None and 'Parent' in d: | |
43 # a 3' UTR can be split among multiple exons | |
44 # a 5' UTR can be split among multiple exons | |
45 # a CDS can be part of multiple transcripts | |
46 for parent in d['Parent'].split(','): | |
47 if parent not in parent_dict: | |
48 parent_dict[parent] = [d] | |
49 else: | |
50 parent_dict[parent].append(d) | |
34 return d | 51 return d |
35 | 52 |
36 | 53 |
37 def gene_to_json(cols, species): | 54 def add_gene_to_dict(cols, species, gene_dict): |
38 global gene_count | 55 global gene_count |
39 gene = feature_to_json(cols) | 56 gene = feature_to_dict(cols) |
40 gene.update({ | 57 gene.update({ |
41 'member_id': gene_count, | 58 'member_id': gene_count, |
42 'object_type': 'Gene', | 59 'object_type': 'Gene', |
43 'seq_region_name': cols[0], | 60 'seq_region_name': cols[0], |
44 'species': species, | 61 'species': species, |
46 }) | 63 }) |
47 gene_dict[gene['id']] = gene | 64 gene_dict[gene['id']] = gene |
48 gene_count = gene_count + 1 | 65 gene_count = gene_count + 1 |
49 | 66 |
50 | 67 |
51 def transcript_to_json(cols, species): | 68 def add_transcript_to_dict(cols, species, transcript_dict): |
52 transcript = feature_to_json(cols) | 69 transcript = feature_to_dict(cols) |
53 transcript.update({ | 70 transcript.update({ |
54 'object_type': 'Transcript', | 71 'object_type': 'Transcript', |
55 'seq_region_name': cols[0], | 72 'seq_region_name': cols[0], |
56 'species': species, | 73 'species': species, |
57 }) | 74 }) |
58 transcript_dict[transcript['id']] = transcript | 75 transcript_dict[transcript['id']] = transcript |
59 | 76 |
60 | 77 |
61 def exon_to_json(cols, species): | 78 def add_exon_to_dict(cols, species, exon_parent_dict): |
62 exon = feature_to_json(cols) | 79 exon = feature_to_dict(cols, exon_parent_dict) |
63 exon.update({ | 80 exon.update({ |
64 'length': int(cols[4]) - int(cols[3]) + 1, | 81 'length': int(cols[4]) - int(cols[3]) + 1, |
65 'object_type': 'Exon', | 82 'object_type': 'Exon', |
66 'seq_region_name': cols[0], | 83 'seq_region_name': cols[0], |
67 'species': species, | 84 'species': species, |
68 }) | 85 }) |
69 if 'id' not in exon and 'Name' in exon: | 86 if 'id' not in exon and 'Name' in exon: |
70 exon['id'] = exon['Name'] | 87 exon['id'] = exon['Name'] |
71 | 88 |
72 if 'Parent' in exon: | 89 |
73 for parent in exon['Parent'].split(','): | 90 def add_cds_to_dict(cols, cds_parent_dict): |
74 if parent not in exon_parent_dict: | 91 cds = feature_to_dict(cols, cds_parent_dict) |
75 exon_parent_dict[parent] = [exon] | |
76 else: | |
77 exon_parent_dict[parent].append(exon) | |
78 | |
79 | |
80 def five_prime_utr_to_json(cols): | |
81 five_prime_utr = feature_to_json(cols) | |
82 if 'Parent' in five_prime_utr: | |
83 for parent in five_prime_utr['Parent'].split(','): | |
84 # the 5' UTR can be split among multiple exons | |
85 if parent not in five_prime_utr_parent_dict: | |
86 five_prime_utr_parent_dict[parent] = [five_prime_utr] | |
87 else: | |
88 five_prime_utr_parent_dict[parent].append(five_prime_utr) | |
89 | |
90 | |
91 def three_prime_utr_to_json(cols): | |
92 three_prime_utr = feature_to_json(cols) | |
93 if 'Parent' in three_prime_utr: | |
94 for parent in three_prime_utr['Parent'].split(','): | |
95 # the 3' UTR can be split among multiple exons | |
96 if parent not in three_prime_utr_parent_dict: | |
97 three_prime_utr_parent_dict[parent] = [three_prime_utr] | |
98 else: | |
99 three_prime_utr_parent_dict[parent].append(three_prime_utr) | |
100 | |
101 | |
102 def cds_to_json(cols): | |
103 cds = feature_to_json(cols) | |
104 if 'id' not in cds: | 92 if 'id' not in cds: |
105 if 'Name' in cds: | 93 if 'Name' in cds: |
106 cds['id'] = cds['Name'] | 94 cds['id'] = cds['Name'] |
107 elif 'Parent' in cds: | 95 elif 'Parent' in cds and ',' not in cds['Parent']: |
108 cds['id'] = cds['Parent'] | 96 cds['id'] = cds['Parent'] |
109 if 'Parent' in cds: | 97 |
110 # At this point we are sure than 'id' is in cds | 98 |
111 for parent in cds['Parent'].split(','): | 99 def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict): |
112 if parent not in cds_parent_dict: | |
113 cds_parent_dict[parent] = [cds] | |
114 else: | |
115 cds_parent_dict[parent].append(cds) | |
116 | |
117 | |
118 def join_dicts(): | |
119 for parent, exon_list in exon_parent_dict.items(): | 100 for parent, exon_list in exon_parent_dict.items(): |
120 exon_list.sort(key=lambda _: _['start']) | |
121 if parent in transcript_dict: | 101 if parent in transcript_dict: |
102 exon_list.sort(key=lambda _: _['start']) | |
122 transcript_dict[parent]['Exon'] = exon_list | 103 transcript_dict[parent]['Exon'] = exon_list |
123 | 104 |
124 for transcript_id, transcript in transcript_dict.items(): | 105 for transcript_id, transcript in transcript_dict.items(): |
125 translation = { | 106 translation = { |
126 'CDS': [], | 107 'CDS': [], |
179 for parent in transcript['Parent'].split(','): | 160 for parent in transcript['Parent'].split(','): |
180 if parent in gene_dict: | 161 if parent in gene_dict: |
181 gene_dict[parent]['Transcript'].append(transcript) | 162 gene_dict[parent]['Transcript'].append(transcript) |
182 | 163 |
183 | 164 |
184 def merge_dicts(json_arg): | 165 def update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict): |
185 with open(json_arg) as f: | 166 gene_intersection = set(full_gene_dict.keys()) & set(gene_dict.keys()) |
186 dict_from_json = json.load(f) | |
187 gene_intersection = set(gene_dict.keys()) & set(dict_from_json.keys()) | |
188 if gene_intersection: | 167 if gene_intersection: |
189 raise Exception("JSON file '%s' contains information for genes '%s', which are also present in other files" % (json_arg, ', '.join(gene_intersection))) | 168 raise Exception("Information for genes '%s' are present in multiple files" % ', '.join(gene_intersection)) |
190 gene_dict.update(dict_from_json) | 169 full_gene_dict.update(gene_dict) |
191 | 170 |
192 | 171 |
193 def write_json(outfile=None, sort_keys=False): | 172 def write_json(full_gene_dict, outfile=None, sort_keys=False): |
194 if outfile: | 173 if outfile: |
195 with open(outfile, 'w') as f: | 174 with open(outfile, 'w') as f: |
196 json.dump(gene_dict, f, sort_keys=sort_keys) | 175 json.dump(full_gene_dict, f, sort_keys=sort_keys) |
197 else: | 176 else: |
198 print(json.dumps(gene_dict, indent=3, sort_keys=sort_keys)) | 177 json.dump(full_gene_dict, sys.stdout, sort_keys=sort_keys) |
199 | 178 |
200 | 179 |
201 def __main__(): | 180 def __main__(): |
202 parser = optparse.OptionParser() | 181 parser = optparse.OptionParser() |
203 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') | 182 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') |
204 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') | 183 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') |
205 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output') | 184 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output') |
206 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output') | 185 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output') |
207 options, args = parser.parse_args() | 186 options, args = parser.parse_args() |
208 | |
209 if args: | 187 if args: |
210 raise Exception('Use options to provide inputs') | 188 raise Exception('Use options to provide inputs') |
189 | |
190 full_gene_dict = dict() | |
211 for gff3_arg in options.gff3: | 191 for gff3_arg in options.gff3: |
212 try: | 192 try: |
213 (species, filename) = gff3_arg.split(':') | 193 (species, filename) = gff3_arg.split(':') |
214 except ValueError: | 194 except ValueError: |
215 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) | 195 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) |
196 gene_dict = dict() | |
197 transcript_dict = dict() | |
198 exon_parent_dict = dict() | |
199 cds_parent_dict = dict() | |
200 five_prime_utr_parent_dict = dict() | |
201 three_prime_utr_parent_dict = dict() | |
216 with open(filename) as f: | 202 with open(filename) as f: |
217 for i, line in enumerate(f): | 203 for i, line in enumerate(f, start=1): |
218 line = line.strip() | 204 line = line.strip() |
219 if not line: | 205 if not line: |
220 # skip empty lines | 206 # skip empty lines |
221 continue | 207 continue |
222 if line[0] == '#': | 208 if line[0] == '#': |
226 if len(cols) != 9: | 212 if len(cols) != 9: |
227 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) | 213 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) |
228 feature_type = cols[2] | 214 feature_type = cols[2] |
229 try: | 215 try: |
230 if feature_type == 'gene': | 216 if feature_type == 'gene': |
231 gene_to_json(cols, species) | 217 add_gene_to_dict(cols, species, gene_dict) |
232 elif feature_type in ('mRNA', 'transcript'): | 218 elif feature_type in ('mRNA', 'transcript'): |
233 transcript_to_json(cols, species) | 219 add_transcript_to_dict(cols, species, transcript_dict) |
234 elif feature_type == 'exon': | 220 elif feature_type == 'exon': |
235 exon_to_json(cols, species) | 221 add_exon_to_dict(cols, species, exon_parent_dict) |
236 elif feature_type == 'five_prime_UTR': | 222 elif feature_type == 'five_prime_UTR': |
237 five_prime_utr_to_json(cols) | 223 feature_to_dict(cols, five_prime_utr_parent_dict) |
238 elif feature_type == 'three_prime_UTR': | 224 elif feature_type == 'three_prime_UTR': |
239 three_prime_utr_to_json(cols) | 225 feature_to_dict(cols, three_prime_utr_parent_dict) |
240 elif feature_type == 'CDS': | 226 elif feature_type == 'CDS': |
241 cds_to_json(cols) | 227 add_cds_to_dict(cols, cds_parent_dict) |
242 else: | 228 else: |
243 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) | 229 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) |
244 except Exception as e: | 230 except Exception as e: |
245 raise Exception("Line %i in file '%s': %s" % (i, filename, e)) | 231 raise Exception("Line %i in file '%s': %s" % (i, filename, e)) |
246 join_dicts() | 232 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) |
233 update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict) | |
247 | 234 |
248 for json_arg in options.json: | 235 for json_arg in options.json: |
249 merge_dicts(json_arg) | 236 with open(json_arg) as f: |
250 | 237 update_full_gene_dict_no_overwrite(full_gene_dict, json.load(f)) |
251 write_json(options.output, options.sort) | 238 |
239 write_json(full_gene_dict, options.output, options.sort) | |
252 | 240 |
253 | 241 |
254 if __name__ == '__main__': | 242 if __name__ == '__main__': |
255 __main__() | 243 __main__() |