comparison gff3_to_json.py @ 1:befe6021e476 draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
author earlhaminst
date Tue, 28 Feb 2017 12:06:04 -0500
parents be6cec883b02
children
comparison
equal deleted inserted replaced
0:be6cec883b02 1:befe6021e476
2 2
3 import json 3 import json
4 import optparse 4 import optparse
5 import sys 5 import sys
6 6
7 cds_parent_dict = dict()
8 exon_parent_dict = dict()
9 five_prime_utr_parent_dict = dict()
10 gene_count = 0 7 gene_count = 0
11 gene_dict = dict() 8
12 transcript_dict = dict() 9
13 three_prime_utr_parent_dict = dict() 10 def remove_type_from_list_of_ids(l):
14 11 return ','.join(remove_type_from_id(_) for _ in l.split(','))
15 12
16 def feature_to_json(cols): 13
14 def remove_type_from_id(id_):
15 colon_index = id_.find(':')
16 if colon_index >= 0:
17 return id_[colon_index + 1:]
18 else:
19 return id_
20
21
22 def feature_to_dict(cols, parent_dict=None):
17 d = { 23 d = {
18 'end': int(cols[4]), 24 'end': int(cols[4]),
19 'start': int(cols[3]), 25 'start': int(cols[3]),
20 } 26 }
21 for attr in cols[8].split(';'): 27 for attr in cols[8].split(';'):
22 if '=' in attr: 28 if '=' in attr:
23 (tag, value) = attr.split('=') 29 (tag, value) = attr.split('=')
24 if tag == 'ID': 30 if tag == 'ID':
25 d['id'] = value 31 tag = 'id'
26 else: 32 value = remove_type_from_id(value)
27 d[tag] = value 33 elif tag == 'Parent':
34 value = remove_type_from_list_of_ids(value)
35 d[tag] = value
28 if cols[6] == '+': 36 if cols[6] == '+':
29 d['strand'] = 1 37 d['strand'] = 1
30 elif cols[6] == '-': 38 elif cols[6] == '-':
31 d['strand'] = -1 39 d['strand'] = -1
32 else: 40 else:
33 raise Exception("Unrecognized strand '%s'" % cols[6]) 41 raise Exception("Unrecognized strand '%s'" % cols[6])
42 if parent_dict is not None and 'Parent' in d:
43 # a 3' UTR can be split among multiple exons
44 # a 5' UTR can be split among multiple exons
45 # a CDS can be part of multiple transcripts
46 for parent in d['Parent'].split(','):
47 if parent not in parent_dict:
48 parent_dict[parent] = [d]
49 else:
50 parent_dict[parent].append(d)
34 return d 51 return d
35 52
36 53
37 def gene_to_json(cols, species): 54 def add_gene_to_dict(cols, species, gene_dict):
38 global gene_count 55 global gene_count
39 gene = feature_to_json(cols) 56 gene = feature_to_dict(cols)
40 gene.update({ 57 gene.update({
41 'member_id': gene_count, 58 'member_id': gene_count,
42 'object_type': 'Gene', 59 'object_type': 'Gene',
43 'seq_region_name': cols[0], 60 'seq_region_name': cols[0],
44 'species': species, 61 'species': species,
46 }) 63 })
47 gene_dict[gene['id']] = gene 64 gene_dict[gene['id']] = gene
48 gene_count = gene_count + 1 65 gene_count = gene_count + 1
49 66
50 67
51 def transcript_to_json(cols, species): 68 def add_transcript_to_dict(cols, species, transcript_dict):
52 transcript = feature_to_json(cols) 69 transcript = feature_to_dict(cols)
53 transcript.update({ 70 transcript.update({
54 'object_type': 'Transcript', 71 'object_type': 'Transcript',
55 'seq_region_name': cols[0], 72 'seq_region_name': cols[0],
56 'species': species, 73 'species': species,
57 }) 74 })
58 transcript_dict[transcript['id']] = transcript 75 transcript_dict[transcript['id']] = transcript
59 76
60 77
61 def exon_to_json(cols, species): 78 def add_exon_to_dict(cols, species, exon_parent_dict):
62 exon = feature_to_json(cols) 79 exon = feature_to_dict(cols, exon_parent_dict)
63 exon.update({ 80 exon.update({
64 'length': int(cols[4]) - int(cols[3]) + 1, 81 'length': int(cols[4]) - int(cols[3]) + 1,
65 'object_type': 'Exon', 82 'object_type': 'Exon',
66 'seq_region_name': cols[0], 83 'seq_region_name': cols[0],
67 'species': species, 84 'species': species,
68 }) 85 })
69 if 'id' not in exon and 'Name' in exon: 86 if 'id' not in exon and 'Name' in exon:
70 exon['id'] = exon['Name'] 87 exon['id'] = exon['Name']
71 88
72 if 'Parent' in exon: 89
73 for parent in exon['Parent'].split(','): 90 def add_cds_to_dict(cols, cds_parent_dict):
74 if parent not in exon_parent_dict: 91 cds = feature_to_dict(cols, cds_parent_dict)
75 exon_parent_dict[parent] = [exon]
76 else:
77 exon_parent_dict[parent].append(exon)
78
79
80 def five_prime_utr_to_json(cols):
81 five_prime_utr = feature_to_json(cols)
82 if 'Parent' in five_prime_utr:
83 for parent in five_prime_utr['Parent'].split(','):
84 # the 5' UTR can be split among multiple exons
85 if parent not in five_prime_utr_parent_dict:
86 five_prime_utr_parent_dict[parent] = [five_prime_utr]
87 else:
88 five_prime_utr_parent_dict[parent].append(five_prime_utr)
89
90
91 def three_prime_utr_to_json(cols):
92 three_prime_utr = feature_to_json(cols)
93 if 'Parent' in three_prime_utr:
94 for parent in three_prime_utr['Parent'].split(','):
95 # the 3' UTR can be split among multiple exons
96 if parent not in three_prime_utr_parent_dict:
97 three_prime_utr_parent_dict[parent] = [three_prime_utr]
98 else:
99 three_prime_utr_parent_dict[parent].append(three_prime_utr)
100
101
102 def cds_to_json(cols):
103 cds = feature_to_json(cols)
104 if 'id' not in cds: 92 if 'id' not in cds:
105 if 'Name' in cds: 93 if 'Name' in cds:
106 cds['id'] = cds['Name'] 94 cds['id'] = cds['Name']
107 elif 'Parent' in cds: 95 elif 'Parent' in cds and ',' not in cds['Parent']:
108 cds['id'] = cds['Parent'] 96 cds['id'] = cds['Parent']
109 if 'Parent' in cds: 97
110 # At this point we are sure than 'id' is in cds 98
111 for parent in cds['Parent'].split(','): 99 def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
112 if parent not in cds_parent_dict:
113 cds_parent_dict[parent] = [cds]
114 else:
115 cds_parent_dict[parent].append(cds)
116
117
118 def join_dicts():
119 for parent, exon_list in exon_parent_dict.items(): 100 for parent, exon_list in exon_parent_dict.items():
120 exon_list.sort(key=lambda _: _['start'])
121 if parent in transcript_dict: 101 if parent in transcript_dict:
102 exon_list.sort(key=lambda _: _['start'])
122 transcript_dict[parent]['Exon'] = exon_list 103 transcript_dict[parent]['Exon'] = exon_list
123 104
124 for transcript_id, transcript in transcript_dict.items(): 105 for transcript_id, transcript in transcript_dict.items():
125 translation = { 106 translation = {
126 'CDS': [], 107 'CDS': [],
179 for parent in transcript['Parent'].split(','): 160 for parent in transcript['Parent'].split(','):
180 if parent in gene_dict: 161 if parent in gene_dict:
181 gene_dict[parent]['Transcript'].append(transcript) 162 gene_dict[parent]['Transcript'].append(transcript)
182 163
183 164
184 def merge_dicts(json_arg): 165 def update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict):
185 with open(json_arg) as f: 166 gene_intersection = set(full_gene_dict.keys()) & set(gene_dict.keys())
186 dict_from_json = json.load(f)
187 gene_intersection = set(gene_dict.keys()) & set(dict_from_json.keys())
188 if gene_intersection: 167 if gene_intersection:
189 raise Exception("JSON file '%s' contains information for genes '%s', which are also present in other files" % (json_arg, ', '.join(gene_intersection))) 168 raise Exception("Information for genes '%s' are present in multiple files" % ', '.join(gene_intersection))
190 gene_dict.update(dict_from_json) 169 full_gene_dict.update(gene_dict)
191 170
192 171
193 def write_json(outfile=None, sort_keys=False): 172 def write_json(full_gene_dict, outfile=None, sort_keys=False):
194 if outfile: 173 if outfile:
195 with open(outfile, 'w') as f: 174 with open(outfile, 'w') as f:
196 json.dump(gene_dict, f, sort_keys=sort_keys) 175 json.dump(full_gene_dict, f, sort_keys=sort_keys)
197 else: 176 else:
198 print(json.dumps(gene_dict, indent=3, sort_keys=sort_keys)) 177 json.dump(full_gene_dict, sys.stdout, sort_keys=sort_keys)
199 178
200 179
201 def __main__(): 180 def __main__():
202 parser = optparse.OptionParser() 181 parser = optparse.OptionParser()
203 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') 182 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
204 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') 183 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
205 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output') 184 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output')
206 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output') 185 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output')
207 options, args = parser.parse_args() 186 options, args = parser.parse_args()
208
209 if args: 187 if args:
210 raise Exception('Use options to provide inputs') 188 raise Exception('Use options to provide inputs')
189
190 full_gene_dict = dict()
211 for gff3_arg in options.gff3: 191 for gff3_arg in options.gff3:
212 try: 192 try:
213 (species, filename) = gff3_arg.split(':') 193 (species, filename) = gff3_arg.split(':')
214 except ValueError: 194 except ValueError:
215 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) 195 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg)
196 gene_dict = dict()
197 transcript_dict = dict()
198 exon_parent_dict = dict()
199 cds_parent_dict = dict()
200 five_prime_utr_parent_dict = dict()
201 three_prime_utr_parent_dict = dict()
216 with open(filename) as f: 202 with open(filename) as f:
217 for i, line in enumerate(f): 203 for i, line in enumerate(f, start=1):
218 line = line.strip() 204 line = line.strip()
219 if not line: 205 if not line:
220 # skip empty lines 206 # skip empty lines
221 continue 207 continue
222 if line[0] == '#': 208 if line[0] == '#':
226 if len(cols) != 9: 212 if len(cols) != 9:
227 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) 213 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line))
228 feature_type = cols[2] 214 feature_type = cols[2]
229 try: 215 try:
230 if feature_type == 'gene': 216 if feature_type == 'gene':
231 gene_to_json(cols, species) 217 add_gene_to_dict(cols, species, gene_dict)
232 elif feature_type in ('mRNA', 'transcript'): 218 elif feature_type in ('mRNA', 'transcript'):
233 transcript_to_json(cols, species) 219 add_transcript_to_dict(cols, species, transcript_dict)
234 elif feature_type == 'exon': 220 elif feature_type == 'exon':
235 exon_to_json(cols, species) 221 add_exon_to_dict(cols, species, exon_parent_dict)
236 elif feature_type == 'five_prime_UTR': 222 elif feature_type == 'five_prime_UTR':
237 five_prime_utr_to_json(cols) 223 feature_to_dict(cols, five_prime_utr_parent_dict)
238 elif feature_type == 'three_prime_UTR': 224 elif feature_type == 'three_prime_UTR':
239 three_prime_utr_to_json(cols) 225 feature_to_dict(cols, three_prime_utr_parent_dict)
240 elif feature_type == 'CDS': 226 elif feature_type == 'CDS':
241 cds_to_json(cols) 227 add_cds_to_dict(cols, cds_parent_dict)
242 else: 228 else:
243 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) 229 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
244 except Exception as e: 230 except Exception as e:
245 raise Exception("Line %i in file '%s': %s" % (i, filename, e)) 231 raise Exception("Line %i in file '%s': %s" % (i, filename, e))
246 join_dicts() 232 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
233 update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict)
247 234
248 for json_arg in options.json: 235 for json_arg in options.json:
249 merge_dicts(json_arg) 236 with open(json_arg) as f:
250 237 update_full_gene_dict_no_overwrite(full_gene_dict, json.load(f))
251 write_json(options.output, options.sort) 238
239 write_json(full_gene_dict, options.output, options.sort)
252 240
253 241
254 if __name__ == '__main__': 242 if __name__ == '__main__':
255 __main__() 243 __main__()