annotate gff3_to_json.py @ 1:befe6021e476 draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
author earlhaminst
date Tue, 28 Feb 2017 12:06:04 -0500
parents be6cec883b02
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
1 from __future__ import print_function
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
2
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
3 import json
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
4 import optparse
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
5 import sys
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
6
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
7 gene_count = 0
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
8
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
9
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def remove_type_from_list_of_ids(l):
    """
    Strip the type prefix from every ID in a comma-separated list.

    ``l`` is split on ',', each element is passed through
    remove_type_from_id(), and the results are joined back with ','.
    """
    stripped_ids = [remove_type_from_id(item) for item in l.split(',')]
    return ','.join(stripped_ids)
0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
12
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
13
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def remove_type_from_id(id_):
    """
    Return ``id_`` without its leading "<type>:" prefix.

    Everything up to and including the first ':' is dropped. When ``id_``
    contains no ':' it is returned unchanged.
    """
    _prefix, separator, remainder = id_.partition(':')
    if not separator:
        # No colon at all: there is no prefix to strip
        return id_
    return remainder
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
20
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
21
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def feature_to_dict(cols, parent_dict=None):
    """
    Convert the columns of one GFF3 feature line into a dictionary.

    cols: sequence of the feature's column values; this function reads
        cols[3] (start), cols[4] (end), cols[6] (strand) and cols[8]
        (';'-separated ``tag=value`` attributes).
    parent_dict: optional dictionary mapping a parent ID to the list of
        features having that parent. When given and the feature has a
        'Parent' attribute, the returned dictionary is appended to the list
        of each of its parents (the same object, not a copy).

    Returns a dictionary with integer 'start' and 'end', 'strand' (1 or -1)
    and one entry per attribute. The 'ID' attribute is stored under 'id' with
    its type prefix removed; the IDs in 'Parent' also have their type prefix
    removed.

    Raises Exception if the strand column is neither '+' nor '-'.
    """
    d = {
        'end': int(cols[4]),
        'start': int(cols[3]),
    }
    for attr in cols[8].split(';'):
        if '=' not in attr:
            # Skip empty chunks (e.g. after a trailing ';') and bare words
            continue
        # Split on the first '=' only, so that a value which itself contains
        # '=' does not break the 2-element unpacking
        tag, value = attr.split('=', 1)
        if tag == 'ID':
            tag = 'id'
            value = remove_type_from_id(value)
        elif tag == 'Parent':
            value = remove_type_from_list_of_ids(value)
        d[tag] = value
    if cols[6] == '+':
        d['strand'] = 1
    elif cols[6] == '-':
        d['strand'] = -1
    else:
        raise Exception("Unrecognized strand '%s'" % cols[6])
    if parent_dict is not None and 'Parent' in d:
        # a 3' UTR can be split among multiple exons
        # a 5' UTR can be split among multiple exons
        # a CDS can be part of multiple transcripts
        for parent in d['Parent'].split(','):
            parent_dict.setdefault(parent, []).append(d)
    return d
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
52
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
53
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def add_gene_to_dict(cols, species, gene_dict):
    """
    Build a gene dictionary from a GFF3 feature line and store it in
    ``gene_dict`` under the gene's 'id'.

    The gene gets the current value of the module-level ``gene_count`` as its
    'member_id', after which the counter is incremented. ``cols[0]`` is
    stored as 'seq_region_name' and an empty 'Transcript' list is attached.
    """
    global gene_count
    gene = feature_to_dict(cols)
    gene['member_id'] = gene_count
    gene['object_type'] = 'Gene'
    gene['seq_region_name'] = cols[0]
    gene['species'] = species
    gene['Transcript'] = []
    gene_dict[gene['id']] = gene
    gene_count += 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
66
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
67
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def add_transcript_to_dict(cols, species, transcript_dict):
    """
    Build a transcript dictionary from a GFF3 feature line and store it in
    ``transcript_dict`` under the transcript's 'id'.

    ``cols[0]`` is stored as 'seq_region_name' and ``species`` as 'species'.
    """
    transcript = feature_to_dict(cols)
    transcript['object_type'] = 'Transcript'
    transcript['seq_region_name'] = cols[0]
    transcript['species'] = species
    transcript_dict[transcript['id']] = transcript
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
76
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
77
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def add_exon_to_dict(cols, species, exon_parent_dict):
    """
    Build an exon dictionary from a GFF3 feature line.

    The exon is passed to feature_to_dict() together with
    ``exon_parent_dict`` and is then completed in place with its 'length'
    (end - start + 1, computed from cols[4] and cols[3]), 'object_type',
    'seq_region_name' and 'species'. If it has no 'id' but has a 'Name', the
    name is used as its id.
    """
    exon = feature_to_dict(cols, exon_parent_dict)
    # Item assignment (rather than building a new dict) keeps `exon` the same
    # object that feature_to_dict() returned
    exon['length'] = int(cols[4]) - int(cols[3]) + 1
    exon['object_type'] = 'Exon'
    exon['seq_region_name'] = cols[0]
    exon['species'] = species
    has_id = 'id' in exon
    if not has_id and 'Name' in exon:
        exon['id'] = exon['Name']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
88
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
89
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def add_cds_to_dict(cols, cds_parent_dict):
    """
    Build a CDS dictionary from a GFF3 feature line, passing
    ``cds_parent_dict`` on to feature_to_dict().

    If the CDS has no 'id', one is derived in place: its 'Name' when present,
    otherwise its 'Parent' provided that it names a single parent (i.e.
    contains no ','). If neither applies, the CDS is left without an 'id'.
    """
    cds = feature_to_dict(cols, cds_parent_dict)
    if 'id' in cds:
        return
    if 'Name' in cds:
        cds['id'] = cds['Name']
        return
    if 'Parent' in cds and ',' not in cds['Parent']:
        cds['id'] = cds['Parent']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
97
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
98
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
99 def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
100 for parent, exon_list in exon_parent_dict.items():
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
101 if parent in transcript_dict:
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
102 exon_list.sort(key=lambda _: _['start'])
0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
103 transcript_dict[parent]['Exon'] = exon_list
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
104
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
105 for transcript_id, transcript in transcript_dict.items():
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
106 translation = {
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
107 'CDS': [],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
108 'id': None,
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
109 'end': transcript['end'],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
110 'object_type': 'Translation',
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
111 'species': transcript['species'],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
112 'start': transcript['start'],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
113 }
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
114 found_cds = False
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
115 derived_translation_start = None
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
116 derived_translation_end = None
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
117 if transcript_id in cds_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
118 cds_list = cds_parent_dict[transcript_id]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
119 cds_ids = set(_['id'] for _ in cds_list)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
120 if len(cds_ids) > 1:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
121 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
122 translation['id'] = cds_ids.pop()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
123 cds_list.sort(key=lambda _: _['start'])
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
124 translation['CDS'] = cds_list
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
125 translation['start'] = cds_list[0]['start']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
126 translation['end'] = cds_list[-1]['end']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
127 found_cds = True
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
128 if transcript_id in five_prime_utr_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
129 five_prime_utr_list = five_prime_utr_parent_dict[transcript_id]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
130 five_prime_utr_list.sort(key=lambda _: _['start'])
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
131 if transcript['strand'] == 1:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
132 derived_translation_start = five_prime_utr_list[-1]['end'] + 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
133 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
134 derived_translation_end = five_prime_utr_list[0]['start'] - 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
135 if transcript_id in three_prime_utr_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
136 three_prime_utr_list = three_prime_utr_parent_dict[transcript_id]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
137 three_prime_utr_list.sort(key=lambda _: _['start'])
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
138 if transcript['strand'] == 1:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
139 derived_translation_end = three_prime_utr_list[0]['start'] - 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
140 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
141 derived_translation_start = three_prime_utr_list[-1]['end'] + 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
142 if derived_translation_start is not None:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
143 if found_cds:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
144 if derived_translation_start > translation['start']:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
145 raise Exception("UTR overlaps with CDS")
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
146 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
147 translation['start'] = derived_translation_start
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
148 if derived_translation_end is not None:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
149 if found_cds:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
150 if derived_translation_end < translation['end']:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
151 raise Exception("UTR overlaps with CDS")
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
152 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
153 translation['end'] = derived_translation_end
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
154 if found_cds or derived_translation_start is not None or derived_translation_end is not None:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
155 transcript['Translation'] = translation
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
156
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
157 for transcript in transcript_dict.values():
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
158 if 'Parent' in transcript:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
159 # A polycistronic transcript can have multiple parents
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
160 for parent in transcript['Parent'].split(','):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
161 if parent in gene_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
162 gene_dict[parent]['Transcript'].append(transcript)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
163
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
164
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict):
    """
    Merge ``gene_dict`` into ``full_gene_dict`` in place, refusing to overwrite.

    If any key of ``gene_dict`` is already present in ``full_gene_dict``, an
    Exception naming the shared keys is raised and ``full_gene_dict`` is left
    unmodified. Otherwise every entry of ``gene_dict`` is added to
    ``full_gene_dict``. Nothing is returned.
    """
    # Set iteration order is arbitrary: sort the duplicated ids so that the
    # error message is identical from one run to the next.
    gene_intersection = sorted(set(full_gene_dict) & set(gene_dict))
    if gene_intersection:
        raise Exception("Information for genes '%s' are present in multiple files" % ', '.join(gene_intersection))
    full_gene_dict.update(gene_dict)
0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
170
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
171
1
befe6021e476 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
earlhaminst
parents: 0
diff changeset
def write_json(full_gene_dict, outfile=None, sort_keys=False):
    """
    Serialise ``full_gene_dict`` as JSON.

    The JSON is written to the file at path ``outfile`` (opened in 'w' mode)
    when ``outfile`` is truthy, and to the standard output otherwise.
    ``sort_keys`` is passed through to ``json.dump``.
    """
    if not outfile:
        # No destination path given: emit on the standard output
        json.dump(full_gene_dict, sys.stdout, sort_keys=sort_keys)
        return
    with open(outfile, 'w') as handle:
        json.dump(full_gene_dict, handle, sort_keys=sort_keys)
0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
178
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
179
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
def __main__():
    """
    Command-line entry point.

    Parses the --gff3, --json, --sort and --output options, converts every
    GFF3 file given as SPECIES:FILENAME into a gene dictionary, merges those
    dictionaries together with the content of every --json file, and writes
    the merged dictionary as JSON to --output (or to the standard output).

    Raises an Exception when positional arguments are given, when a --gff3
    argument is not in the SPECIES:FILENAME format, when a GFF3 data line does
    not have 9 tab-separated columns, when processing a feature line fails
    (the message is prefixed with the line number and filename), or when the
    same gene id is found in more than one input.
    """
    parser = optparse.OptionParser()
    parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
    parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
    parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output')
    parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output')
    options, args = parser.parse_args()
    if args:
        raise Exception('Use options to provide inputs')

    full_gene_dict = dict()
    for gff3_arg in options.gff3:
        try:
            # Split on the first ':' only, so that a filename which itself
            # contains a colon is still accepted
            (species, filename) = gff3_arg.split(':', 1)
        except ValueError:
            raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg)
        # Fresh per-file dictionaries: features are only joined with features
        # coming from the same GFF3 file
        gene_dict = dict()
        transcript_dict = dict()
        exon_parent_dict = dict()
        cds_parent_dict = dict()
        five_prime_utr_parent_dict = dict()
        three_prime_utr_parent_dict = dict()
        with open(filename) as f:
            # Line numbers are 1-based so that they match what an editor shows
            for i, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # skip empty lines
                    continue
                if line[0] == '#':
                    # skip comment lines
                    continue
                cols = line.split('\t')
                if len(cols) != 9:
                    raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line))
                # The third column holds the feature type
                feature_type = cols[2]
                try:
                    if feature_type == 'gene':
                        add_gene_to_dict(cols, species, gene_dict)
                    elif feature_type in ('mRNA', 'transcript'):
                        add_transcript_to_dict(cols, species, transcript_dict)
                    elif feature_type == 'exon':
                        add_exon_to_dict(cols, species, exon_parent_dict)
                    elif feature_type == 'five_prime_UTR':
                        feature_to_dict(cols, five_prime_utr_parent_dict)
                    elif feature_type == 'three_prime_UTR':
                        feature_to_dict(cols, three_prime_utr_parent_dict)
                    elif feature_type == 'CDS':
                        add_cds_to_dict(cols, cds_parent_dict)
                    else:
                        # Unknown feature types are reported but do not abort the conversion
                        print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
                except Exception as e:
                    # Re-raise with the position in the input file to ease debugging
                    raise Exception("Line %i in file '%s': %s" % (i, filename, e))
        join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
        update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict)

    for json_arg in options.json:
        with open(json_arg) as f:
            update_full_gene_dict_no_overwrite(full_gene_dict, json.load(f))

    write_json(full_gene_dict, options.output, options.sort)
0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
240
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
241
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
# Run the conversion only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == '__main__':
    __main__()