annotate gff3_to_json.py @ 0:be6cec883b02 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
author earlhaminst
date Wed, 21 Dec 2016 10:02:59 -0500
parents
children befe6021e476
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
1 from __future__ import print_function
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
2
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
3 import json
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
4 import optparse
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
5 import sys
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
6
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
7 cds_parent_dict = dict()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
8 exon_parent_dict = dict()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
9 five_prime_utr_parent_dict = dict()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
10 gene_count = 0
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
11 gene_dict = dict()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
12 transcript_dict = dict()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
13 three_prime_utr_parent_dict = dict()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
14
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
15
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
16 def feature_to_json(cols):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
17 d = {
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
18 'end': int(cols[4]),
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
19 'start': int(cols[3]),
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
20 }
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
21 for attr in cols[8].split(';'):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
22 if '=' in attr:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
23 (tag, value) = attr.split('=')
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
24 if tag == 'ID':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
25 d['id'] = value
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
26 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
27 d[tag] = value
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
28 if cols[6] == '+':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
29 d['strand'] = 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
30 elif cols[6] == '-':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
31 d['strand'] = -1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
32 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
33 raise Exception("Unrecognized strand '%s'" % cols[6])
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
34 return d
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
35
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
36
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
37 def gene_to_json(cols, species):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
38 global gene_count
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
39 gene = feature_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
40 gene.update({
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
41 'member_id': gene_count,
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
42 'object_type': 'Gene',
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
43 'seq_region_name': cols[0],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
44 'species': species,
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
45 'Transcript': [],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
46 })
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
47 gene_dict[gene['id']] = gene
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
48 gene_count = gene_count + 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
49
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
50
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
51 def transcript_to_json(cols, species):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
52 transcript = feature_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
53 transcript.update({
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
54 'object_type': 'Transcript',
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
55 'seq_region_name': cols[0],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
56 'species': species,
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
57 })
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
58 transcript_dict[transcript['id']] = transcript
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
59
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
60
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
61 def exon_to_json(cols, species):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
62 exon = feature_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
63 exon.update({
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
64 'length': int(cols[4]) - int(cols[3]) + 1,
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
65 'object_type': 'Exon',
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
66 'seq_region_name': cols[0],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
67 'species': species,
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
68 })
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
69 if 'id' not in exon and 'Name' in exon:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
70 exon['id'] = exon['Name']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
71
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
72 if 'Parent' in exon:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
73 for parent in exon['Parent'].split(','):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
74 if parent not in exon_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
75 exon_parent_dict[parent] = [exon]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
76 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
77 exon_parent_dict[parent].append(exon)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
78
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
79
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
80 def five_prime_utr_to_json(cols):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
81 five_prime_utr = feature_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
82 if 'Parent' in five_prime_utr:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
83 for parent in five_prime_utr['Parent'].split(','):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
84 # the 5' UTR can be split among multiple exons
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
85 if parent not in five_prime_utr_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
86 five_prime_utr_parent_dict[parent] = [five_prime_utr]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
87 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
88 five_prime_utr_parent_dict[parent].append(five_prime_utr)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
89
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
90
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
91 def three_prime_utr_to_json(cols):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
92 three_prime_utr = feature_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
93 if 'Parent' in three_prime_utr:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
94 for parent in three_prime_utr['Parent'].split(','):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
95 # the 3' UTR can be split among multiple exons
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
96 if parent not in three_prime_utr_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
97 three_prime_utr_parent_dict[parent] = [three_prime_utr]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
98 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
99 three_prime_utr_parent_dict[parent].append(three_prime_utr)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
100
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
101
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
102 def cds_to_json(cols):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
103 cds = feature_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
104 if 'id' not in cds:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
105 if 'Name' in cds:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
106 cds['id'] = cds['Name']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
107 elif 'Parent' in cds:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
108 cds['id'] = cds['Parent']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
109 if 'Parent' in cds:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
110 # At this point we are sure than 'id' is in cds
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
111 for parent in cds['Parent'].split(','):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
112 if parent not in cds_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
113 cds_parent_dict[parent] = [cds]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
114 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
115 cds_parent_dict[parent].append(cds)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
116
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
117
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
118 def join_dicts():
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
119 for parent, exon_list in exon_parent_dict.items():
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
120 exon_list.sort(key=lambda _: _['start'])
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
121 if parent in transcript_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
122 transcript_dict[parent]['Exon'] = exon_list
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
123
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
124 for transcript_id, transcript in transcript_dict.items():
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
125 translation = {
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
126 'CDS': [],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
127 'id': None,
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
128 'end': transcript['end'],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
129 'object_type': 'Translation',
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
130 'species': transcript['species'],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
131 'start': transcript['start'],
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
132 }
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
133 found_cds = False
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
134 derived_translation_start = None
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
135 derived_translation_end = None
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
136 if transcript_id in cds_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
137 cds_list = cds_parent_dict[transcript_id]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
138 cds_ids = set(_['id'] for _ in cds_list)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
139 if len(cds_ids) > 1:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
140 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
141 translation['id'] = cds_ids.pop()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
142 cds_list.sort(key=lambda _: _['start'])
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
143 translation['CDS'] = cds_list
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
144 translation['start'] = cds_list[0]['start']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
145 translation['end'] = cds_list[-1]['end']
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
146 found_cds = True
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
147 if transcript_id in five_prime_utr_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
148 five_prime_utr_list = five_prime_utr_parent_dict[transcript_id]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
149 five_prime_utr_list.sort(key=lambda _: _['start'])
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
150 if transcript['strand'] == 1:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
151 derived_translation_start = five_prime_utr_list[-1]['end'] + 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
152 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
153 derived_translation_end = five_prime_utr_list[0]['start'] - 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
154 if transcript_id in three_prime_utr_parent_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
155 three_prime_utr_list = three_prime_utr_parent_dict[transcript_id]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
156 three_prime_utr_list.sort(key=lambda _: _['start'])
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
157 if transcript['strand'] == 1:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
158 derived_translation_end = three_prime_utr_list[0]['start'] - 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
159 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
160 derived_translation_start = three_prime_utr_list[-1]['end'] + 1
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
161 if derived_translation_start is not None:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
162 if found_cds:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
163 if derived_translation_start > translation['start']:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
164 raise Exception("UTR overlaps with CDS")
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
165 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
166 translation['start'] = derived_translation_start
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
167 if derived_translation_end is not None:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
168 if found_cds:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
169 if derived_translation_end < translation['end']:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
170 raise Exception("UTR overlaps with CDS")
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
171 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
172 translation['end'] = derived_translation_end
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
173 if found_cds or derived_translation_start is not None or derived_translation_end is not None:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
174 transcript['Translation'] = translation
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
175
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
176 for transcript in transcript_dict.values():
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
177 if 'Parent' in transcript:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
178 # A polycistronic transcript can have multiple parents
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
179 for parent in transcript['Parent'].split(','):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
180 if parent in gene_dict:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
181 gene_dict[parent]['Transcript'].append(transcript)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
182
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
183
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
184 def merge_dicts(json_arg):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
185 with open(json_arg) as f:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
186 dict_from_json = json.load(f)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
187 gene_intersection = set(gene_dict.keys()) & set(dict_from_json.keys())
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
188 if gene_intersection:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
189 raise Exception("JSON file '%s' contains information for genes '%s', which are also present in other files" % (json_arg, ', '.join(gene_intersection)))
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
190 gene_dict.update(dict_from_json)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
191
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
192
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
193 def write_json(outfile=None, sort_keys=False):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
194 if outfile:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
195 with open(outfile, 'w') as f:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
196 json.dump(gene_dict, f, sort_keys=sort_keys)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
197 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
198 print(json.dumps(gene_dict, indent=3, sort_keys=sort_keys))
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
199
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
200
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
201 def __main__():
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
202 parser = optparse.OptionParser()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
203 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
204 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
205 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output')
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
206 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output')
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
207 options, args = parser.parse_args()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
208
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
209 if args:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
210 raise Exception('Use options to provide inputs')
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
211 for gff3_arg in options.gff3:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
212 try:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
213 (species, filename) = gff3_arg.split(':')
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
214 except ValueError:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
215 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
216 with open(filename) as f:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
217 for i, line in enumerate(f):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
218 line = line.strip()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
219 if not line:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
220 # skip empty lines
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
221 continue
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
222 if line[0] == '#':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
223 # skip comment lines
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
224 continue
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
225 cols = line.split('\t')
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
226 if len(cols) != 9:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
227 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line))
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
228 feature_type = cols[2]
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
229 try:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
230 if feature_type == 'gene':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
231 gene_to_json(cols, species)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
232 elif feature_type in ('mRNA', 'transcript'):
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
233 transcript_to_json(cols, species)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
234 elif feature_type == 'exon':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
235 exon_to_json(cols, species)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
236 elif feature_type == 'five_prime_UTR':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
237 five_prime_utr_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
238 elif feature_type == 'three_prime_UTR':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
239 three_prime_utr_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
240 elif feature_type == 'CDS':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
241 cds_to_json(cols)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
242 else:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
243 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
244 except Exception as e:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
245 raise Exception("Line %i in file '%s': %s" % (i, filename, e))
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
246 join_dicts()
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
247
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
248 for json_arg in options.json:
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
249 merge_dicts(json_arg)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
250
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
251 write_json(options.output, options.sort)
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
252
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
253
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
254 if __name__ == '__main__':
be6cec883b02 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 822c798d43a72724eeab174043fdaafcfdac845f-dirty
earlhaminst
parents:
diff changeset
255 __main__()