comparison aragorn_out_to_gff3.py @ 3:b86f3e5626f4 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/trna_prediction commit 1b3f92275bb60e606cd4fdc394fb9df95232a5aa
author bgruening
date Tue, 28 Oct 2025 09:15:42 +0000
parents 358f58401cd6
children
comparison
equal deleted inserted replaced
2:358f58401cd6 3:b86f3e5626f4
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 import sys 2 import sys
3 3
4 full_gene_model = False 4 full_gene_model = False
5 if '--full' in sys.argv: 5 if "--full" in sys.argv:
6 full_gene_model = True 6 full_gene_model = True
7 7
8 genome_id = None 8 genome_id = None
9 stdin_data = [] 9 stdin_data = []
10 KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand', 10 KEY_ORDER = (
11 '8', 'quals') 11 "parent",
12 "source",
13 "type",
14 "start",
15 "end",
16 "score",
17 "strand",
18 "8",
19 "quals",
20 )
12 21
13 # Table of amino acids 22 # Table of amino acids
14 aa_table = { 23 aa_table = {
15 'Ala' : 'A', 24 "Ala": "A",
16 'Arg' : 'R', 25 "Arg": "R",
17 'Asn' : 'N', 26 "Asn": "N",
18 'Asp' : 'D', 27 "Asp": "D",
19 'Cys' : 'C', 28 "Cys": "C",
20 'Gln' : 'Q', 29 "Gln": "Q",
21 'Glu' : 'E', 30 "Glu": "E",
22 'Gly' : 'G', 31 "Gly": "G",
23 'His' : 'H', 32 "His": "H",
24 'Ile' : 'I', 33 "Ile": "I",
25 'Leu' : 'L', 34 "Leu": "L",
26 'Lys' : 'K', 35 "Lys": "K",
27 'Met' : 'M', 36 "Met": "M",
28 'Phe' : 'F', 37 "Phe": "F",
29 'Pro' : 'P', 38 "Pro": "P",
30 'Ser' : 'S', 39 "Ser": "S",
31 'Thr' : 'T', 40 "Thr": "T",
32 'Trp' : 'W', 41 "Trp": "W",
33 'Tyr' : 'Y', 42 "Tyr": "Y",
34 'Val' : 'V', 43 "Val": "V",
35 'Pyl' : 'O', 44 "Pyl": "O",
36 'seC' : 'U', 45 "seC": "U",
37 '???' : 'X' } 46 "???": "X",
47 }
48
38 49
39 def output_line(gff3): 50 def output_line(gff3):
40 print '\t'.join(str(gff3[x]) for x in KEY_ORDER) 51 print("\t".join(str(gff3[x]) for x in KEY_ORDER))
41 52
42 print '##gff-version 3' 53
54 print("##gff-version 3")
43 for line in sys.stdin: 55 for line in sys.stdin:
44 if line.startswith('>'): 56 if line.startswith(">"):
45 genome_id = line[1:].strip() 57 genome_id = line[1:].strip()
46 if ' ' in genome_id: 58 if " " in genome_id:
47 genome_id = genome_id[0:genome_id.index(' ')] 59 genome_id = genome_id[0: genome_id.index(" ")]
48 else: 60 else:
49 data = line.split() 61 data = line.split()
50 if len(data) == 5: 62 if len(data) == 5:
51 # Parse data 63 # Parse data
52 strand = '-' if data[2].startswith('c') else '+' 64 strand = "-" if data[2].startswith("c") else "+"
53 start, end = data[2][data[2].index('[') + 1:-1].split(',') 65 start, end = data[2][data[2].index("[") + 1: -1].split(",")
54 66
55 gff3 = { 67 gff3 = {
56 'parent': genome_id, 68 "parent": genome_id,
57 'source': 'aragorn', 69 "source": "aragorn",
58 'start': int(start), 70 "start": int(start),
59 'end': int(end), 71 "end": int(end),
60 'strand': strand, 72 "strand": strand,
61 'score': '.', 73 "score": ".",
62 '8': '.', 74 "8": ".",
63 } 75 }
64 76
65 aa_long = data[1][5:] 77 aa_long = data[1][5:]
66 aa_short = aa_table[aa_long] 78 aa_short = aa_table[aa_long]
67 anticodon = data[4][1:data[4].index(")")].upper().replace("T", "U") 79 anticodon = data[4][1: data[4].index(")")].upper().replace("T", "U")
68 name = 'trn{}-{}'.format(aa_short, anticodon) 80 name = "trn{}-{}".format(aa_short, anticodon)
69 81
70 if not full_gene_model: 82 if not full_gene_model:
71 gff3.update({ 83 gff3.update(
72 'type': 'tRNA', 84 {
73 'quals': 'ID=tRNA{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), 85 "type": "tRNA",
74 }) 86 "quals": "ID=tRNA{0}.{1};Name={name};product={2}".format(
87 genome_id, *data, name=name
88 ),
89 }
90 )
75 output_line(gff3) 91 output_line(gff3)
76 else: 92 else:
77 gff3.update({ 93 gff3.update(
78 'type': 'gene', 94 {
79 'quals': 'ID=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), 95 "type": "gene",
80 }) 96 "quals": "ID=gene{0}.{1};Name={name};product={2}".format(
97 genome_id, *data, name=name
98 ),
99 }
100 )
81 output_line(gff3) 101 output_line(gff3)
82 gff3.update({ 102 gff3.update(
83 'type': 'tRNA', 103 {
84 'quals': 'ID=tRNA{0}.{1};Parent=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), 104 "type": "tRNA",
85 }) 105 "quals": "ID=tRNA{0}.{1};Parent=gene{0}.{1};Name={name};product={2}".format(
106 genome_id, *data, name=name
107 ),
108 }
109 )
86 output_line(gff3) 110 output_line(gff3)
87 111
88 # If no introns 112 # If no introns
89 if ')i(' not in data[4]: 113 if ")i(" not in data[4]:
90 gff3['type'] = 'exon' 114 gff3["type"] = "exon"
91 gff3['quals'] = 'Parent=tRNA{0}.{1}'.format(genome_id, *data) 115 gff3["quals"] = "Parent=tRNA{0}.{1}".format(genome_id, *data)
92 output_line(gff3) 116 output_line(gff3)
93 else: 117 else:
94 intron_location = data[4][data[4].rindex('(') + 1:-1].split(',') 118 intron_location = data[4][data[4].rindex("(") + 1: -1].split(",")
95 intron_start, intron_length = map(int, intron_location) 119 intron_start, intron_length = map(int, intron_location)
96 if strand == '+': 120 if strand == "+":
97 original_end = gff3['end'] 121 original_end = gff3["end"]
98 else: 122 else:
99 original_end = gff3['start'] 123 original_end = gff3["start"]
100 124
101 # EXON 125 # EXON
102 gff3.update({ 126 gff3.update(
103 'type': 'exon', 127 {
104 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), 128 "type": "exon",
105 }) 129 "quals": "Parent=tRNA{0}.{1}".format(genome_id, *data),
106 if strand == '+': 130 }
107 gff3['end'] = gff3['start'] + intron_start - 2 131 )
132 if strand == "+":
133 gff3["end"] = gff3["start"] + intron_start - 2
108 else: 134 else:
109 gff3['start'] = gff3['end'] - intron_start + 2 135 gff3["start"] = gff3["end"] - intron_start + 2
110 136
111 output_line(gff3) 137 output_line(gff3)
112 138
113 # INTRON 139 # INTRON
114 gff3.update({ 140 gff3.update(
115 'type': 'intron', 141 {
116 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), 142 "type": "intron",
117 }) 143 "quals": "Parent=tRNA{0}.{1}".format(genome_id, *data),
118 if strand == '+': 144 }
119 gff3['start'] = gff3['end'] + 1 145 )
120 gff3['end'] = gff3['start'] + intron_length + 2 146 if strand == "+":
147 gff3["start"] = gff3["end"] + 1
148 gff3["end"] = gff3["start"] + intron_length + 2
121 else: 149 else:
122 gff3['end'] = gff3['start'] - 1 150 gff3["end"] = gff3["start"] - 1
123 gff3['start'] = gff3['end'] - intron_length + 1 151 gff3["start"] = gff3["end"] - intron_length + 1
124 152
125 output_line(gff3) 153 output_line(gff3)
126 154
127 # EXON 155 # EXON
128 gff3.update({ 156 gff3.update(
129 'type': 'exon', 157 {
130 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), 158 "type": "exon",
131 }) 159 "quals": "Parent=tRNA{0}.{1}".format(genome_id, *data),
132 if strand == '+': 160 }
133 gff3['start'] = gff3['end'] + 1 161 )
134 gff3['end'] = original_end 162 if strand == "+":
163 gff3["start"] = gff3["end"] + 1
164 gff3["end"] = original_end
135 else: 165 else:
136 gff3['end'] = gff3['start'] - 1 166 gff3["end"] = gff3["start"] - 1
137 gff3['start'] = original_end 167 gff3["start"] = original_end
138 168
139 output_line(gff3) 169 output_line(gff3)