Mercurial > repos > bgruening > trna_prediction
comparison aragorn_out_to_gff3.py @ 3:b86f3e5626f4 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/trna_prediction commit 1b3f92275bb60e606cd4fdc394fb9df95232a5aa
| author | bgruening |
|---|---|
| date | Tue, 28 Oct 2025 09:15:42 +0000 |
| parents | 358f58401cd6 |
| children | |
Comparison legend: equal, deleted, inserted, replaced
| 2:358f58401cd6 | 3:b86f3e5626f4 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 import sys | 2 import sys |
| 3 | 3 |
| 4 full_gene_model = False | 4 full_gene_model = False |
| 5 if '--full' in sys.argv: | 5 if "--full" in sys.argv: |
| 6 full_gene_model = True | 6 full_gene_model = True |
| 7 | 7 |
| 8 genome_id = None | 8 genome_id = None |
| 9 stdin_data = [] | 9 stdin_data = [] |
| 10 KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand', | 10 KEY_ORDER = ( |
| 11 '8', 'quals') | 11 "parent", |
| 12 "source", | |
| 13 "type", | |
| 14 "start", | |
| 15 "end", | |
| 16 "score", | |
| 17 "strand", | |
| 18 "8", | |
| 19 "quals", | |
| 20 ) | |
| 12 | 21 |
| 13 # Table of amino acids | 22 # Table of amino acids |
| 14 aa_table = { | 23 aa_table = { |
| 15 'Ala' : 'A', | 24 "Ala": "A", |
| 16 'Arg' : 'R', | 25 "Arg": "R", |
| 17 'Asn' : 'N', | 26 "Asn": "N", |
| 18 'Asp' : 'D', | 27 "Asp": "D", |
| 19 'Cys' : 'C', | 28 "Cys": "C", |
| 20 'Gln' : 'Q', | 29 "Gln": "Q", |
| 21 'Glu' : 'E', | 30 "Glu": "E", |
| 22 'Gly' : 'G', | 31 "Gly": "G", |
| 23 'His' : 'H', | 32 "His": "H", |
| 24 'Ile' : 'I', | 33 "Ile": "I", |
| 25 'Leu' : 'L', | 34 "Leu": "L", |
| 26 'Lys' : 'K', | 35 "Lys": "K", |
| 27 'Met' : 'M', | 36 "Met": "M", |
| 28 'Phe' : 'F', | 37 "Phe": "F", |
| 29 'Pro' : 'P', | 38 "Pro": "P", |
| 30 'Ser' : 'S', | 39 "Ser": "S", |
| 31 'Thr' : 'T', | 40 "Thr": "T", |
| 32 'Trp' : 'W', | 41 "Trp": "W", |
| 33 'Tyr' : 'Y', | 42 "Tyr": "Y", |
| 34 'Val' : 'V', | 43 "Val": "V", |
| 35 'Pyl' : 'O', | 44 "Pyl": "O", |
| 36 'seC' : 'U', | 45 "seC": "U", |
| 37 '???' : 'X' } | 46 "???": "X", |
| 47 } | |
| 48 | |
| 38 | 49 |
| 39 def output_line(gff3): | 50 def output_line(gff3): |
| 40 print '\t'.join(str(gff3[x]) for x in KEY_ORDER) | 51 print("\t".join(str(gff3[x]) for x in KEY_ORDER)) |
| 41 | 52 |
| 42 print '##gff-version 3' | 53 |
| 54 print("##gff-version 3") | |
| 43 for line in sys.stdin: | 55 for line in sys.stdin: |
| 44 if line.startswith('>'): | 56 if line.startswith(">"): |
| 45 genome_id = line[1:].strip() | 57 genome_id = line[1:].strip() |
| 46 if ' ' in genome_id: | 58 if " " in genome_id: |
| 47 genome_id = genome_id[0:genome_id.index(' ')] | 59 genome_id = genome_id[0: genome_id.index(" ")] |
| 48 else: | 60 else: |
| 49 data = line.split() | 61 data = line.split() |
| 50 if len(data) == 5: | 62 if len(data) == 5: |
| 51 # Parse data | 63 # Parse data |
| 52 strand = '-' if data[2].startswith('c') else '+' | 64 strand = "-" if data[2].startswith("c") else "+" |
| 53 start, end = data[2][data[2].index('[') + 1:-1].split(',') | 65 start, end = data[2][data[2].index("[") + 1: -1].split(",") |
| 54 | 66 |
| 55 gff3 = { | 67 gff3 = { |
| 56 'parent': genome_id, | 68 "parent": genome_id, |
| 57 'source': 'aragorn', | 69 "source": "aragorn", |
| 58 'start': int(start), | 70 "start": int(start), |
| 59 'end': int(end), | 71 "end": int(end), |
| 60 'strand': strand, | 72 "strand": strand, |
| 61 'score': '.', | 73 "score": ".", |
| 62 '8': '.', | 74 "8": ".", |
| 63 } | 75 } |
| 64 | 76 |
| 65 aa_long = data[1][5:] | 77 aa_long = data[1][5:] |
| 66 aa_short = aa_table[aa_long] | 78 aa_short = aa_table[aa_long] |
| 67 anticodon = data[4][1:data[4].index(")")].upper().replace("T", "U") | 79 anticodon = data[4][1: data[4].index(")")].upper().replace("T", "U") |
| 68 name = 'trn{}-{}'.format(aa_short, anticodon) | 80 name = "trn{}-{}".format(aa_short, anticodon) |
| 69 | 81 |
| 70 if not full_gene_model: | 82 if not full_gene_model: |
| 71 gff3.update({ | 83 gff3.update( |
| 72 'type': 'tRNA', | 84 { |
| 73 'quals': 'ID=tRNA{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), | 85 "type": "tRNA", |
| 74 }) | 86 "quals": "ID=tRNA{0}.{1};Name={name};product={2}".format( |
| 87 genome_id, *data, name=name | |
| 88 ), | |
| 89 } | |
| 90 ) | |
| 75 output_line(gff3) | 91 output_line(gff3) |
| 76 else: | 92 else: |
| 77 gff3.update({ | 93 gff3.update( |
| 78 'type': 'gene', | 94 { |
| 79 'quals': 'ID=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), | 95 "type": "gene", |
| 80 }) | 96 "quals": "ID=gene{0}.{1};Name={name};product={2}".format( |
| 97 genome_id, *data, name=name | |
| 98 ), | |
| 99 } | |
| 100 ) | |
| 81 output_line(gff3) | 101 output_line(gff3) |
| 82 gff3.update({ | 102 gff3.update( |
| 83 'type': 'tRNA', | 103 { |
| 84 'quals': 'ID=tRNA{0}.{1};Parent=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), | 104 "type": "tRNA", |
| 85 }) | 105 "quals": "ID=tRNA{0}.{1};Parent=gene{0}.{1};Name={name};product={2}".format( |
| 106 genome_id, *data, name=name | |
| 107 ), | |
| 108 } | |
| 109 ) | |
| 86 output_line(gff3) | 110 output_line(gff3) |
| 87 | 111 |
| 88 # If no introns | 112 # If no introns |
| 89 if ')i(' not in data[4]: | 113 if ")i(" not in data[4]: |
| 90 gff3['type'] = 'exon' | 114 gff3["type"] = "exon" |
| 91 gff3['quals'] = 'Parent=tRNA{0}.{1}'.format(genome_id, *data) | 115 gff3["quals"] = "Parent=tRNA{0}.{1}".format(genome_id, *data) |
| 92 output_line(gff3) | 116 output_line(gff3) |
| 93 else: | 117 else: |
| 94 intron_location = data[4][data[4].rindex('(') + 1:-1].split(',') | 118 intron_location = data[4][data[4].rindex("(") + 1: -1].split(",") |
| 95 intron_start, intron_length = map(int, intron_location) | 119 intron_start, intron_length = map(int, intron_location) |
| 96 if strand == '+': | 120 if strand == "+": |
| 97 original_end = gff3['end'] | 121 original_end = gff3["end"] |
| 98 else: | 122 else: |
| 99 original_end = gff3['start'] | 123 original_end = gff3["start"] |
| 100 | 124 |
| 101 # EXON | 125 # EXON |
| 102 gff3.update({ | 126 gff3.update( |
| 103 'type': 'exon', | 127 { |
| 104 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), | 128 "type": "exon", |
| 105 }) | 129 "quals": "Parent=tRNA{0}.{1}".format(genome_id, *data), |
| 106 if strand == '+': | 130 } |
| 107 gff3['end'] = gff3['start'] + intron_start - 2 | 131 ) |
| 132 if strand == "+": | |
| 133 gff3["end"] = gff3["start"] + intron_start - 2 | |
| 108 else: | 134 else: |
| 109 gff3['start'] = gff3['end'] - intron_start + 2 | 135 gff3["start"] = gff3["end"] - intron_start + 2 |
| 110 | 136 |
| 111 output_line(gff3) | 137 output_line(gff3) |
| 112 | 138 |
| 113 # INTRON | 139 # INTRON |
| 114 gff3.update({ | 140 gff3.update( |
| 115 'type': 'intron', | 141 { |
| 116 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), | 142 "type": "intron", |
| 117 }) | 143 "quals": "Parent=tRNA{0}.{1}".format(genome_id, *data), |
| 118 if strand == '+': | 144 } |
| 119 gff3['start'] = gff3['end'] + 1 | 145 ) |
| 120 gff3['end'] = gff3['start'] + intron_length + 2 | 146 if strand == "+": |
| 147 gff3["start"] = gff3["end"] + 1 | |
| 148 gff3["end"] = gff3["start"] + intron_length + 2 | |
| 121 else: | 149 else: |
| 122 gff3['end'] = gff3['start'] - 1 | 150 gff3["end"] = gff3["start"] - 1 |
| 123 gff3['start'] = gff3['end'] - intron_length + 1 | 151 gff3["start"] = gff3["end"] - intron_length + 1 |
| 124 | 152 |
| 125 output_line(gff3) | 153 output_line(gff3) |
| 126 | 154 |
| 127 # EXON | 155 # EXON |
| 128 gff3.update({ | 156 gff3.update( |
| 129 'type': 'exon', | 157 { |
| 130 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), | 158 "type": "exon", |
| 131 }) | 159 "quals": "Parent=tRNA{0}.{1}".format(genome_id, *data), |
| 132 if strand == '+': | 160 } |
| 133 gff3['start'] = gff3['end'] + 1 | 161 ) |
| 134 gff3['end'] = original_end | 162 if strand == "+": |
| 163 gff3["start"] = gff3["end"] + 1 | |
| 164 gff3["end"] = original_end | |
| 135 else: | 165 else: |
| 136 gff3['end'] = gff3['start'] - 1 | 166 gff3["end"] = gff3["start"] - 1 |
| 137 gff3['start'] = original_end | 167 gff3["start"] = original_end |
| 138 | 168 |
| 139 output_line(gff3) | 169 output_line(gff3) |
