comparison remove_terminal_stop_codons.py @ 0:0290a7285026 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/remove_terminal_stop_codons commit 0c1c0e260ebecab6beb23fd56322b391e62d12fa
author iuc
date Fri, 05 Dec 2025 23:22:35 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0290a7285026
1 #!/usr/bin/env python3
2 """
3 Remove terminal stop codons from coding sequences.
4
5 Trim all terminal stop codons from sequences in a FASTA file, using a chosen
6 NCBI genetic code (translation table). Leave non-stop terminal codons alone.
7 If any INTERNAL, in-frame stop codon is found, exit with an error.
8
9 This tool is designed as a preprocessing step for tools like cawlign and HyPhy
10 that do not permit internal stop codons in their input sequences.
11
12 Requires: Biopython
13 """
14
15 import argparse
16 import sys
17
18 from Bio import SeqIO
19 from Bio.Data import CodonTable
20 from Bio.Seq import Seq
21
22
23 def load_table(table_arg):
24 """Return a DNA codon table from an NCBI table id (int) or name (str)."""
25 if table_arg is None:
26 return CodonTable.unambiguous_dna_by_id[1] # Standard
27 # try as integer id
28 try:
29 tid = int(table_arg)
30 return CodonTable.unambiguous_dna_by_id[tid]
31 except (ValueError, KeyError):
32 pass
33 # try as name
34 try:
35 return CodonTable.unambiguous_dna_by_name[table_arg]
36 except KeyError:
37 # Build a helpful hint list
38 valid_ids = sorted(CodonTable.unambiguous_dna_by_id.keys())
39 valid_names = sorted(CodonTable.unambiguous_dna_by_name.keys())
40 sys.stderr.write(
41 f"ERROR: Unknown genetic code '{table_arg}'.\n"
42 f"Try an NCBI table id (e.g., 1) or one of these names:\n"
43 f" {', '.join(valid_names)}\n"
44 f"(Valid ids include: {', '.join(map(str, valid_ids))})\n"
45 )
46 sys.exit(2)
47
48
49 def trim_terminal_stops_and_validate(record, stop_codons, check_internal=True):
50 """
51 Remove ALL trailing stop codons (0+ at the end).
52
53 If check_internal is True and any internal in-frame stop codon exists
54 (excluding the trailing block), exit with an error message.
55
56 Ignore a terminal codon that is not a stop codon.
57 """
58 # Work with DNA letters; treat any RNA U as T
59 seq_str = str(record.seq).upper().replace("U", "T")
60
61 # Count how many full codons sit at the end that are stops
62 idx = len(seq_str)
63 trailing_stops = 0
64 while idx >= 3:
65 codon = seq_str[idx - 3:idx]
66 if codon in stop_codons:
67 trailing_stops += 1
68 idx -= 3
69 else:
70 break
71
72 # Scan for INTERNAL stops: all complete codons up to (but not including)
73 # the trailing stop block (and ignoring any trailing partial codon).
74 if check_internal:
75 scan_end = (idx // 3) * 3 # only complete codons
76 for pos in range(0, scan_end, 3):
77 codon = seq_str[pos:pos + 3]
78 if codon in stop_codons:
79 sys.stderr.write(
80 f"ERROR: Found an internal stop codon in sequence "
81 f"'{record.id}' at position {pos}.\n"
82 f"Tools like HyPhy and cawlign do not permit internal "
83 f"stop codons. Please review your input sequences.\n"
84 )
85 sys.exit(2)
86
87 # Finally, remove the trailing stop codons (if any)
88 if trailing_stops > 0:
89 seq_str = seq_str[:idx]
90
91 # Leave sequences with non-stop terminal codons unchanged by design
92 return Seq(seq_str)
93
94
95 def main():
96 ap = argparse.ArgumentParser(
97 description="Remove all terminal stop codons from a FASTA, using a "
98 "chosen genetic code. Optionally fail if any internal "
99 "in-frame stop codon is present."
100 )
101 ap.add_argument("-i", "--input", required=True, help="Input FASTA file")
102 ap.add_argument("-o", "--output", required=True, help="Output FASTA file")
103 ap.add_argument(
104 "-t", "--table",
105 help="NCBI translation table id (e.g., 1) or name "
106 "(e.g., 'Vertebrate Mitochondrial'). Default: 1 (Standard)."
107 )
108 ap.add_argument(
109 "--no-check-internal",
110 action="store_true",
111 help="Do not check for internal stop codons (only remove terminal)."
112 )
113 args = ap.parse_args()
114
115 table = load_table(args.table)
116 stop_codons = set(table.stop_codons) # e.g., {'TAA','TAG','TGA'} for Standard
117
118 check_internal = not args.no_check_internal
119
120 records_out = []
121 for rec in SeqIO.parse(args.input, "fasta"):
122 new_seq = trim_terminal_stops_and_validate(rec, stop_codons, check_internal)
123 rec.seq = new_seq
124 records_out.append(rec)
125
126 SeqIO.write(records_out, args.output, "fasta")
127
128
129 if __name__ == "__main__":
130 main()