Mercurial > repos > iuc > remove_terminal_stop_codons
comparison remove_terminal_stop_codons.py @ 0:0290a7285026 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/remove_terminal_stop_codons commit 0c1c0e260ebecab6beb23fd56322b391e62d12fa
| author | iuc |
|---|---|
| date | Fri, 05 Dec 2025 23:22:35 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0290a7285026 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 """ | |
| 3 Remove terminal stop codons from coding sequences. | |
| 4 | |
| 5 Trim all terminal stop codons from sequences in a FASTA file, using a chosen | |
| 6 NCBI genetic code (translation table). Leave non-stop terminal codons alone. | |
| 7 If any INTERNAL, in-frame stop codon is found, exit with an error. | |
| 8 | |
| 9 This tool is designed as a preprocessing step for tools like cawlign and HyPhy | |
| 10 that do not permit internal stop codons in their input sequences. | |
| 11 | |
| 12 Requires: Biopython | |
| 13 """ | |
| 14 | |
| 15 import argparse | |
| 16 import sys | |
| 17 | |
| 18 from Bio import SeqIO | |
| 19 from Bio.Data import CodonTable | |
| 20 from Bio.Seq import Seq | |
| 21 | |
| 22 | |
| 23 def load_table(table_arg): | |
| 24 """Return a DNA codon table from an NCBI table id (int) or name (str).""" | |
| 25 if table_arg is None: | |
| 26 return CodonTable.unambiguous_dna_by_id[1] # Standard | |
| 27 # try as integer id | |
| 28 try: | |
| 29 tid = int(table_arg) | |
| 30 return CodonTable.unambiguous_dna_by_id[tid] | |
| 31 except (ValueError, KeyError): | |
| 32 pass | |
| 33 # try as name | |
| 34 try: | |
| 35 return CodonTable.unambiguous_dna_by_name[table_arg] | |
| 36 except KeyError: | |
| 37 # Build a helpful hint list | |
| 38 valid_ids = sorted(CodonTable.unambiguous_dna_by_id.keys()) | |
| 39 valid_names = sorted(CodonTable.unambiguous_dna_by_name.keys()) | |
| 40 sys.stderr.write( | |
| 41 f"ERROR: Unknown genetic code '{table_arg}'.\n" | |
| 42 f"Try an NCBI table id (e.g., 1) or one of these names:\n" | |
| 43 f" {', '.join(valid_names)}\n" | |
| 44 f"(Valid ids include: {', '.join(map(str, valid_ids))})\n" | |
| 45 ) | |
| 46 sys.exit(2) | |
| 47 | |
| 48 | |
| 49 def trim_terminal_stops_and_validate(record, stop_codons, check_internal=True): | |
| 50 """ | |
| 51 Remove ALL trailing stop codons (0+ at the end). | |
| 52 | |
| 53 If check_internal is True and any internal in-frame stop codon exists | |
| 54 (excluding the trailing block), exit with an error message. | |
| 55 | |
| 56 Ignore a terminal codon that is not a stop codon. | |
| 57 """ | |
| 58 # Work with DNA letters; treat any RNA U as T | |
| 59 seq_str = str(record.seq).upper().replace("U", "T") | |
| 60 | |
| 61 # Count how many full codons sit at the end that are stops | |
| 62 idx = len(seq_str) | |
| 63 trailing_stops = 0 | |
| 64 while idx >= 3: | |
| 65 codon = seq_str[idx - 3:idx] | |
| 66 if codon in stop_codons: | |
| 67 trailing_stops += 1 | |
| 68 idx -= 3 | |
| 69 else: | |
| 70 break | |
| 71 | |
| 72 # Scan for INTERNAL stops: all complete codons up to (but not including) | |
| 73 # the trailing stop block (and ignoring any trailing partial codon). | |
| 74 if check_internal: | |
| 75 scan_end = (idx // 3) * 3 # only complete codons | |
| 76 for pos in range(0, scan_end, 3): | |
| 77 codon = seq_str[pos:pos + 3] | |
| 78 if codon in stop_codons: | |
| 79 sys.stderr.write( | |
| 80 f"ERROR: Found an internal stop codon in sequence " | |
| 81 f"'{record.id}' at position {pos}.\n" | |
| 82 f"Tools like HyPhy and cawlign do not permit internal " | |
| 83 f"stop codons. Please review your input sequences.\n" | |
| 84 ) | |
| 85 sys.exit(2) | |
| 86 | |
| 87 # Finally, remove the trailing stop codons (if any) | |
| 88 if trailing_stops > 0: | |
| 89 seq_str = seq_str[:idx] | |
| 90 | |
| 91 # Leave sequences with non-stop terminal codons unchanged by design | |
| 92 return Seq(seq_str) | |
| 93 | |
| 94 | |
| 95 def main(): | |
| 96 ap = argparse.ArgumentParser( | |
| 97 description="Remove all terminal stop codons from a FASTA, using a " | |
| 98 "chosen genetic code. Optionally fail if any internal " | |
| 99 "in-frame stop codon is present." | |
| 100 ) | |
| 101 ap.add_argument("-i", "--input", required=True, help="Input FASTA file") | |
| 102 ap.add_argument("-o", "--output", required=True, help="Output FASTA file") | |
| 103 ap.add_argument( | |
| 104 "-t", "--table", | |
| 105 help="NCBI translation table id (e.g., 1) or name " | |
| 106 "(e.g., 'Vertebrate Mitochondrial'). Default: 1 (Standard)." | |
| 107 ) | |
| 108 ap.add_argument( | |
| 109 "--no-check-internal", | |
| 110 action="store_true", | |
| 111 help="Do not check for internal stop codons (only remove terminal)." | |
| 112 ) | |
| 113 args = ap.parse_args() | |
| 114 | |
| 115 table = load_table(args.table) | |
| 116 stop_codons = set(table.stop_codons) # e.g., {'TAA','TAG','TGA'} for Standard | |
| 117 | |
| 118 check_internal = not args.no_check_internal | |
| 119 | |
| 120 records_out = [] | |
| 121 for rec in SeqIO.parse(args.input, "fasta"): | |
| 122 new_seq = trim_terminal_stops_and_validate(rec, stop_codons, check_internal) | |
| 123 rec.seq = new_seq | |
| 124 records_out.append(rec) | |
| 125 | |
| 126 SeqIO.write(records_out, args.output, "fasta") | |
| 127 | |
| 128 | |
| 129 if __name__ == "__main__": | |
| 130 main() |
