align_back_trans: tools/align_back_trans/align_back

comparison tools/align_back_trans/align_back_trans.py @ 7:883842b81796 draft default tip

"Update all the pico_galaxy tools on main Tool Shed"

author	peterjc
date	Fri, 16 Apr 2021 22:26:52 +0000
parents	b27388e5a0bb
children

comparison

equal deleted inserted replaced

-:b27388e5a0bb
+:883842b81796
 #!/usr/bin/env python
-"""Back-translate a protein alignment to nucleotides
+"""Back-translate a protein alignment to nucleotides.
 This tool is a short Python script (using Biopython library functions) to
 load a protein alignment, and matching nucleotide FASTA file of unaligned
 sequences, in order to produce a codon aware nucleotide alignment - which
 can be viewed as a back translation.
 print("v0.0.9")
 sys.exit(0)
 def check_trans(identifier, nuc, prot, table):
-"""Returns nucleotide sequence if works (can remove trailing stop)"""
+"""Return nucleotide sequence, if works (can remove trailing stop)."""
 if len(nuc) % 3:
-sys.exit("Nucleotide sequence for %s is length %i (not a multiple of three)"
+sys.exit(
-% (identifier, len(nuc)))
+"Nucleotide sequence for %s is length %i (not a multiple of three)"
+% (identifier, len(nuc))
+)
 p = str(prot).upper().replace("*", "X")
 t = str(nuc.translate(table)).upper().replace("*", "X")
 if len(t) == len(p) + 1:
 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons:
 # Allow this...
 t = t[:-1]
 nuc = nuc[:-3]  # edit return value
 if len(t) != len(p):
-err = ("Inconsistent lengths for %s, ungapped protein %i, "
+err = (
-"tripled %i vs ungapped nucleotide %i." %
+"Inconsistent lengths for %s, ungapped protein %i, "
-(identifier, len(p), len(p) * 3, len(nuc)))
+"tripled %i vs ungapped nucleotide %i."
+% (identifier, len(p), len(p) * 3, len(nuc))
+)
 if t.endswith(p):
 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p))
 elif t.startswith(p):
 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p))
 elif p in t:
 # TODO - Calculate and report the number to trim at start and end?
 err += "\nHowever, protein sequence found within translated nucleotides."
 elif p[1:] in t:
-err += "\nHowever, ignoring first amino acid, protein sequence found within translated nucleotides."
+err += "\nHowever, ignoring first amino acid, protein sequence found "
+"within translated nucleotides."
 sys.exit(err)
 if t == p:
 return nuc
 elif p.startswith("M") and "M" + t[1:] == p:
 # Close, was there a start codon?
 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons:
 return nuc
 else:
-sys.exit("Translation check failed for %s\n"
+sys.exit(
-"Would match if %s was a start codon (check correct table used)\n"
+"Translation check failed for %s\n"
-% (identifier, nuc[0:3].upper()))
+"Would match if %s was a start codon (check correct table used)\n"
+% (identifier, nuc[0:3].upper())
+)
 else:
 # Allow * vs X here? e.g. internal stop codons
 m = "".join("." if x == y else "!" for (x, y) in zip(p, t))
 if len(prot) < 70:
 sys.stderr.write("Protein:     %s\n" % p)
 sys.stderr.write("             %s\n" % m)
 sys.stderr.write("Translation: %s\n" % t)
 else:
 for offset in range(0, len(p), 60):
-sys.stderr.write("Protein:     %s\n" % p[offset:offset + 60])
+sys.stderr.write("Protein:     %s\n" % p[offset : offset + 60])
-sys.stderr.write("             %s\n" % m[offset:offset + 60])
+sys.stderr.write("             %s\n" % m[offset : offset + 60])
-sys.stderr.write("Translation: %s\n\n" % t[offset:offset + 60])
+sys.stderr.write("Translation: %s\n\n" % t[offset : offset + 60])
 sys.exit("Translation check failed for %s\n" % identifier)
-def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0):
+def sequence_back_translate(
+aligned_protein_record, unaligned_nucleotide_record, gap, table=0
+):
 """Back-translate a sequence."""
 # TODO - Separate arguments for protein gap and nucleotide gap?
 if not gap or len(gap) != 1:
 raise ValueError("Please supply a single gap character")
 if hasattr(alpha, "gap_char"):
 gap_codon = alpha.gap_char * 3
 assert len(gap_codon) == 3
 else:
 from Bio.Alphabet import Gapped
 alpha = Gapped(alpha, gap)
 gap_codon = gap * 3
 ungapped_protein = aligned_protein_record.seq.ungap(gap)
 ungapped_nucleotide = unaligned_nucleotide_record.seq
 if table:
-ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table)
+ungapped_nucleotide = check_trans(
+aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table
+)
 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide):
-sys.exit("Inconsistent lengths for %s, ungapped protein %i, "
+sys.exit(
-"tripled %i vs ungapped nucleotide %i" %
+"Inconsistent lengths for %s, ungapped protein %i, "
-(aligned_protein_record.id,
+"tripled %i vs ungapped nucleotide %i"
-len(ungapped_protein),
+% (
-len(ungapped_protein) * 3,
+aligned_protein_record.id,
-len(ungapped_nucleotide)))
+len(ungapped_protein),
+len(ungapped_protein) * 3,
+len(ungapped_nucleotide),
+)
+)
 seq = []
 nuc = str(ungapped_nucleotide)
 for amino_acid in aligned_protein_record.seq:
 if amino_acid == gap:
 seq.append(gap_codon)
 else:
 seq.append(nuc[:3])
 nuc = nuc[3:]
-assert not nuc, "Nucleotide sequence for %r longer than protein %r" \
+assert not nuc, "Nucleotide sequence for %r longer than protein %r" % (
-% (unaligned_nucleotide_record.id, aligned_protein_record.id)
+unaligned_nucleotide_record.id,
+aligned_protein_record.id,
+)
 aligned_nuc = unaligned_nucleotide_record[:]  # copy for most annotation
 aligned_nuc.letter_annotation = {}  # clear this
 aligned_nuc.seq = Seq("".join(seq), alpha)  # replace this
 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc)
 return aligned_nuc
-def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0):
+def alignment_back_translate(
+protein_alignment, nucleotide_records, key_function=None, gap=None, table=0
+):
 """Thread nucleotide sequences onto a protein alignment."""
 # TODO - Separate arguments for protein and nucleotide gap characters?
 if gap is None:
 gap = "-"
 else:
 for protein in protein_alignment:
 nucleotide = nucleotide_records[key_function(protein.id)]
 aligned.append(sequence_back_translate(protein, nucleotide, gap, table))
 except KeyError:
-raise ValueError("Could not find nucleotide sequence for protein %r"
+raise ValueError(
-% protein.id)
+"Could not find nucleotide sequence for protein %r" % protein.id
+)
 return MultipleSeqAlignment(aligned)
 if len(sys.argv) == 4:
 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:]
 table = 0
 elif len(sys.argv) == 6:
 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:]
 else:
-sys.exit("""This is a Python script for 'back-translating' a protein alignment,
+sys.exit(
+"""This is a Python script for 'back-translating' a protein alignment,
 It requires three, four or five arguments:
 - alignment format (e.g. fasta, clustal),
 - aligned protein file (in specified format),
 - unaligned nucleotide file (in fasta format).
 Warning: If the output file already exists, it will be overwritten.
 This script is available with sample data and a Galaxy wrapper here:
 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
-""")
+"""  # noqa: E501
+)
 try:
 table = int(table)
 except ValueError:
 sys.exit("Bad table argument %r" % table)

Mercurial > repos > peterjc > align_back_trans

comparison tools/align_back_trans/align_back_trans.py @ 7:883842b81796 draft default tip