comparison tools/align_back_trans/align_back_trans.py @ 7:883842b81796 draft default tip

"Update all the pico_galaxy tools on main Tool Shed"
author peterjc
date Fri, 16 Apr 2021 22:26:52 +0000
parents b27388e5a0bb
children
comparison
equal deleted inserted replaced
6:b27388e5a0bb 7:883842b81796
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """Back-translate a protein alignment to nucleotides 2 """Back-translate a protein alignment to nucleotides.
3 3
4 This tool is a short Python script (using Biopython library functions) to 4 This tool is a short Python script (using Biopython library functions) to
5 load a protein alignment, and matching nucleotide FASTA file of unaligned 5 load a protein alignment, and matching nucleotide FASTA file of unaligned
6 sequences, in order to produce a codon aware nucleotide alignment - which 6 sequences, in order to produce a codon aware nucleotide alignment - which
7 can be viewed as a back translation. 7 can be viewed as a back translation.
33 print("v0.0.9") 33 print("v0.0.9")
34 sys.exit(0) 34 sys.exit(0)
35 35
36 36
37 def check_trans(identifier, nuc, prot, table): 37 def check_trans(identifier, nuc, prot, table):
38 """Returns nucleotide sequence if works (can remove trailing stop)""" 38 """Return nucleotide sequence, if works (can remove trailing stop)."""
39 if len(nuc) % 3: 39 if len(nuc) % 3:
40 sys.exit("Nucleotide sequence for %s is length %i (not a multiple of three)" 40 sys.exit(
41 % (identifier, len(nuc))) 41 "Nucleotide sequence for %s is length %i (not a multiple of three)"
42 % (identifier, len(nuc))
43 )
42 44
43 p = str(prot).upper().replace("*", "X") 45 p = str(prot).upper().replace("*", "X")
44 t = str(nuc.translate(table)).upper().replace("*", "X") 46 t = str(nuc.translate(table)).upper().replace("*", "X")
45 if len(t) == len(p) + 1: 47 if len(t) == len(p) + 1:
46 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: 48 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons:
47 # Allow this... 49 # Allow this...
48 t = t[:-1] 50 t = t[:-1]
49 nuc = nuc[:-3] # edit return value 51 nuc = nuc[:-3] # edit return value
50 if len(t) != len(p): 52 if len(t) != len(p):
51 err = ("Inconsistent lengths for %s, ungapped protein %i, " 53 err = (
52 "tripled %i vs ungapped nucleotide %i." % 54 "Inconsistent lengths for %s, ungapped protein %i, "
53 (identifier, len(p), len(p) * 3, len(nuc))) 55 "tripled %i vs ungapped nucleotide %i."
56 % (identifier, len(p), len(p) * 3, len(nuc))
57 )
54 if t.endswith(p): 58 if t.endswith(p):
55 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p)) 59 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p))
56 elif t.startswith(p): 60 elif t.startswith(p):
57 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p)) 61 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p))
58 elif p in t: 62 elif p in t:
59 # TODO - Calculate and report the number to trim at start and end? 63 # TODO - Calculate and report the number to trim at start and end?
60 err += "\nHowever, protein sequence found within translated nucleotides." 64 err += "\nHowever, protein sequence found within translated nucleotides."
61 elif p[1:] in t: 65 elif p[1:] in t:
62 err += "\nHowever, ignoring first amino acid, protein sequence found within translated nucleotides." 66 err += "\nHowever, ignoring first amino acid, protein sequence found "
67 "within translated nucleotides."
63 sys.exit(err) 68 sys.exit(err)
64 69
65 if t == p: 70 if t == p:
66 return nuc 71 return nuc
67 elif p.startswith("M") and "M" + t[1:] == p: 72 elif p.startswith("M") and "M" + t[1:] == p:
68 # Close, was there a start codon? 73 # Close, was there a start codon?
69 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: 74 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons:
70 return nuc 75 return nuc
71 else: 76 else:
72 sys.exit("Translation check failed for %s\n" 77 sys.exit(
73 "Would match if %s was a start codon (check correct table used)\n" 78 "Translation check failed for %s\n"
74 % (identifier, nuc[0:3].upper())) 79 "Would match if %s was a start codon (check correct table used)\n"
80 % (identifier, nuc[0:3].upper())
81 )
75 else: 82 else:
76 # Allow * vs X here? e.g. internal stop codons 83 # Allow * vs X here? e.g. internal stop codons
77 m = "".join("." if x == y else "!" for (x, y) in zip(p, t)) 84 m = "".join("." if x == y else "!" for (x, y) in zip(p, t))
78 if len(prot) < 70: 85 if len(prot) < 70:
79 sys.stderr.write("Protein: %s\n" % p) 86 sys.stderr.write("Protein: %s\n" % p)
80 sys.stderr.write(" %s\n" % m) 87 sys.stderr.write(" %s\n" % m)
81 sys.stderr.write("Translation: %s\n" % t) 88 sys.stderr.write("Translation: %s\n" % t)
82 else: 89 else:
83 for offset in range(0, len(p), 60): 90 for offset in range(0, len(p), 60):
84 sys.stderr.write("Protein: %s\n" % p[offset:offset + 60]) 91 sys.stderr.write("Protein: %s\n" % p[offset : offset + 60])
85 sys.stderr.write(" %s\n" % m[offset:offset + 60]) 92 sys.stderr.write(" %s\n" % m[offset : offset + 60])
86 sys.stderr.write("Translation: %s\n\n" % t[offset:offset + 60]) 93 sys.stderr.write("Translation: %s\n\n" % t[offset : offset + 60])
87 sys.exit("Translation check failed for %s\n" % identifier) 94 sys.exit("Translation check failed for %s\n" % identifier)
88 95
89 96
90 def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0): 97 def sequence_back_translate(
98 aligned_protein_record, unaligned_nucleotide_record, gap, table=0
99 ):
91 """Back-translate a sequence.""" 100 """Back-translate a sequence."""
92 # TODO - Separate arguments for protein gap and nucleotide gap? 101 # TODO - Separate arguments for protein gap and nucleotide gap?
93 if not gap or len(gap) != 1: 102 if not gap or len(gap) != 1:
94 raise ValueError("Please supply a single gap character") 103 raise ValueError("Please supply a single gap character")
95 104
97 if hasattr(alpha, "gap_char"): 106 if hasattr(alpha, "gap_char"):
98 gap_codon = alpha.gap_char * 3 107 gap_codon = alpha.gap_char * 3
99 assert len(gap_codon) == 3 108 assert len(gap_codon) == 3
100 else: 109 else:
101 from Bio.Alphabet import Gapped 110 from Bio.Alphabet import Gapped
111
102 alpha = Gapped(alpha, gap) 112 alpha = Gapped(alpha, gap)
103 gap_codon = gap * 3 113 gap_codon = gap * 3
104 114
105 ungapped_protein = aligned_protein_record.seq.ungap(gap) 115 ungapped_protein = aligned_protein_record.seq.ungap(gap)
106 ungapped_nucleotide = unaligned_nucleotide_record.seq 116 ungapped_nucleotide = unaligned_nucleotide_record.seq
107 if table: 117 if table:
108 ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table) 118 ungapped_nucleotide = check_trans(
119 aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table
120 )
109 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): 121 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide):
110 sys.exit("Inconsistent lengths for %s, ungapped protein %i, " 122 sys.exit(
111 "tripled %i vs ungapped nucleotide %i" % 123 "Inconsistent lengths for %s, ungapped protein %i, "
112 (aligned_protein_record.id, 124 "tripled %i vs ungapped nucleotide %i"
113 len(ungapped_protein), 125 % (
114 len(ungapped_protein) * 3, 126 aligned_protein_record.id,
115 len(ungapped_nucleotide))) 127 len(ungapped_protein),
128 len(ungapped_protein) * 3,
129 len(ungapped_nucleotide),
130 )
131 )
116 132
117 seq = [] 133 seq = []
118 nuc = str(ungapped_nucleotide) 134 nuc = str(ungapped_nucleotide)
119 for amino_acid in aligned_protein_record.seq: 135 for amino_acid in aligned_protein_record.seq:
120 if amino_acid == gap: 136 if amino_acid == gap:
121 seq.append(gap_codon) 137 seq.append(gap_codon)
122 else: 138 else:
123 seq.append(nuc[:3]) 139 seq.append(nuc[:3])
124 nuc = nuc[3:] 140 nuc = nuc[3:]
125 assert not nuc, "Nucleotide sequence for %r longer than protein %r" \ 141 assert not nuc, "Nucleotide sequence for %r longer than protein %r" % (
126 % (unaligned_nucleotide_record.id, aligned_protein_record.id) 142 unaligned_nucleotide_record.id,
143 aligned_protein_record.id,
144 )
127 145
128 aligned_nuc = unaligned_nucleotide_record[:] # copy for most annotation 146 aligned_nuc = unaligned_nucleotide_record[:] # copy for most annotation
129 aligned_nuc.letter_annotation = {} # clear this 147 aligned_nuc.letter_annotation = {} # clear this
130 aligned_nuc.seq = Seq("".join(seq), alpha) # replace this 148 aligned_nuc.seq = Seq("".join(seq), alpha) # replace this
131 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) 149 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc)
132 return aligned_nuc 150 return aligned_nuc
133 151
134 152
135 def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0): 153 def alignment_back_translate(
154 protein_alignment, nucleotide_records, key_function=None, gap=None, table=0
155 ):
136 """Thread nucleotide sequences onto a protein alignment.""" 156 """Thread nucleotide sequences onto a protein alignment."""
137 # TODO - Separate arguments for protein and nucleotide gap characters? 157 # TODO - Separate arguments for protein and nucleotide gap characters?
138 if gap is None: 158 if gap is None:
139 gap = "-" 159 gap = "-"
140 160
147 else: 167 else:
148 for protein in protein_alignment: 168 for protein in protein_alignment:
149 nucleotide = nucleotide_records[key_function(protein.id)] 169 nucleotide = nucleotide_records[key_function(protein.id)]
150 aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) 170 aligned.append(sequence_back_translate(protein, nucleotide, gap, table))
151 except KeyError: 171 except KeyError:
152 raise ValueError("Could not find nucleotide sequence for protein %r" 172 raise ValueError(
153 % protein.id) 173 "Could not find nucleotide sequence for protein %r" % protein.id
174 )
154 175
155 return MultipleSeqAlignment(aligned) 176 return MultipleSeqAlignment(aligned)
156 177
157 178
158 if len(sys.argv) == 4: 179 if len(sys.argv) == 4:
163 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] 184 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:]
164 table = 0 185 table = 0
165 elif len(sys.argv) == 6: 186 elif len(sys.argv) == 6:
166 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] 187 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:]
167 else: 188 else:
168 sys.exit("""This is a Python script for 'back-translating' a protein alignment, 189 sys.exit(
190 """This is a Python script for 'back-translating' a protein alignment,
169 191
170 It requires three, four or five arguments: 192 It requires three, four or five arguments:
171 - alignment format (e.g. fasta, clustal), 193 - alignment format (e.g. fasta, clustal),
172 - aligned protein file (in specified format), 194 - aligned protein file (in specified format),
173 - unaligned nucleotide file (in fasta format). 195 - unaligned nucleotide file (in fasta format).
183 Warning: If the output file already exists, it will be overwritten. 205 Warning: If the output file already exists, it will be overwritten.
184 206
185 This script is available with sample data and a Galaxy wrapper here: 207 This script is available with sample data and a Galaxy wrapper here:
186 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans 208 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
187 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans 209 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
188 """) 210 """ # noqa: E501
211 )
189 212
190 try: 213 try:
191 table = int(table) 214 table = int(table)
192 except ValueError: 215 except ValueError:
193 sys.exit("Bad table argument %r" % table) 216 sys.exit("Bad table argument %r" % table)