Mercurial > repos > peterjc > align_back_trans
comparison tools/align_back_trans/align_back_trans.py @ 7:883842b81796 draft default tip
"Update all the pico_galaxy tools on main Tool Shed"
author | peterjc |
---|---|
date | Fri, 16 Apr 2021 22:26:52 +0000 |
parents | b27388e5a0bb |
children |
comparison
equal
deleted
inserted
replaced
6:b27388e5a0bb | 7:883842b81796 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """Back-translate a protein alignment to nucleotides | 2 """Back-translate a protein alignment to nucleotides. |
3 | 3 |
4 This tool is a short Python script (using Biopython library functions) to | 4 This tool is a short Python script (using Biopython library functions) to |
5 load a protein alignment, and matching nucleotide FASTA file of unaligned | 5 load a protein alignment, and matching nucleotide FASTA file of unaligned |
6 sequences, in order to produce a codon aware nucleotide alignment - which | 6 sequences, in order to produce a codon aware nucleotide alignment - which |
7 can be viewed as a back translation. | 7 can be viewed as a back translation. |
33 print("v0.0.9") | 33 print("v0.0.9") |
34 sys.exit(0) | 34 sys.exit(0) |
35 | 35 |
36 | 36 |
37 def check_trans(identifier, nuc, prot, table): | 37 def check_trans(identifier, nuc, prot, table): |
38 """Returns nucleotide sequence if works (can remove trailing stop)""" | 38 """Return nucleotide sequence, if works (can remove trailing stop).""" |
39 if len(nuc) % 3: | 39 if len(nuc) % 3: |
40 sys.exit("Nucleotide sequence for %s is length %i (not a multiple of three)" | 40 sys.exit( |
41 % (identifier, len(nuc))) | 41 "Nucleotide sequence for %s is length %i (not a multiple of three)" |
42 % (identifier, len(nuc)) | |
43 ) | |
42 | 44 |
43 p = str(prot).upper().replace("*", "X") | 45 p = str(prot).upper().replace("*", "X") |
44 t = str(nuc.translate(table)).upper().replace("*", "X") | 46 t = str(nuc.translate(table)).upper().replace("*", "X") |
45 if len(t) == len(p) + 1: | 47 if len(t) == len(p) + 1: |
46 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: | 48 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: |
47 # Allow this... | 49 # Allow this... |
48 t = t[:-1] | 50 t = t[:-1] |
49 nuc = nuc[:-3] # edit return value | 51 nuc = nuc[:-3] # edit return value |
50 if len(t) != len(p): | 52 if len(t) != len(p): |
51 err = ("Inconsistent lengths for %s, ungapped protein %i, " | 53 err = ( |
52 "tripled %i vs ungapped nucleotide %i." % | 54 "Inconsistent lengths for %s, ungapped protein %i, " |
53 (identifier, len(p), len(p) * 3, len(nuc))) | 55 "tripled %i vs ungapped nucleotide %i." |
56 % (identifier, len(p), len(p) * 3, len(nuc)) | |
57 ) | |
54 if t.endswith(p): | 58 if t.endswith(p): |
55 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p)) | 59 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p)) |
56 elif t.startswith(p): | 60 elif t.startswith(p): |
57 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p)) | 61 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p)) |
58 elif p in t: | 62 elif p in t: |
59 # TODO - Calculate and report the number to trim at start and end? | 63 # TODO - Calculate and report the number to trim at start and end? |
60 err += "\nHowever, protein sequence found within translated nucleotides." | 64 err += "\nHowever, protein sequence found within translated nucleotides." |
61 elif p[1:] in t: | 65 elif p[1:] in t: |
62 err += "\nHowever, ignoring first amino acid, protein sequence found within translated nucleotides." | 66 err += "\nHowever, ignoring first amino acid, protein sequence found " |
67 "within translated nucleotides." | |
63 sys.exit(err) | 68 sys.exit(err) |
64 | 69 |
65 if t == p: | 70 if t == p: |
66 return nuc | 71 return nuc |
67 elif p.startswith("M") and "M" + t[1:] == p: | 72 elif p.startswith("M") and "M" + t[1:] == p: |
68 # Close, was there a start codon? | 73 # Close, was there a start codon? |
69 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: | 74 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: |
70 return nuc | 75 return nuc |
71 else: | 76 else: |
72 sys.exit("Translation check failed for %s\n" | 77 sys.exit( |
73 "Would match if %s was a start codon (check correct table used)\n" | 78 "Translation check failed for %s\n" |
74 % (identifier, nuc[0:3].upper())) | 79 "Would match if %s was a start codon (check correct table used)\n" |
80 % (identifier, nuc[0:3].upper()) | |
81 ) | |
75 else: | 82 else: |
76 # Allow * vs X here? e.g. internal stop codons | 83 # Allow * vs X here? e.g. internal stop codons |
77 m = "".join("." if x == y else "!" for (x, y) in zip(p, t)) | 84 m = "".join("." if x == y else "!" for (x, y) in zip(p, t)) |
78 if len(prot) < 70: | 85 if len(prot) < 70: |
79 sys.stderr.write("Protein: %s\n" % p) | 86 sys.stderr.write("Protein: %s\n" % p) |
80 sys.stderr.write(" %s\n" % m) | 87 sys.stderr.write(" %s\n" % m) |
81 sys.stderr.write("Translation: %s\n" % t) | 88 sys.stderr.write("Translation: %s\n" % t) |
82 else: | 89 else: |
83 for offset in range(0, len(p), 60): | 90 for offset in range(0, len(p), 60): |
84 sys.stderr.write("Protein: %s\n" % p[offset:offset + 60]) | 91 sys.stderr.write("Protein: %s\n" % p[offset : offset + 60]) |
85 sys.stderr.write(" %s\n" % m[offset:offset + 60]) | 92 sys.stderr.write(" %s\n" % m[offset : offset + 60]) |
86 sys.stderr.write("Translation: %s\n\n" % t[offset:offset + 60]) | 93 sys.stderr.write("Translation: %s\n\n" % t[offset : offset + 60]) |
87 sys.exit("Translation check failed for %s\n" % identifier) | 94 sys.exit("Translation check failed for %s\n" % identifier) |
88 | 95 |
89 | 96 |
90 def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0): | 97 def sequence_back_translate( |
98 aligned_protein_record, unaligned_nucleotide_record, gap, table=0 | |
99 ): | |
91 """Back-translate a sequence.""" | 100 """Back-translate a sequence.""" |
92 # TODO - Separate arguments for protein gap and nucleotide gap? | 101 # TODO - Separate arguments for protein gap and nucleotide gap? |
93 if not gap or len(gap) != 1: | 102 if not gap or len(gap) != 1: |
94 raise ValueError("Please supply a single gap character") | 103 raise ValueError("Please supply a single gap character") |
95 | 104 |
97 if hasattr(alpha, "gap_char"): | 106 if hasattr(alpha, "gap_char"): |
98 gap_codon = alpha.gap_char * 3 | 107 gap_codon = alpha.gap_char * 3 |
99 assert len(gap_codon) == 3 | 108 assert len(gap_codon) == 3 |
100 else: | 109 else: |
101 from Bio.Alphabet import Gapped | 110 from Bio.Alphabet import Gapped |
111 | |
102 alpha = Gapped(alpha, gap) | 112 alpha = Gapped(alpha, gap) |
103 gap_codon = gap * 3 | 113 gap_codon = gap * 3 |
104 | 114 |
105 ungapped_protein = aligned_protein_record.seq.ungap(gap) | 115 ungapped_protein = aligned_protein_record.seq.ungap(gap) |
106 ungapped_nucleotide = unaligned_nucleotide_record.seq | 116 ungapped_nucleotide = unaligned_nucleotide_record.seq |
107 if table: | 117 if table: |
108 ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table) | 118 ungapped_nucleotide = check_trans( |
119 aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table | |
120 ) | |
109 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): | 121 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): |
110 sys.exit("Inconsistent lengths for %s, ungapped protein %i, " | 122 sys.exit( |
111 "tripled %i vs ungapped nucleotide %i" % | 123 "Inconsistent lengths for %s, ungapped protein %i, " |
112 (aligned_protein_record.id, | 124 "tripled %i vs ungapped nucleotide %i" |
113 len(ungapped_protein), | 125 % ( |
114 len(ungapped_protein) * 3, | 126 aligned_protein_record.id, |
115 len(ungapped_nucleotide))) | 127 len(ungapped_protein), |
128 len(ungapped_protein) * 3, | |
129 len(ungapped_nucleotide), | |
130 ) | |
131 ) | |
116 | 132 |
117 seq = [] | 133 seq = [] |
118 nuc = str(ungapped_nucleotide) | 134 nuc = str(ungapped_nucleotide) |
119 for amino_acid in aligned_protein_record.seq: | 135 for amino_acid in aligned_protein_record.seq: |
120 if amino_acid == gap: | 136 if amino_acid == gap: |
121 seq.append(gap_codon) | 137 seq.append(gap_codon) |
122 else: | 138 else: |
123 seq.append(nuc[:3]) | 139 seq.append(nuc[:3]) |
124 nuc = nuc[3:] | 140 nuc = nuc[3:] |
125 assert not nuc, "Nucleotide sequence for %r longer than protein %r" \ | 141 assert not nuc, "Nucleotide sequence for %r longer than protein %r" % ( |
126 % (unaligned_nucleotide_record.id, aligned_protein_record.id) | 142 unaligned_nucleotide_record.id, |
143 aligned_protein_record.id, | |
144 ) | |
127 | 145 |
128 aligned_nuc = unaligned_nucleotide_record[:] # copy for most annotation | 146 aligned_nuc = unaligned_nucleotide_record[:] # copy for most annotation |
129 aligned_nuc.letter_annotation = {} # clear this | 147 aligned_nuc.letter_annotation = {} # clear this |
130 aligned_nuc.seq = Seq("".join(seq), alpha) # replace this | 148 aligned_nuc.seq = Seq("".join(seq), alpha) # replace this |
131 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) | 149 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) |
132 return aligned_nuc | 150 return aligned_nuc |
133 | 151 |
134 | 152 |
135 def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0): | 153 def alignment_back_translate( |
154 protein_alignment, nucleotide_records, key_function=None, gap=None, table=0 | |
155 ): | |
136 """Thread nucleotide sequences onto a protein alignment.""" | 156 """Thread nucleotide sequences onto a protein alignment.""" |
137 # TODO - Separate arguments for protein and nucleotide gap characters? | 157 # TODO - Separate arguments for protein and nucleotide gap characters? |
138 if gap is None: | 158 if gap is None: |
139 gap = "-" | 159 gap = "-" |
140 | 160 |
147 else: | 167 else: |
148 for protein in protein_alignment: | 168 for protein in protein_alignment: |
149 nucleotide = nucleotide_records[key_function(protein.id)] | 169 nucleotide = nucleotide_records[key_function(protein.id)] |
150 aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) | 170 aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) |
151 except KeyError: | 171 except KeyError: |
152 raise ValueError("Could not find nucleotide sequence for protein %r" | 172 raise ValueError( |
153 % protein.id) | 173 "Could not find nucleotide sequence for protein %r" % protein.id |
174 ) | |
154 | 175 |
155 return MultipleSeqAlignment(aligned) | 176 return MultipleSeqAlignment(aligned) |
156 | 177 |
157 | 178 |
158 if len(sys.argv) == 4: | 179 if len(sys.argv) == 4: |
163 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] | 184 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] |
164 table = 0 | 185 table = 0 |
165 elif len(sys.argv) == 6: | 186 elif len(sys.argv) == 6: |
166 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] | 187 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] |
167 else: | 188 else: |
168 sys.exit("""This is a Python script for 'back-translating' a protein alignment, | 189 sys.exit( |
190 """This is a Python script for 'back-translating' a protein alignment, | |
169 | 191 |
170 It requires three, four or five arguments: | 192 It requires three, four or five arguments: |
171 - alignment format (e.g. fasta, clustal), | 193 - alignment format (e.g. fasta, clustal), |
172 - aligned protein file (in specified format), | 194 - aligned protein file (in specified format), |
173 - unaligned nucleotide file (in fasta format). | 195 - unaligned nucleotide file (in fasta format). |
183 Warning: If the output file already exists, it will be overwritten. | 205 Warning: If the output file already exists, it will be overwritten. |
184 | 206 |
185 This script is available with sample data and a Galaxy wrapper here: | 207 This script is available with sample data and a Galaxy wrapper here: |
186 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans | 208 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans |
187 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans | 209 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans |
188 """) | 210 """ # noqa: E501 |
211 ) | |
189 | 212 |
190 try: | 213 try: |
191 table = int(table) | 214 table = int(table) |
192 except ValueError: | 215 except ValueError: |
193 sys.exit("Bad table argument %r" % table) | 216 sys.exit("Bad table argument %r" % table) |