comparison tools/align_back_trans/align_back_trans.py @ 0:0c24e4e2177d draft

Uploaded v0.0.3, first stable release.
author peterjc
date Thu, 06 Mar 2014 08:58:13 -0500
parents
children ec202446408a
comparison
equal deleted inserted replaced
-1:000000000000 0:0c24e4e2177d
1 #!/usr/bin/env python
2 """Back-translate a protein alignment to nucleotides
3
4 This tool is a short Python script (using Biopython library functions) to
5 load a protein alignment, and matching nucleotide FASTA file of unaligned
6 sequences, in order to produce a codon aware nucleotide alignment - which
7 can be viewed as a back translation.
8
9 The development repository for this tool is here:
10
11 * https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
12
13 This tool is available with a Galaxy wrapper from the Galaxy Tool Shed at:
14
15 * http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
16
17 See accompanying text file for licence details (MIT licence).
18
19 This is version 0.0.3 of the script.
20 """
21
22 import sys
23 from Bio.Seq import Seq
24 from Bio.Alphabet import generic_dna, generic_protein
25 from Bio.Align import MultipleSeqAlignment
26 from Bio import SeqIO
27 from Bio import AlignIO
28 from Bio.Data.CodonTable import ambiguous_generic_by_id
29
30 if "-v" in sys.argv or "--version" in sys.argv:
31 print "v0.0.3"
32 sys.exit(0)
33
34 def stop_err(msg, error_level=1):
35 """Print error message to stdout and quit with given error level."""
36 sys.stderr.write("%s\n" % msg)
37 sys.exit(error_level)
38
39 def check_trans(identifier, nuc, prot, table):
40 """Returns nucleotide sequence if works (can remove trailing stop)"""
41 if len(nuc) % 3:
42 stop_err("Nucleotide sequence for %s is length %i (not a multiple of three)"
43 % (identifier, nuc))
44
45 p = str(prot).upper().replace("*", "X")
46 t = str(nuc.translate(table)).upper().replace("*", "X")
47 if len(t) == len(p) + 1:
48 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons:
49 #Allow this...
50 t = t[:-1]
51 nuc = nuc[:-3] #edit return value
52 if len(t) != len(p) and p in t:
53 stop_err("%s translation matched but only as subset of nucleotides, "
54 "wrong start codon?" % identifier)
55 if len(t) != len(p) and p[1:] in t:
56 stop_err("%s translation matched (ignoring first base) but only "
57 "as subset of nucleotides, wrong start codon?" % identifier)
58 if len(t) != len(p):
59 stop_err("Inconsistent lengths for %s, ungapped protein %i, "
60 "tripled %i vs ungapped nucleotide %i" %
61 (identifier,
62 len(p),
63 len(p) * 3,
64 len(nuc)))
65
66
67 if t == p:
68 return nuc
69 elif p.startswith("M") and "M" + t[1:] == p:
70 #Close, was there a start codon?
71 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons:
72 return nuc
73 else:
74 stop_err("Translation check failed for %s\n"
75 "Would match if %s was a start codon (check correct table used)\n"
76 % (identifier, nuc[0:3].upper()))
77 else:
78 #Allow * vs X here? e.g. internal stop codons
79 m = "".join("." if x==y else "!" for (x,y) in zip(p,t))
80 if len(prot) < 70:
81 sys.stderr.write("Protein: %s\n" % p)
82 sys.stderr.write(" %s\n" % m)
83 sys.stderr.write("Translation: %s\n" % t)
84 else:
85 for offset in range(0, len(p), 60):
86 sys.stderr.write("Protein: %s\n" % p[offset:offset+60])
87 sys.stderr.write(" %s\n" % m[offset:offset+60])
88 sys.stderr.write("Translation: %s\n\n" % t[offset:offset+60])
89 stop_err("Translation check failed for %s\n" % identifier)
90
91 def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0):
92 #TODO - Separate arguments for protein gap and nucleotide gap?
93 if not gap or len(gap) != 1:
94 raise ValueError("Please supply a single gap character")
95
96 alpha = unaligned_nucleotide_record.seq.alphabet
97 if hasattr(alpha, "gap_char"):
98 gap_codon = alpha.gap_char * 3
99 assert len(gap_codon) == 3
100 else:
101 from Bio.Alphabet import Gapped
102 alpha = Gapped(alpha, gap)
103 gap_codon = gap*3
104
105 ungapped_protein = aligned_protein_record.seq.ungap(gap)
106 ungapped_nucleotide = unaligned_nucleotide_record.seq
107 if table:
108 ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table)
109 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide):
110 stop_err("Inconsistent lengths for %s, ungapped protein %i, "
111 "tripled %i vs ungapped nucleotide %i" %
112 (aligned_protein_record.id,
113 len(ungapped_protein),
114 len(ungapped_protein) * 3,
115 len(ungapped_nucleotide)))
116
117 seq = []
118 nuc = str(ungapped_nucleotide)
119 for amino_acid in aligned_protein_record.seq:
120 if amino_acid == gap:
121 seq.append(gap_codon)
122 else:
123 seq.append(nuc[:3])
124 nuc = nuc[3:]
125 assert not nuc, "Nucleotide sequence for %r longer than protein %r" \
126 % (unaligned_nucleotide_record.id, aligned_protein_record.id)
127
128 aligned_nuc = unaligned_nucleotide_record[:] #copy for most annotation
129 aligned_nuc.letter_annotation = {} #clear this
130 aligned_nuc.seq = Seq("".join(seq), alpha) #replace this
131 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc)
132 return aligned_nuc
133
134 def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0):
135 """Thread nucleotide sequences onto a protein alignment."""
136 #TODO - Separate arguments for protein and nucleotide gap characters?
137 if key_function is None:
138 key_function = lambda x: x
139 if gap is None:
140 gap = "-"
141
142 aligned = []
143 for protein in protein_alignment:
144 try:
145 nucleotide = nucleotide_records[key_function(protein.id)]
146 except KeyError:
147 raise ValueError("Could not find nucleotide sequence for protein %r" \
148 % protein.id)
149 aligned.append(sequence_back_translate(protein, nucleotide, gap, table))
150 return MultipleSeqAlignment(aligned)
151
152
153 if len(sys.argv) == 4:
154 align_format, prot_align_file, nuc_fasta_file = sys.argv[1:]
155 nuc_align_file = sys.stdout
156 table = 0
157 elif len(sys.argv) == 5:
158 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:]
159 table = 0
160 elif len(sys.argv) == 6:
161 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:]
162 else:
163 stop_err("""This is a Python script for 'back-translating' a protein alignment,
164
165 It requires three, four or five arguments:
166 - alignment format (e.g. fasta, clustal),
167 - aligned protein file (in specified format),
168 - unaligned nucleotide file (in fasta format).
169 - aligned nucleotiode output file (in same format), optional.
170 - NCBI translation table (0 for none), optional
171
172 The nucleotide alignment is printed to stdout if no output filename is given.
173
174 Example usage:
175
176 $ python align_back_trans.py fasta demo_prot_align.fasta demo_nucs.fasta demo_nuc_align.fasta
177
178 Warning: If the output file already exists, it will be overwritten.
179
180 This script is available with sample data and a Galaxy wrapper here:
181 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
182 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
183 """)
184
185 try:
186 table = int(table)
187 except:
188 stop_err("Bad table argument %r" % table)
189
190 prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein)
191 nuc_dict = SeqIO.index(nuc_fasta_file, "fasta")
192 nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table)
193 AlignIO.write(nuc_align, nuc_align_file, align_format)