0
|
1 #!/usr/bin/perl
|
|
2 #Author: Ryan Morin (rmorin@bcgsc.ca)
|
|
3 #last modified February, 2010 to match improvements to codon lookup table
|
|
4 #Script to annotate files that have been joined with an appropriate 'codon lookup' table.
|
|
5 #example usage: sort -k 1 snp_file.txt | join codon_lookup.sort - | this_script.pl
|
|
6 #Transcript information and amino acid/codon position information is used to report the amino acid change in this format: N123M where N is the original amino acid at position 123 and M is the new amino acid.
|
|
7 #Transcripts with the same reading frame for position 123 are separated by commas. Distinct reading frames for the affected position are separated by a semi-colon (e.g. 123 = 155 in another isoform)
|
|
8 #an example of the codon lookup table:
|
|
9 #chr10:101658781 ENSG00000107554 -1 GAA 3 37;79;791; ENST00000393570;ENST00000370423;ENST00000324109,ENST00000342239;
|
|
10 #hence, chr10:101658781 corresponds to the third position in a GAA codon which is amino acid 791 in both ENST00000324109 ENST00000342239, amino acid 79 in ENST00000370423 and 37 in ENST00000393570
|
|
11
|
|
12 use strict;
|
|
13 use Data::Dumper;
|
|
14 use Getopt::Std;
|
|
15 use vars qw($opt_h);
|
|
16 getopts("h");
|
|
17
|
|
18 my %lookup;
|
|
19
|
|
20 load_ambig();
|
|
21 my %codons = codons();
|
|
22
|
|
23 my $requested_header = $opt_h;
|
|
24
|
|
25 my $header_printed;
|
|
26 if($requested_header){
|
|
27 $header_printed = 0;
|
|
28 }
|
|
29 else{
|
|
30 $header_printed = 1;
|
|
31 }
|
|
32 while(<STDIN>){
|
|
33
|
|
34 chomp;
|
|
35 my $line = $_;
|
|
36
|
|
37 my @cols = split /\s/, $line;
|
|
38 my $ncols = @cols;
|
|
39 unless($header_printed){
|
|
40 print "\#CHR:POSITION GENE STRAND REF_CODON CODON_POSITION REFBASE IUPAC_BASE_CALL MINOR_ALLELE_REPRESENTATION NEW_CODON REF_AA NEW_AA STATUS";
|
|
41 print "\n";
|
|
42 $header_printed = 1;
|
|
43 }
|
|
44
|
|
45 my $gene = $cols[1];
|
|
46 my $strand = $cols[2];
|
|
47 my $codon = $cols[3];
|
|
48 unless ($codon =~ /[ACTG]{3}/){
|
|
49 print "\n";
|
|
50 next;
|
|
51 }
|
|
52 my $codon_pos = $cols[4];
|
|
53 my $aa_pos = $cols[5];
|
|
54 my @transcript_strings = split /;/, $cols[6];
|
|
55
|
|
56 my $ref = uc($cols[7]);
|
|
57 my $amb = uc($cols[8]);
|
|
58 my $mut = amb_lookup($amb,$ref,$cols[0]);
|
|
59 next if $mut == -1;
|
|
60 my $mut_cor = $mut;
|
|
61
|
|
62 my $ref_cor = $ref;
|
|
63 if($strand < 0){
|
|
64 $mut_cor =~ tr/ACTG/TGAC/;
|
|
65 $ref_cor =~ tr/ACTG/TGAC/;
|
|
66 }
|
|
67 my @three = split //, $codon;
|
|
68 #sanity check
|
|
69
|
|
70 my $codon_ind = $codon_pos -1;
|
|
71 my $orig = $three[$codon_ind];
|
|
72 if($orig ne $ref_cor){
|
|
73 print STDERR "$cols[0]\tERROR, base $ref is not the same as codon position $codon_pos ($orig)\n";
|
|
74 s/1\s\w{3}\s\d\s/1 UNKNOWN 0 /;
|
|
75 print "$line";
|
|
76 print " UNKNOWN\n";
|
|
77 next;
|
|
78
|
|
79 }
|
|
80 else{
|
|
81 #ok, print normal leader line and continue to downstream work
|
|
82 print "$line";
|
|
83 }
|
|
84 $three[$codon_ind] = $mut_cor;
|
|
85 my $new_codon = join "", @three;
|
|
86 print " $new_codon ";
|
|
87
|
|
88 my $old_aa = translate($codon);
|
|
89 my $new_aa = translate($new_codon);
|
|
90
|
|
91 my $type = "CODING";
|
|
92 if($old_aa eq $new_aa){
|
|
93 $type = "SYNONYMOUS";
|
|
94 }
|
|
95
|
|
96 my $aa_string;
|
|
97
|
|
98 my @aa_positions = split /;/, $aa_pos;
|
|
99 for(my $i = 0;$i<@aa_positions;$i++){
|
|
100 my $string = $old_aa . $aa_positions[$i] . $new_aa;
|
|
101 $aa_string .= "$string;";
|
|
102 }
|
|
103 chop($aa_string);
|
|
104 print "$old_aa $new_aa $type $aa_string\n";
|
|
105 }
|
|
106
|
|
107 sub amb_lookup{
|
|
108 my $amb = shift;
|
|
109 my $ref = shift;
|
|
110 my $pos = shift;
|
|
111 if($amb =~ /[ACTG]/){
|
|
112 #print "$amb\n";
|
|
113 return($amb);
|
|
114 }
|
|
115 #print "LOOKUP: $amb\n";
|
|
116 my @all = @{$lookup{$amb}};
|
|
117 #print Dumper @all;
|
|
118 my @some;
|
|
119 for(@all){
|
|
120 next if $_ eq $ref;
|
|
121 push @some, $_;
|
|
122
|
|
123 }
|
|
124 if(@some > 1){
|
|
125 #print Dumper @some;
|
|
126 print STDERR "$pos\t$ref\t$amb\ttwo bases at this position, neither is reference\n";
|
|
127 return(-1);
|
|
128 }
|
|
129 return($some[0]);
|
|
130 }
|
|
131
|
|
132 sub load_ambig{
|
|
133 add('M',qw(A C));
|
|
134 add('R',qw(A G));
|
|
135 add('W',qw(A T));
|
|
136 add('S',qw(C G));
|
|
137 add('Y',qw(T C));
|
|
138 add('K',qw(T G));
|
|
139 add('V',qw(A C G));
|
|
140 add('H',qw(A C T));
|
|
141 add('D',qw(A G T));
|
|
142 add('B',qw(C G T));
|
|
143 }
|
|
144
|
|
145 sub add{
|
|
146
|
|
147 my $amb = shift;
|
|
148 my @copy = @_;
|
|
149 $lookup{$amb} = \@copy;
|
|
150 }
|
|
151 sub translate{
|
|
152 my $codon = shift;
|
|
153 return($codons{$codon});
|
|
154 }
|
|
155 sub codons{
|
|
156 my %codons = (
|
|
157 'TCA' => 'S', # Serine
|
|
158 'TCC' => 'S', # Serine
|
|
159 'TCG' => 'S', # Serine
|
|
160 'TCT' => 'S', # Serine
|
|
161 'TTC' => 'F', # Phenylalanine
|
|
162 'TTT' => 'F', # Phenylalanine
|
|
163 'TTA' => 'L', # Leucine
|
|
164 'TTG' => 'L', # Leucine
|
|
165 'TAC' => 'Y', # Tyrosine
|
|
166 'TAT' => 'Y', # Tyrosine
|
|
167 'TAA' => '*', # Stop
|
|
168 'TAG' => '*', # Stop
|
|
169 'TGC' => 'C', # Cysteine
|
|
170 'TGT' => 'C', # Cysteine
|
|
171 'TGA' => '*', # Stop
|
|
172 'TGG' => 'W', # Tryptophan
|
|
173 'CTA' => 'L', # Leucine
|
|
174 'CTC' => 'L', # Leucine
|
|
175 'CTG' => 'L', # Leucine
|
|
176 'CTT' => 'L', # Leucine
|
|
177 'CCA' => 'P', # Proline
|
|
178 'CCC' => 'P', # Proline
|
|
179 'CCG' => 'P', # Proline
|
|
180 'CCT' => 'P', # Proline
|
|
181 'CAC' => 'H', # Histidine
|
|
182 'CAT' => 'H', # Histidine
|
|
183 'CAA' => 'Q', # Glutamine
|
|
184 'CAG' => 'Q', # Glutamine
|
|
185 'CGA' => 'R', # Arginine
|
|
186 'CGC' => 'R', # Arginine
|
|
187 'CGG' => 'R', # Arginine
|
|
188 'CGT' => 'R', # Arginine
|
|
189 'ATA' => 'I', # Isoleucine
|
|
190 'ATC' => 'I', # Isoleucine
|
|
191 'ATT' => 'I', # Isoleucine
|
|
192 'ATG' => 'M', # Methionine
|
|
193 'ACA' => 'T', # Threonine
|
|
194 'ACC' => 'T', # Threonine
|
|
195 'ACG' => 'T', # Threonine
|
|
196 'ACT' => 'T', # Threonine
|
|
197 'AAC' => 'N', # Asparagine
|
|
198 'AAT' => 'N', # Asparagine
|
|
199 'AAA' => 'K', # Lysine
|
|
200 'AAG' => 'K', # Lysine
|
|
201 'AGC' => 'S', # Serine
|
|
202 'AGT' => 'S', # Serine
|
|
203 'AGA' => 'R', # Arginine
|
|
204 'AGG' => 'R', # Arginine
|
|
205 'GTA' => 'V', # Valine
|
|
206 'GTC' => 'V', # Valine
|
|
207 'GTG' => 'V', # Valine
|
|
208 'GTT' => 'V', # Valine
|
|
209 'GCA' => 'A', # Alanine
|
|
210 'GCC' => 'A', # Alanine
|
|
211 'GCG' => 'A', # Alanine
|
|
212 'GCT' => 'A', # Alanine
|
|
213 'GAC' => 'D', # Aspartic Acid
|
|
214 'GAT' => 'D', # Aspartic Acid
|
|
215 'GAA' => 'E', # Glutamic Acid
|
|
216 'GAG' => 'E', # Glutamic Acid
|
|
217 'GGA' => 'G', # Glycine
|
|
218 'GGC' => 'G', # Glycine
|
|
219 'GGG' => 'G', # Glycine
|
|
220 'GGT' => 'G', # Glycine
|
|
221 );
|
|
222 return(%codons);
|
|
223 }
|