Mercurial > repos > ryanmorin > nextgen_variant_identification
view SNV/identify_nonsynonymous_mutations.pl @ 7:351b3acadd17 default tip
Uploaded
author | ryanmorin |
---|---|
date | Tue, 18 Oct 2011 18:33:15 -0400 |
parents | 74f5ea818cea |
children |
line wrap: on
line source
#!/usr/bin/perl #Author: Ryan Morin (rmorin@bcgsc.ca) #last modified February, 2010 to match improvements to codon lookup table #Script to annotate files that have been joined with an appropriate 'codon lookup' table. #example usage: sort -k 1 snp_file.txt | join codon_lookup.sort - | this_script.pl #Transcript information and amino acid/codon position information is used to report the amino acid change in this format: N123M where N is the original amino acid at position 123 and M is the new amino acid. #Transcripts with the same reading frame for position 123 are separated by commas. Distinct reading frames for the affected position are separated by a semi-colon (e.g. 123 = 155 in another isoform) #an example of the codon lookup table: #chr10:101658781 ENSG00000107554 -1 GAA 3 37;79;791; ENST00000393570;ENST00000370423;ENST00000324109,ENST00000342239; #hence, chr10:101658781 corresponds to the third position in a GAA codon which is amino acid 791 in both ENST00000324109 ENST00000342239, amino acid 79 in ENST00000370423 and 37 in ENST00000393570 use strict; use Data::Dumper; use Getopt::Std; use vars qw($opt_h); getopts("h"); my %lookup; load_ambig(); my %codons = codons(); my $requested_header = $opt_h; my $header_printed; if($requested_header){ $header_printed = 0; } else{ $header_printed = 1; } while(<STDIN>){ chomp; my $line = $_; my @cols = split /\s/, $line; my $ncols = @cols; unless($header_printed){ print "\#CHR:POSITION GENE STRAND REF_CODON CODON_POSITION REFBASE IUPAC_BASE_CALL MINOR_ALLELE_REPRESENTATION NEW_CODON REF_AA NEW_AA STATUS"; print "\n"; $header_printed = 1; } my $gene = $cols[1]; my $strand = $cols[2]; my $codon = $cols[3]; unless ($codon =~ /[ACTG]{3}/){ print "\n"; next; } my $codon_pos = $cols[4]; my $aa_pos = $cols[5]; my @transcript_strings = split /;/, $cols[6]; my $ref = uc($cols[7]); my $amb = uc($cols[8]); my $mut = amb_lookup($amb,$ref,$cols[0]); next if $mut == -1; my $mut_cor = $mut; my $ref_cor = $ref; if($strand < 0){ $mut_cor =~ tr/ACTG/TGAC/; $ref_cor =~ tr/ACTG/TGAC/; } my @three = split //, $codon; #sanity check my $codon_ind = $codon_pos -1; my $orig = $three[$codon_ind]; if($orig ne $ref_cor){ print STDERR "$cols[0]\tERROR, base $ref is not the same as codon position $codon_pos ($orig)\n"; s/1\s\w{3}\s\d\s/1 UNKNOWN 0 /; print "$line"; print " UNKNOWN\n"; next; } else{ #ok, print normal leader line and continue to downstream work print "$line"; } $three[$codon_ind] = $mut_cor; my $new_codon = join "", @three; print " $new_codon "; my $old_aa = translate($codon); my $new_aa = translate($new_codon); my $type = "CODING"; if($old_aa eq $new_aa){ $type = "SYNONYMOUS"; } my $aa_string; my @aa_positions = split /;/, $aa_pos; for(my $i = 0;$i<@aa_positions;$i++){ my $string = $old_aa . $aa_positions[$i] . $new_aa; $aa_string .= "$string;"; } chop($aa_string); print "$old_aa $new_aa $type $aa_string\n"; } sub amb_lookup{ my $amb = shift; my $ref = shift; my $pos = shift; if($amb =~ /[ACTG]/){ #print "$amb\n"; return($amb); } #print "LOOKUP: $amb\n"; my @all = @{$lookup{$amb}}; #print Dumper @all; my @some; for(@all){ next if $_ eq $ref; push @some, $_; } if(@some > 1){ #print Dumper @some; print STDERR "$pos\t$ref\t$amb\ttwo bases at this position, neither is reference\n"; return(-1); } return($some[0]); } sub load_ambig{ add('M',qw(A C)); add('R',qw(A G)); add('W',qw(A T)); add('S',qw(C G)); add('Y',qw(T C)); add('K',qw(T G)); add('V',qw(A C G)); add('H',qw(A C T)); add('D',qw(A G T)); add('B',qw(C G T)); } sub add{ my $amb = shift; my @copy = @_; $lookup{$amb} = \@copy; } sub translate{ my $codon = shift; return($codons{$codon}); } sub codons{ my %codons = ( 'TCA' => 'S', # Serine 'TCC' => 'S', # Serine 'TCG' => 'S', # Serine 'TCT' => 'S', # Serine 'TTC' => 'F', # Phenylalanine 'TTT' => 'F', # Phenylalanine 'TTA' => 'L', # Leucine 'TTG' => 'L', # Leucine 'TAC' => 'Y', # Tyrosine 'TAT' => 'Y', # Tyrosine 'TAA' => '*', # Stop 'TAG' => '*', # Stop 'TGC' => 'C', # Cysteine 'TGT' => 'C', # Cysteine 'TGA' => '*', # Stop 'TGG' => 'W', # Tryptophan 'CTA' => 'L', # Leucine 'CTC' => 'L', # Leucine 'CTG' => 'L', # Leucine 'CTT' => 'L', # Leucine 'CCA' => 'P', # Proline 'CCC' => 'P', # Proline 'CCG' => 'P', # Proline 'CCT' => 'P', # Proline 'CAC' => 'H', # Histidine 'CAT' => 'H', # Histidine 'CAA' => 'Q', # Glutamine 'CAG' => 'Q', # Glutamine 'CGA' => 'R', # Arginine 'CGC' => 'R', # Arginine 'CGG' => 'R', # Arginine 'CGT' => 'R', # Arginine 'ATA' => 'I', # Isoleucine 'ATC' => 'I', # Isoleucine 'ATT' => 'I', # Isoleucine 'ATG' => 'M', # Methionine 'ACA' => 'T', # Threonine 'ACC' => 'T', # Threonine 'ACG' => 'T', # Threonine 'ACT' => 'T', # Threonine 'AAC' => 'N', # Asparagine 'AAT' => 'N', # Asparagine 'AAA' => 'K', # Lysine 'AAG' => 'K', # Lysine 'AGC' => 'S', # Serine 'AGT' => 'S', # Serine 'AGA' => 'R', # Arginine 'AGG' => 'R', # Arginine 'GTA' => 'V', # Valine 'GTC' => 'V', # Valine 'GTG' => 'V', # Valine 'GTT' => 'V', # Valine 'GCA' => 'A', # Alanine 'GCC' => 'A', # Alanine 'GCG' => 'A', # Alanine 'GCT' => 'A', # Alanine 'GAC' => 'D', # Aspartic Acid 'GAT' => 'D', # Aspartic Acid 'GAA' => 'E', # Glutamic Acid 'GAG' => 'E', # Glutamic Acid 'GGA' => 'G', # Glycine 'GGC' => 'G', # Glycine 'GGG' => 'G', # Glycine 'GGT' => 'G', # Glycine ); return(%codons); }