Mercurial > repos > ryanmorin > nextgen_variant_identification

#!/usr/bin/perl
#Author: Ryan Morin (rmorin@bcgsc.ca)
#last modified February, 2010 to match improvements to codon lookup table
#Script to annotate files that have been joined with an appropriate 'codon lookup' table.
#example usage: sort -k 1 snp_file.txt | join codon_lookup.sort - | this_script.pl
#Transcript information and amino acid/codon position information is used to report the amino acid change in this format:  N123M where N is the original amino acid at position 123 and M is the new amino acid.
#Transcripts with the same reading frame for position 123 are separated by commas.  Distinct reading frames for the affected position are separated by a semi-colon (e.g. 123 = 155 in another isoform)
#an example of the codon lookup table:
#chr10:101658781 ENSG00000107554 -1      GAA     3       37;79;791;      ENST00000393570;ENST00000370423;ENST00000324109,ENST00000342239;
#hence, chr10:101658781 corresponds to the third position in a GAA codon which is amino acid 791 in both ENST00000324109 ENST00000342239, amino acid 79 in ENST00000370423 and 37 in ENST00000393570

use strict;
use Data::Dumper;
use Getopt::Std;
use vars qw($opt_h);
getopts("h");

my %lookup;

load_ambig();
my %codons = codons();

my $requested_header = $opt_h;

my $header_printed;
if($requested_header){
    $header_printed = 0;
}
else{
    $header_printed = 1;
}
while(<STDIN>){

    chomp;
    my $line = $_;

    my @cols = split /\s/, $line;
    my $ncols = @cols;
    unless($header_printed){
	print "\#CHR:POSITION GENE STRAND REF_CODON CODON_POSITION REFBASE IUPAC_BASE_CALL MINOR_ALLELE_REPRESENTATION NEW_CODON REF_AA NEW_AA STATUS";
	print "\n";
	$header_printed = 1;
    }

    my $gene = $cols[1];
    my $strand = $cols[2];
    my $codon = $cols[3];
    unless ($codon =~ /[ACTG]{3}/){
	print "\n";
	next;
    }
    my $codon_pos = $cols[4];
    my $aa_pos = $cols[5];
    my @transcript_strings = split /;/, $cols[6];

    my $ref = uc($cols[7]);
    my $amb = uc($cols[8]);
    my $mut = amb_lookup($amb,$ref,$cols[0]);
    next if $mut == -1;
    my $mut_cor = $mut;

    my $ref_cor = $ref;
    if($strand < 0){
	$mut_cor =~ tr/ACTG/TGAC/;
	$ref_cor =~ tr/ACTG/TGAC/;
    }
    my @three = split //, $codon;
    #sanity check

    my $codon_ind = $codon_pos -1;
    my $orig = $three[$codon_ind];
    if($orig ne $ref_cor){
	print STDERR "$cols[0]\tERROR, base $ref is not the same as codon position $codon_pos ($orig)\n";
	s/1\s\w{3}\s\d\s/1 UNKNOWN 0 /;
	print "$line";
	print " UNKNOWN\n";
	next;

    }
    else{
	#ok, print normal leader line and continue to downstream work
	print "$line";
    }
    $three[$codon_ind] = $mut_cor;
    my $new_codon = join "", @three;
    print " $new_codon ";

    my $old_aa = translate($codon);
    my $new_aa = translate($new_codon);

    my $type = "CODING";
    if($old_aa eq $new_aa){
	$type = "SYNONYMOUS";
    }

    my $aa_string;

    my @aa_positions = split /;/, $aa_pos;
    for(my $i = 0;$i<@aa_positions;$i++){
	my $string = $old_aa . $aa_positions[$i] . $new_aa;
	$aa_string .= "$string;";
    }
    chop($aa_string);
    print "$old_aa $new_aa $type $aa_string\n";
}

sub amb_lookup{
    my $amb = shift;
    my $ref = shift;
    my $pos = shift;
    if($amb =~ /[ACTG]/){
	#print "$amb\n";
	return($amb);
    }
    #print "LOOKUP: $amb\n";
    my @all = @{$lookup{$amb}};
    #print Dumper @all;
    my @some;
    for(@all){
	next if $_ eq $ref;
	push @some, $_;

    }
    if(@some > 1){
	#print Dumper @some;
	print STDERR "$pos\t$ref\t$amb\ttwo bases at this position, neither is reference\n";
	return(-1);
    }
    return($some[0]);
}

sub load_ambig{
    add('M',qw(A C));
    add('R',qw(A G));
    add('W',qw(A T));
    add('S',qw(C G));
    add('Y',qw(T C));
    add('K',qw(T G));
    add('V',qw(A C G));
    add('H',qw(A C T));
    add('D',qw(A G T));
    add('B',qw(C G T));
}

sub add{

    my $amb = shift;
    my @copy = @_;
    $lookup{$amb} = \@copy;
}
sub translate{
    my $codon = shift;
    return($codons{$codon});
}
sub codons{
    my %codons = (
    'TCA' => 'S',    # Serine
    'TCC' => 'S',    # Serine
    'TCG' => 'S',    # Serine
    'TCT' => 'S',    # Serine
    'TTC' => 'F',    # Phenylalanine
    'TTT' => 'F',    # Phenylalanine
    'TTA' => 'L',    # Leucine
    'TTG' => 'L',    # Leucine
    'TAC' => 'Y',    # Tyrosine
    'TAT' => 'Y',    # Tyrosine
    'TAA' => '*',    # Stop
    'TAG' => '*',    # Stop
    'TGC' => 'C',    # Cysteine
    'TGT' => 'C',    # Cysteine
    'TGA' => '*',    # Stop
    'TGG' => 'W',    # Tryptophan
    'CTA' => 'L',    # Leucine
    'CTC' => 'L',    # Leucine
    'CTG' => 'L',    # Leucine
    'CTT' => 'L',    # Leucine
    'CCA' => 'P',    # Proline
    'CCC' => 'P',    # Proline
    'CCG' => 'P',    # Proline
    'CCT' => 'P',    # Proline
    'CAC' => 'H',    # Histidine
    'CAT' => 'H',    # Histidine
    'CAA' => 'Q',    # Glutamine
    'CAG' => 'Q',    # Glutamine
    'CGA' => 'R',    # Arginine
    'CGC' => 'R',    # Arginine
    'CGG' => 'R',    # Arginine
    'CGT' => 'R',    # Arginine
    'ATA' => 'I',    # Isoleucine
    'ATC' => 'I',    # Isoleucine
    'ATT' => 'I',    # Isoleucine
    'ATG' => 'M',    # Methionine
    'ACA' => 'T',    # Threonine
    'ACC' => 'T',    # Threonine
    'ACG' => 'T',    # Threonine
    'ACT' => 'T',    # Threonine
    'AAC' => 'N',    # Asparagine
    'AAT' => 'N',    # Asparagine
    'AAA' => 'K',    # Lysine
    'AAG' => 'K',    # Lysine
    'AGC' => 'S',    # Serine
    'AGT' => 'S',    # Serine
    'AGA' => 'R',    # Arginine
    'AGG' => 'R',    # Arginine
    'GTA' => 'V',    # Valine
    'GTC' => 'V',    # Valine
    'GTG' => 'V',    # Valine
    'GTT' => 'V',    # Valine
    'GCA' => 'A',    # Alanine
    'GCC' => 'A',    # Alanine
    'GCG' => 'A',    # Alanine
    'GCT' => 'A',    # Alanine
    'GAC' => 'D',    # Aspartic Acid
    'GAT' => 'D',    # Aspartic Acid
    'GAA' => 'E',    # Glutamic Acid
    'GAG' => 'E',    # Glutamic Acid
    'GGA' => 'G',    # Glycine
    'GGC' => 'G',    # Glycine
    'GGG' => 'G',    # Glycine
    'GGT' => 'G',    # Glycine
    );
    return(%codons);
}
author	ryanmorin
date	Tue, 18 Oct 2011 18:33:15 -0400
parents	74f5ea818cea
children