changeset 0:7c94246126b0 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:35:53 -0600
parents
children
files GroupAltTranscriptVariants.xml hgvs_collapse_transcripts
diffstat 2 files changed, 223 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GroupAltTranscriptVariants.xml	Wed Mar 25 13:35:53 2015 -0600
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+
+<tool id="hgvs_group_alt_variants" name="Group alternate transcript effects">
+  <description>to reduce the number of rows in a HGVS table</description>
+  <version_string>hgvs_collapse_transcripts -v</version_string>
+  <command interpreter="perl">hgvs_collapse_transcripts $input_hgvs_annotated_table $out_hgvs_annotated_table</command>
+  <inputs>
+    <param format="achri_annotated_snp_table" name="input_hgvs_annotated_table" type="data" label="Annotated variant calls table with HGVS syntax, one line per alternate transcript"/>
+  </inputs>
+  <outputs>
+    <data format="achri_annotated_snp_table" name="out_hgvs_annotated_table" type="data" label="Variant table with one row for all transcripts wih same AA mod"/>
+  </outputs>
+
+  <tests>
+  </tests>
+
+  <help>
+  This tool collapses multiple rows in an HGVS annotated variant table if they refer to the same genomic variant, and the amino acid HGVS syntax is the same except for the residue number.
+  The row with the lowest numeric suffix for the model transcript (e.g. NM_000245 is before NM1002345) will have its cDNA and AA positions reported, and the alternate IDs are appoended to the transcript ID field separated by ';'.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgvs_collapse_transcripts	Wed Mar 25 13:35:53 2015 -0600
@@ -0,0 +1,201 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+# reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different
+@ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n";
+
+my %ftype_rank = ( protein_coding => 100,
+              processed_transcript => 90,
+              antisense => 80,
+              retained_intron => 70,
+              lincRNA => 60,
+              nonsense_mediated_decay => 50,
+              misc_enrichment_kit_target => 0); 
+
+my %mtype_rank = ( nonsense => 100,
+                      frameshift => 99,
+                      nonstop => 90,
+                      missense => 80,
+                      silent => 50,
+                      "non-coding" => 40);
+open(IN, $ARGV[0])
+  or die "Cannot open $ARGV[0] for reading: $!\n";
+open(OUT, ">$ARGV[1]")
+  or die "Cannot open $ARGV[1] for writing: $!\n";
+my $succinct = (@ARGV == 3);
+my @lines;
+my $last_chr = "";
+my $last_pos = "";
+my $last_alt = "";
+my %buffered_F;
+my %buffered_id_rank;
+my $header = <IN>;
+print OUT $header;
+
+my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column);
+chomp $header;
+my @headers = split /\t/, $header;
+for(my $i = 0; $i <= $#headers; $i++){
+  if($headers[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+  elsif($headers[$i] eq "Feature type"){
+    $ftype_column = $i;
+  }
+  elsif($headers[$i] eq "Variant type"){
+    $mtype_column = $i;
+  }
+  elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){
+    $pos_column = $i;
+  }
+  elsif($headers[$i] eq "Obs base"){
+    $alt_column = $i;
+  }
+  elsif($headers[$i] eq "Transcript HGVS"){
+    $cdna_hgvs_column = $i;
+  }
+  elsif($headers[$i] eq "Protein HGVS"){
+    $aa_hgvs_column = $i;
+  }
+  elsif($headers[$i] eq "Phase"){
+    $phase_column = $i;
+  }
+  elsif($headers[$i] eq "Selected transcript"){
+    $transcript_column = $i;
+  }
+  elsif($headers[$i] eq "Transcript length"){
+    $transcript_length_column = $i;
+  }
+  elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){
+    $exon_dist_column = $i;
+  }
+  elsif($headers[$i] eq "Splicing alteration potential"){
+    $splicing_score_column = $i;
+  }
+  elsif($headers[$i] eq "Splicing alteration details"){
+    $splicing_effect_column = $i;
+  }
+  elsif($headers[$i] eq "Sources"){
+    $sources_column = $i;
+  }
+}
+if(not defined $chr_column){
+  die "Could not find Chr header in $ARGV[0], aborting.\n";
+}
+if(not defined $pos_column){
+  die "Could not find 'DNA From' header in $ARGV[0], aborting.\n";
+}
+if(not defined $alt_column){
+  die "Could not find 'Obs base' header in $ARGV[0], aborting.\n";
+}
+if(not defined $cdna_hgvs_column){
+  die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n";
+}
+if(not defined $aa_hgvs_column){
+  die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n";
+}
+if(not defined $phase_column){
+  die "Could not find Phase header in $ARGV[0], aborting.\n";
+}
+if(not defined $transcript_column){
+  die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n";
+}
+if(not defined $transcript_length_column){
+  die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n";
+}
+#if(not defined $mtype_column){
+#  die "Could not find 'Variant type' header in $ARGV[0], aborting.\n";
+#}
+if(not defined $ftype_column){
+  die "Could not find 'Feature type' header in $ARGV[0], aborting.\n";
+}
+if(not defined $exon_dist_column){
+  die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n";
+}
+
+while(<IN>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  my $chr = $F[$chr_column];
+  my $pos = $F[$pos_column];
+  my $alt = $F[$alt_column];
+  my $cdna_hgvs = $F[$cdna_hgvs_column];
+  my $hgvs = $F[$aa_hgvs_column];
+  my $ftype = $F[$ftype_column];
+  my $mtype;
+  $mtype = $F[$mtype_column] if defined $mtype_column;
+  $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax
+  my $exon_edge_distance = $F[$exon_dist_column];
+  my $phase = $F[$phase_column] || "";
+  # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such
+  if($phase =~ /CNV-\S+?:(\S+)/){
+    $pos = $1;
+  }
+  my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0;
+  my $id_rank = $F[$transcript_column];
+  $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix)
+
+  my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase";
+  # Same variant
+  if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){
+    # same AA effect
+    if(not exists $buffered_F{$collapse_key}){
+      $buffered_F{$collapse_key} = \@F;
+      $buffered_id_rank{$collapse_key} = $id_rank;
+    }
+    else{
+      $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column;
+      $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype};
+      $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype};
+      my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column];
+      $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype};
+      my $buf_mtype;
+      $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column;
+      $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype};
+      # see if this transcript is "earlier" than the other, based on ID #
+      if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or 
+         (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or
+         $preferred_id or
+          ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key}) 
+         or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup
+        # make this the first one reported (and use its HGVS syntax)
+        $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column];
+        $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column];
+        $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column];
+        $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column];
+        $buffered_id_rank{$collapse_key} = $id_rank;
+      }
+      else{
+        # just append it to the list of IDs
+        $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]";
+        $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]";
+      }
+      if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){
+        $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column];
+        $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")";
+      }
+    }
+  }
+  # Different variant from the last line, dump any buffered data
+  else{
+    for my $collapse_key (keys %buffered_F){
+      if(defined $sources_column){
+        my %seen;
+        $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column]));
+      }
+      print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
+    }
+    undef %buffered_F;
+    $last_chr = $chr;
+    $last_pos = $pos;
+    $last_alt = $alt;
+    $buffered_F{$collapse_key} = \@F;
+    $buffered_id_rank{$collapse_key} = $id_rank;
+  }
+}
+for my $collapse_key (keys %buffered_F){
+  print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; 
+}
+close(IN);