changeset 0:0d7a85ddac86 default tip

initial commit of project
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:44:41 -0600
parents
children
files TriageRareGenotypes.xml rare_triage
diffstat 2 files changed, 349 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TriageRareGenotypes.xml	Wed Mar 25 13:44:41 2015 -0600
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+
+<tool id="hgvs_combine_1" name="Triage cohort rare/novel variants">
+  <description>from combined cohort HGVS table</description>
+  <version_string>rare_triage -v</version_string>
+  <command interpreter="perl">rare_triage $input_hgvs_table $output_novel_gene_table $output_rare_gene_table $rare_freq $long_gene_threshold $non_coding
+
+
+  </command>
+  <inputs>
+    <param format="achri_snp_table" name="input_hgvs_table" type="data" label="A combined, non-collapsed HGVS annotation table for a cohort, with the sample names in the last column"/>
+    <param name="rare_freq" type="float" value="0.001" min="0" max="1" label="Max. minor allele frequency" help=" for reporting homozygous 'rare' variants"/>
+    <param name="long_gene_threshold" type="integer" value="30000" label="Gene length threshold" help="for making 'Exceptionally long gene'  marginal cohort call"/>
+    <param name="non_coding" type="boolean" truevalue="true" falsevalue="false" value="False" label="Report non-coding variants as well?"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output_novel_gene_table" type="data" label="Table of novel cohort variants"/>
+    <data format="tabular" name="output_rare_gene_table" type="data" label="Table of rare/marginal call cohort variants"/>
+  </outputs>
+
+  <tests>
+  </tests>
+
+  <help>
+This tool takes a combined HGVS table of genotype calls for a cohort, and generates two tables. The first lists genes with novel homozygous variants in all samples, allowing for locus heterogeneity. The second table lists genes with marginal calls (e.g. lots of genotyping caveats), or very rare minor allele frequencies in the general population.
+  </help> 
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rare_triage	Wed Mar 25 13:44:41 2015 -0600
@@ -0,0 +1,321 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $trio = 0;
+if(@ARGV == 1 and $ARGV[0] eq "-v"){
+  print "Version 0.1"; exit;
+} 
+if($ARGV[0] eq "-t"){
+  shift @ARGV;
+  $trio = 1;
+} 
+
+@ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-t(rio)] <input hgvs annotated variant table.txt> <novel output.txt> <rare output.txt> <maf reporting threshold> <max. gene DNA length caveat cutoff> <report non-coding variants too (true|false)> <num samples allowed to be missing> <dominant model (true|false)> [known false positive novels.txt]\n";
+
+my %known_fps; # a priori known false positives
+if(@ARGV == 9){
+  open(FP, $ARGV[8])
+    or die "Cannot open $ARGV[8] for reading: $!\n";
+  while(<FP>){
+    next if /^\s*#/;
+    chomp;
+    my @F = split /\t/, $_;
+    $known_fps{"$F[0]:$F[1]"} = 1;
+  }
+  close(FP);
+  pop @ARGV;
+}
+
+my $maybe_dominant = pop @ARGV; # i.e. should novel non-benign het calls be considered across the cohort?
+$maybe_dominant = $maybe_dominant =~ /^(?:t(?:rue)?|1|y(?:es)?)/i;
+my $num_allowed_missing = pop @ARGV; # report rare events even if not universal, as long as coverage is missing in the non-conforming samples
+my $all_variants = pop @ARGV; # restrict to protein-coding only?
+$all_variants = $all_variants =~ /^(?:t(?:rue)?|1|y(?:es)?)/i;
+my $gene_length_threshold = pop @ARGV;
+my $maf_threshold = pop @ARGV;
+
+my %gene_vars;
+my %gene2chr;
+my %gene2desc;
+my %pos2info;
+my %pos2hgvs;
+my %sample_count;
+open(IN, $ARGV[0])
+  or die "Cannot open $ARGV[0] for reading: $!\n";
+open(NOVEL, ">$ARGV[1]")
+  or die "Cannot open $ARGV[1] for writing: $!\n";
+open(RARE, ">$ARGV[2]")
+  or die "Cannot open $ARGV[2] for writing: $!\n";
+<IN>; # header
+while(<IN>){
+  chomp;
+  my @F = split /\t/, $_;
+  my $chr = $F[0];
+  my $pos = $F[1];
+  next if exists $known_fps{"$chr:$pos"};
+  my $var_depth = $F[6];
+  my $total_depth = $F[7];
+  my @rsid_list = split /,/, $F[10];
+  my @dbsnp_ver_list = split /,/, $F[11];
+  my @maf_list = split /,/, $F[12];
+  my $rare_event = 0;
+  for (my $i = 0; $i <= $#maf_list; $i++){
+    next if $maf_list[$i] eq "-";
+    if($maf_list[$i] eq "NA" or $maf_list[$i] <= $maf_threshold){ 
+      $rare_event = 1 if $rsid_list[$i] eq "novel" or $rsid_list[$i] =~ /^rs/ and $dbsnp_ver_list[$i] >= 132;
+#      print "$chr:$pos $maf_list[$i] $rsid_list[$i] $dbsnp_ver_list[$i] : $rare_event\n";
+      last;
+    }
+  }
+  next unless $rare_event;
+  my $sift = $F[13];
+  my $polyphen = $F[15];
+  my $dna_hgvs = $F[18];
+  next if $F[9] =~ /UTR/;
+  my $transcript_id = $F[19];
+  my $transcript_length = $F[20];
+  my $prot_hgvs = $F[21];
+  next if $prot_hgvs =~ /=$/;
+  my $zygosity_list = $F[22];
+  my $sample_list = $F[$#F];
+  my $phase_list = $F[$#F-1];
+  my $caveats = $F[$#F-2];
+  my $desc = $F[$#F-3];
+  my $tissues = $F[$#F-5];
+  my $gene = $F[17];
+  next if not defined $gene;
+  #for my $gene (split /;/, $gene_list){
+  next if not $gene =~ /\S/; # only work with named genes
+    $gene2chr{$gene} = $chr if not exists $gene2chr{$gene};
+    $gene2desc{$gene} = $desc if not exists $gene2desc{$gene};
+    $pos2info{"$chr:$pos"} = [join(",",@maf_list), $sift, $polyphen, $caveats, $var_depth, $total_depth, $transcript_length, join(",",@rsid_list)] if not exists $pos2info{"$chr:$pos"};
+    $pos2hgvs{"$chr:$pos"}->{$dna_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs};
+    $pos2hgvs{"$chr:$pos"}->{$prot_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs};
+    push @{$pos2hgvs{"$chr:$pos"}->{$dna_hgvs}}, $transcript_id;
+    push @{$pos2hgvs{"$chr:$pos"}->{$prot_hgvs}}, $transcript_id;
+    $gene_vars{$gene} = {} if not exists $gene_vars{$gene};
+    my @zygosities = split /; /, $zygosity_list;
+    my @phases = split /; /, $phase_list;
+    my @samples = split /; /, $sample_list;
+    for(my $i = 0; $i <= $#samples; $i++){
+      my $sample = $samples[$i];
+      my $phase = $phases[$i];
+      my $zygosity = $zygosities[$i];
+      $gene_vars{$gene}->{$sample} = {} if not exists $gene_vars{$gene}->{$sample};
+      $gene_vars{$gene}->{$sample}->{$pos} = [$zygosity, $phase];
+      $sample_count{$sample}++;
+  #  }
+ }
+}
+close(IN);
+
+print NOVEL "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tdbSNP ID\tHGVS DNA\tHGVS AA\n";
+print RARE "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tidbSNP ID\tHGVS DNA\tHGVS AA\n";
+my @sample_names = $trio ? qw(Father Mother Child) : sort keys %sample_count;
+for my $gene (sort keys %gene_vars){
+  #next unless scalar(keys %{$gene_vars{$gene}}) == scalar(@sample_names); # require complete gene penetrance
+  my $chr = $gene2chr{$gene};
+  my %rare_phase;
+  my %rare_events;
+  my %rare_samples;
+  my %sample_counts;
+  # For trios, it's assumed to be in the order: father, mother, child
+  for my $sample_index (0..$#sample_names){
+    my $sample = $sample_names[$sample_index];
+    next unless exists $gene_vars{$gene}->{$sample}; 
+    for my $pos (keys %{$gene_vars{$gene}->{$sample}}){
+      next unless $all_variants or grep /p\.|c\.\d+_\d+\+\d|c\.\d+\+[12]_|c\.\d+-[12]_|c\.\d+-\d+_\d+$/, keys %{$pos2hgvs{"$chr:$pos"}};
+      print "Checking $chr:$pos\n";
+      # For now, only look for homo rare or compound het (known from explicit haplotype phasing data provided, or trio)
+      if($gene_vars{$gene}->{$sample}->{$pos}->[0] =~ /homo/){ 
+        my $maf_list = $pos2info{"$chr:$pos"}->[0];
+        for my $maf (split /,/, $maf_list){
+          next if $maf eq "-";
+          if($trio){ # only report if not homozyguous in either parent
+            next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} and $gene_vars{$gene}->{$sample_names[0]}->{$pos}->[0] =~ /homo/ or 
+                    exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos} and $gene_vars{$gene}->{$sample_names[1]}->{$pos}->[0] =~ /homo/;
+          }
+          if($maf eq "NA"){
+            if($pos2info{"$chr:$pos"}->[1] eq "1"){ # SIFT says its benign
+              $rare_events{$pos} = "benign homo novel variant";
+            }
+            else{
+              $rare_events{$pos} = "homo novel variant";
+            }
+            $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
+            push @{$rare_samples{$pos}}, $sample;
+            $sample_counts{$sample}++;
+            last;
+          }
+          # if here, it's rare, not novel
+          next if is_benign(\%pos2info, $chr, $pos);
+          # if here, rare and maybe not benign
+          $rare_events{$pos} = "homo rare variant";
+          $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
+          push @{$rare_samples{$pos}}, $sample;
+          $sample_counts{$sample}++;
+        }
+      } # end homo
+      # het novel (if dominant model enabled), possibly non-benign
+      elsif($maybe_dominant and $pos2info{"$chr:$pos"}->[0] =~ /NA/ and $pos2info{"$chr:$pos"}->[7] =~ /novel/){
+        next if $pos2info{"$chr:$pos"}->[3] =~ /non-unique/; # or is_benign(\%pos2info, $chr, $pos); # has mapping caveat or is almost definitely benign
+        if($trio){ # only report if not present in either parent
+            next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} or  
+                    exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos};
+        }
+        $rare_events{$pos} = "het novel variant, possibly non-benign under dominant disease model";
+        $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
+        push @{$rare_samples{$pos}}, $sample;
+        $sample_counts{$sample}++;
+      }
+      # check if two rare het variants are trans in a sample, using phasing info
+      # the phase info should look like 
+      # A-chrX:134986768-134986769
+      # B-chrX:134986768-134986769
+      # or, in the case of trios we have
+      # Mother-GENENAME:1-length
+      # Father-GENENAME:1-length
+      # todo: handle case where more than one type of event happens at one position
+      elsif($trio or $gene_vars{$gene}->{$sample}->{$pos}->[1] =~ /^(\S+?)-(\S+):(\d+)-(\d+)/){
+        #next if is_benign(\%pos2info, $chr, $pos); # don't care about benign rare hets
+        my $novel = $pos2info{"$chr:$pos"}->[0] =~ /NA/;
+        my $phase_group = $trio ? get_phase_group($gene_vars{$gene}->{$sample_names[0]}, $gene_vars{$gene}->{$sample_names[1]}, $pos) : $1; # if trio, need to determine of if the allele came from the father or mother
+        #next if $phase_group eq "Either";
+        my $phase_chr = $trio ? $gene2chr{$gene} : $2;
+        my $phase_start = $trio ? 1 : $3;
+        my $phase_end = $trio ? $pos2info{"$chr:$pos"}->[6] : $4;
+        if(exists $rare_phase{"$phase_chr:$phase_start-$phase_end"}){
+          my $other_group = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[0];
+          next if $other_group eq $phase_group; # rare events on same allele...ignore for now as LD artifact (todo: revisit this???)
+          my $other_pos = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[1];
+          my $other_novel = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[2];
+          my $other_sample = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[3];
+          $rare_events{$pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$other_pos";
+          $rare_events{$other_pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$pos";
+          $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
+          $rare_samples{$other_pos} = [] unless exists $rare_samples{$pos};
+          push @{$rare_samples{$pos}}, $sample;
+          push @{$rare_samples{$other_pos}}, $other_sample;
+          # todo: could have more than two rare trans hets...how to report? (all reported aganist first one right now)
+          $sample_counts{$sample}++;
+          $sample_counts{$other_sample}++;
+        }
+        else{
+          $rare_phase{"$phase_chr:$phase_start-$phase_end"} = [$phase_group, $pos, $novel, $sample];  # note as first rare allele, in case we find another one later in the same gene
+        }
+      }
+    }
+  }
+  # Don't have rare events in all samples?
+  next if $trio and not exists $sample_counts{"Child"};
+  my $num_missing = scalar(@sample_names) - scalar(keys %sample_counts);
+  my $reason = "";
+  if(not $trio and $num_missing){
+    next if $num_allowed_missing < $num_missing;
+    # todo: check if missing from a sample due to low coverage
+    my @missing_list;
+    for my $name (@sample_names){
+      push @missing_list, $name unless exists $sample_counts{$name};
+    }
+    $reason = "Rare events for this gene are not found in all cohort samples (missing ". join(", ", @missing_list).") / ";
+    # next;
+  }
+
+  my $novelty = 0;
+  my $num_caveats = 0;
+  my $assume_dominance = 0;
+  my $benign = 0;
+  my $gene_length;
+  my @protein_events;
+  for my $pos (sort {$a <=> $b} keys %rare_events){
+    $novelty++ if $rare_events{$pos} eq "homo novel variant" or $rare_events{$pos} =~ /^het novel/; 
+    $assume_dominance++ if $rare_events{$pos} =~ /dominant/; 
+    $benign++ if $rare_events{$pos} =~ /benign/; 
+    $num_caveats++ if $pos2info{"$chr:$pos"}->[3] =~ /\S/; 
+    $gene_length = $pos2info{"$chr:$pos"}->[6] if not defined $gene_length; 
+    my @dna_info;
+    my @prot_info;
+    for my $hgvs (sort keys %{$pos2hgvs{"$chr:$pos"}}){
+      if($hgvs =~ /^p\./){
+        push @prot_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs";
+      }
+      else{
+        push @dna_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs";
+      }
+    }
+    push @protein_events, "\t$pos\t". join("/", @{$rare_samples{$pos}}). "\t". $rare_events{$pos}. "\t". join("\t", @{$pos2info{"$chr:$pos"}}). 
+                                "\t". join(";", @dna_info). "\t". join(";", @prot_info). "\n";
+  }
+  if($novelty){
+    if($gene_length > $gene_length_threshold){
+      $reason .= "Exceptionally long gene, prone to random novel events at heterogeneous loci";
+    }
+    elsif($num_caveats){
+      if($num_caveats+1 >= @protein_events or $num_caveats >= 0.9*@protein_events){
+        $reason .= "Most variant calls for this gene have caveats";
+      }
+      elsif($assume_dominance){
+        my $num_events += scalar(@protein_events)-$assume_dominance;
+        if($num_caveats+1 >= $num_events or $num_caveats >= 0.9*$num_events){
+          $reason .= "Apart from novel hets, most variant calls for this gene have caveats";
+        }
+      }
+    }
+  }
+  elsif($benign == @protein_events){
+    $reason .= "The gene probably only contains benign variants";
+  }
+  else{
+    $reason .= "Variants found in the general population, albeit rarely";
+  }
+
+  # fix formatting if missing sample is the only problem
+  $reason =~ s( / $)();
+
+  if($novelty and not $reason){
+    print NOVEL "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\n", @protein_events;
+  }
+  else{
+    print RARE "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\t$reason\n", @protein_events;
+  }
+}
+
+sub get_phase_group{
+  my ($father_variants, $mother_variants, $pos) = @_;
+
+  if(defined $father_variants and exists $father_variants->{$pos}){
+    if(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){
+      return "Either"; # in future, trio-based phasing that models meiotic recombination may help here (e.g PHASE).
+    }
+    else{
+      return "Father";
+    }
+  }
+  elsif(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){
+    return "Mother";
+  }
+  else{
+    return "De_novo";
+  }
+}
+
+sub is_benign{
+  my ($pos2info, $chr, $pos) = @_;
+  if($pos2info->{"$chr:$pos"}->[1] eq "1"){
+    return 1; # sift thinks it's completely benign
+  }
+  elsif($pos2info->{"$chr:$pos"}->[1] eq "NA"){
+    if($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # sift doesn't know, and polyphen thinks it might be bad
+      return 0;
+    }
+  }
+  elsif($pos2info->{"$chr:$pos"}->[1] < 0.06){ # sift thinks it might be bad
+    return 0;
+  }
+  elsif($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # polyphen thinks it might be bad
+    return 0;
+  }
+  return 1; # nobody thinks it's bad
+}