Mercurial > repos > yusuf > associate_phenotypes

diff associate_variant_phenotypes @ 0:6411ca16916e default tip
initial commit
author: Yusuf Ali <ali@yusuf.email>
date: Wed, 25 Mar 2015 13:23:29 -0600
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/associate_variant_phenotypes	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,366 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Math::CDF qw(pchisq); # chi square for calculating Fisher's method of combining p-values
+use File::Basename;
+my $dirname = dirname(__FILE__);
+
+# configuration file stuff
+my %config;
+my $tool_data = shift @ARGV;
+if(not -e "$tool_data/associate_phenotypes.loc" ){
+  system("cp $dirname/tool-data/associate_phenotypes.loc $tool_data/associate_phenotypes.loc") >> 8 and die "Could not create config file: $!\n";
+}
+open CONFIG, '<', "$tool_data/associate_phenotypes.loc";
+while(<CONFIG>){
+  next if $_ =~ /^#/;
+  (my $key, my $value) = split(/\s+/,$_);
+  $config{$key} = $value;
+}
+close CONFIG;
+my $dbs_dir = $config{"dbs_dir"};
+
+@ARGV == 6 or die "Usage: $0 <outfiles prefix> <annotated input> <preselected gene list of interest> <human literature terms> <mouse knockout terms> <gene ontology terms>\n";
+
+my $final_confident_outfiles_prefix = shift @ARGV;
+my $confident_input = shift @ARGV;
+my $preselected_genes_file = shift @ARGV;
+my $human_lit_file = shift @ARGV;
+my $mouse_knockout_file = shift @ARGV;
+my $gene_ontology_file = shift @ARGV;
+
+my @genes;
+if(-e $preselected_genes_file){
+  open(GENES, $preselected_genes_file)
+    or die "Cannot open $preselected_genes_file for reading: $!\n";
+  while(<GENES>){
+    chomp;
+    next if /^#/ or not /\S/;
+    s/^\s+|\s+$//g; # get rid of trailing or leading spaces
+    push @genes, $_;
+  }
+  close(GENES);
+}
+else{
+  @genes = split / or /, $preselected_genes_file;
+}
+my %genes;
+for my $g (@genes){
+  $genes{lc($g)} = 1;
+}
+
+my @human_lit_query;
+if(-e $human_lit_file){
+  open(HUMAN, $human_lit_file)
+    or die "Cannot open $human_lit_file for reading: $!\n";
+  while(<HUMAN>){
+    chomp;
+    next if /^#/ or not /\S/;
+    s/^\s+|\s+$//g; # get rid of trailing or leading spaces
+    s/\s+/ /g; # normalize any other whitespace
+    # to do: stem query terms? exclude stop words?
+    push @human_lit_query, $_;
+  }
+  close(HUMAN);
+}
+else{
+  @human_lit_query = split / or /, $human_lit_file;
+}
+my $human_lit_query = join(" or ", @human_lit_query, @genes);
+
+my @mouse_knockout_query;
+if(-e $mouse_knockout_file){
+  open(MOUSE, $mouse_knockout_file)
+    or die "Cannot open $mouse_knockout_file for reading: $!\n";
+  while(<MOUSE>){
+    chomp;
+    next if /^#/ or not /\S/;
+    s/^\s+|\s+$//g; # get rid of trailing or leading spaces
+    s/\s+/ /g; # normalize any other whitespace
+    # to do: stem query terms? exclude stop words?
+    push @mouse_knockout_query, $_;
+  }
+  close(MOUSE);
+}
+else{
+  @mouse_knockout_query = split / or /, $mouse_knockout_file;
+}
+my $mouse_knockout_query = join(" or ", @mouse_knockout_query);
+
+my @go_query;
+if(-e $gene_ontology_file){
+  open(GO, $gene_ontology_file)
+    or die "Cannot open $gene_ontology_file for reading: $!\n";
+  while(<GO>){
+    chomp;
+    next if /^#/ or not /\S/;
+    s/^\s+|\s+$//g; # get rid of trailing or leading spaces
+    s/\s+/ /g; # normalize any other whitespace
+    # to do: stem query terms? exclude stop words?
+    push @go_query, $_;
+  }
+}
+else{
+  @go_query = split / or /, $gene_ontology_file;
+}
+my $go_query = join(" or ", @go_query);
+close(GO);
+
+my @cmds;
+
+if($human_lit_query){
+  # do pubmed first because it has to potentially download references from the internet, so better to do this with just a couple concurrent rather than a lot, which would stress the remote iHOP server
+  push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/IHOP/ PubMed $confident_input - '$human_lit_query'";
+  push @cmds, "$dirname/filter_by_susceptibility_loci_pipe $dbs_dir/GWAS/gwascatalog.txt - - '$human_lit_query'";
+  push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/OMIM/omim.txt. OMIM - - '$human_lit_query'";
+  push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/ClinVar/ClinVarFullRelease.xml. ClinVar - - '$human_lit_query'";
+  push @cmds, "$dirname/filter_by_human_phenotype_ontology_pipe $dbs_dir/HPO - - '$human_lit_query'";
+}
+
+if($mouse_knockout_query or $human_lit_query){
+  if($mouse_knockout_query){
+    if($human_lit_query){
+      $mouse_knockout_query .= " or $human_lit_query";
+    }
+  }
+  else{
+    $mouse_knockout_query = $human_lit_query;
+  }
+  if($human_lit_query){
+    push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 - - '$mouse_knockout_query'";
+  }
+  else{
+    push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 $confident_input - '$mouse_knockout_query'"
+  }
+}
+
+if($go_query or $human_lit_query){
+  if($go_query){
+    if(@human_lit_query){
+      $go_query .= " or ".join(" or ", @human_lit_query);
+    }
+  }
+  else{
+    $go_query = join(" or ", @human_lit_query);
+  }
+  if($mouse_knockout_query or $human_lit_query){
+    push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA - - '$go_query'";
+  }
+  else{
+    push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA $confident_input - '$go_query'";
+  }
+}
+
+&print_final_output($final_confident_outfiles_prefix, @cmds);
+
+# Use Fisher's Method to combine p-values from various phenotype sources into a single score for ranking
+# This is an okay method to use (rather than something more complicated like Brown's method), because our
+# experience with real queries is that there is surprsingly little correlation (Spearman's rank or Kendall's tau) between 
+# the p-values for different sources (primary or curated secondary).
+sub print_final_output{
+  my ($final_output_prefix, @cmds) = @_;
+
+  my $cmd = join("|", @cmds). "|"; # pipe output so we read the stream in the handle below
+  open(ORIG, $cmd)
+    or die "Could not run '$cmd': $!\n";
+  my $header = <ORIG>;
+  chomp $header;
+  my @orig_header = split /\t/, $header;  
+  my ($chr_column, $pos_column, $gene_column, $hgvs_aa_column, $maf_column, $srcs_column, @pvalue_columns, @pheno_match_columns);
+  for(my $i = 0; $i <= $#orig_header; $i++){
+    if($orig_header[$i] eq "Chr"){
+      $chr_column = $i;
+    }
+    elsif($orig_header[$i] eq "DNA From"){
+      $pos_column = $i;
+    }
+    elsif($orig_header[$i] eq "Gene Name"){
+      $gene_column = $i;
+    }
+    elsif($orig_header[$i] eq "Protein HGVS"){
+      $hgvs_aa_column = $i;
+    }
+    elsif($orig_header[$i] eq "Pop. freq."){
+      $maf_column = $i;
+    }
+    elsif($orig_header[$i] eq "Sources"){
+      $srcs_column = $i;
+    }
+    elsif($orig_header[$i] =~ /p-value/){ # columns of pheno association with a stat
+       push @pvalue_columns, $i;
+    }
+    elsif($orig_header[$i] =~ /\(matching/){
+       push @pheno_match_columns, $i;
+    }
+  }
+  if(not defined $chr_column){
+    die "Could not find the 'Chr' column in the header, aborting ($header)\n";
+  }
+  elsif(not defined $pos_column){
+    die "Could not find the 'DNA From' column in the header, aborting ($header)\n";
+  }
+  elsif(not defined $hgvs_aa_column){
+    die "Could not find the 'Protein HGVS' column in the header, aborting ($header)\n";
+  }
+  elsif(not defined $maf_column){
+    die "Could not find the 'Pop. freq.' column in the header, aborting ($header)\n";	
+  }
+  # Sources is optional
+
+  # all other headers from other output files generated will be appended to the original ones
+  my @final_header = (@orig_header, "Combined phenotype relevance P-value");
+  if(@genes){
+    push @final_header, "Targeted Gene?";
+  }
+  my %lines; # chr -> position -> [dataline1, dataline2, ...]
+  my %source; # no. lines per variant source
+  while(<ORIG>){
+      chomp;
+      next unless /\S/; # ignore blank lines
+      my @F = split /\t/, $_, -1; # keep trailing blank fields
+      my $chr = $F[$chr_column];
+      $chr =~ s/^chr//; # helps for sorting purposes
+      my $pos = $F[$pos_column];
+      $pos =~ s/-.*$//; # CNVs have a range
+      $lines{$chr} = {} if not exists $lines{$F[$chr_column]};
+      $lines{$chr}->{$pos} = [] if not exists $lines{$F[$chr_column]}->{$pos};
+      my @final_dataline = @F;  # fields that are the same in all files since they were in the original
+      for(my $i = 0; $i < $#final_dataline; $i++){
+        $final_dataline[$i] = "" if not defined $final_dataline[$i];
+      }
+      # Create aggregate phenotype relevance score using Fisher's method
+      # A combined p-value for k p-values (P1...Pk) is calculated using a chi-square value (with 2k degrees of freedom) derived by -2*sum(ln(Pi), i=1..k)
+      my $chi_sq = 0;
+      my $num_pvalues = 0;
+      my $last_pvalue = 1;
+      for my $pvalue_index (@pvalue_columns){
+        next if $F[$pvalue_index] eq "";
+        $last_pvalue = $F[$pvalue_index];
+        $F[$pvalue_index] = 0.00001 if not $F[$pvalue_index]; # avoids log(0) issue
+        $num_pvalues++;
+        $chi_sq += log($F[$pvalue_index]);
+      }
+      my $fisher_pvalue = 1;
+      if($num_pvalues > 1){
+        $chi_sq *= -2;
+        my $p = pchisq($chi_sq, 2*scalar(@pvalue_columns));
+        if(not defined $p){
+          print STDERR "($num_pvalues) No X2 test value for $chi_sq (";
+          for my $pvalue_index (@pvalue_columns){
+            if($F[$pvalue_index] eq ""){print STDERR "NA "}
+            else{print STDERR $F[$pvalue_index], " "}
+          }
+          print STDERR ")\n$_\n";
+        }
+        $fisher_pvalue = 1-$p;
+      }
+      elsif($num_pvalues == 1){
+        $fisher_pvalue = $last_pvalue; # no multiple testing correction
+      }
+      else{
+        for my $match_column (@pheno_match_columns){
+          next if $F[$match_column] eq ""; # give a token amount of positive score to ontology term matches
+          for my $match (split /\/\/|;/, $F[$match_column]){
+            last if $fisher_pvalue <= 0.001; # only make better if not realy close to zero anyway
+            $fisher_pvalue -= 0.001;
+          }
+        }
+      }
+      push @final_dataline, abs($fisher_pvalue);
+      if(@genes){
+        push @final_dataline, (grep({exists $genes{$_}} split(/; /, lc($F[$gene_column]))) ? "yes" : "no");
+      }
+      push @{$lines{$chr}->{$pos}}, \@final_dataline;
+
+      next unless defined $srcs_column and $F[$srcs_column] =~ /(?:^|\+| )(\S+?)(?=;|$)/;
+      $source{$1}++;
+  }
+
+  my @outfiles = ("$final_output_prefix.novel.hgvs.txt", "$final_output_prefix.very_rare.hgvs.txt", "$final_output_prefix.rare.hgvs.txt", "$final_output_prefix.common.hgvs.txt");
+  open(OUT_NOVEL, ">$outfiles[0]")
+    or die "Cannot open $outfiles[0] for writing: $!\n";
+  open(OUT_VERY_RARE, ">$outfiles[1]")
+    or die "Cannot open $outfiles[1] for writing: $!\n";
+  open(OUT_RARE, ">$outfiles[2]")
+    or die "Cannot open $outfiles[2] for writing: $!\n";
+  open(OUT_COMMON, ">$outfiles[3]")
+    or die "Cannot open $outfiles[3] for writing: $!\n";
+  print OUT_NOVEL join("\t", @final_header), "\n";
+  print OUT_VERY_RARE join("\t", @final_header), "\n";
+  print OUT_RARE join("\t", @final_header), "\n";
+  print OUT_COMMON join("\t", @final_header), "\n";
+  my @sorted_chrs = sort {$a =~ /^\d+$/ and $b =~ /^\d+$/ and $a <=> $b or $a cmp $b} keys %lines;
+  for my $chr (@sorted_chrs){
+    for my $pos (sort {$a <=> $b} keys %{$lines{$chr}}){
+      my $datalines_ref = $lines{$chr}->{$pos};
+      # The following sorting puts all protein coding effect for a variant before non-coding ones
+      my @sorted_dataline_refs = sort {$a ne "NA" and $b ne "NA" and $a->[$hgvs_aa_column] cmp $a->[$hgvs_aa_column] or $b cmp $a} @$datalines_ref;
+      for my $dataline_ref (@sorted_dataline_refs){
+        next unless defined $dataline_ref;
+        my $maf = $dataline_ref->[$maf_column];
+        my $tot_line_length = 0;
+        for(my $i = 0; $i < $#{$dataline_ref}; $i++){
+          if(not defined $dataline_ref->[$i]){
+            $dataline_ref->[$i] = ""; # so we don't get crappy warnings of undefined values
+          }
+          else{
+            $tot_line_length += length($dataline_ref->[$i]);
+          }
+          $tot_line_length++; # the tab
+        }
+        if($tot_line_length > 32000){ # Excel limit of 32767 characters in a cell. Also, undocumented bug that entire import line cannot exceeed this length. 
+                                      # If we don't truncate, the rest of the line (including entire contents of cells to the right) are unceremoniously dumped.
+                                      # Note that personal experience has shown that the limit is actually a bit below this, so rounding down to the nearest 1000 for safety
+          my $overage = $tot_line_length - 32000;
+          my $sum_of_large_cells = 0;
+          my $num_large_cells = 0;
+          for(my $i = 0; $i <= $#{$dataline_ref}; $i++){ # remove contents from the largest cells
+            if(length($dataline_ref->[$i]) > 3200){
+              $sum_of_large_cells += length($dataline_ref->[$i]); # all cells that are at least 10% of the max
+              $num_large_cells++;
+            }
+          }
+          my $cell_max_alloc = int((32000-($tot_line_length-$sum_of_large_cells))/$num_large_cells);
+          for(my $i = 0; $i <= $#{$dataline_ref}; $i++){ # truncate the bigger than average ones
+            if(length($dataline_ref->[$i]) > $cell_max_alloc){
+              $dataline_ref->[$i] = substr($dataline_ref->[$i], 0, $cell_max_alloc-37)."[...remainder truncated for length]";
+            }
+          }
+        }
+        if($maf eq "NA"){
+          print OUT_NOVEL join("\t", @$dataline_ref), "\n";
+        }
+        if($maf eq "NA" or $maf < 0.005){
+          print OUT_VERY_RARE join("\t", @$dataline_ref), "\n";
+        }
+        if($maf eq "NA" or $maf < 0.05){
+          print OUT_RARE join("\t", @$dataline_ref), "\n";
+        }
+        print OUT_COMMON join("\t", @$dataline_ref), "\n";
+      }
+    }
+  }
+  close(OUT_NOVEL);
+  close(OUT_VERY_RARE);
+  close(OUT_RARE);
+  close(OUT_COMMON);
+
+  # Print per-source tables (e.g. for each patient in a cohort)
+  for my $src (keys %source){
+    for my $outfile (@outfiles){
+      open(IN, $outfile)
+        or die "cannot open $outfile for reading: $!\n";
+      my $src_outfile = $outfile;
+      $src_outfile =~ s/$final_output_prefix/$final_output_prefix-$src/;
+      open(OUT, ">$src_outfile")
+        or die "Cannot open $src_outfile for writing: $!\n";
+      print OUT scalar(<IN>); # header line
+      while(<IN>){
+        print OUT $_ if /(?:^|\+| )($src)(?=;|$)/; 
+      }
+      close(OUT);
+    }
+  }
+}
+
author	Yusuf Ali <ali@yusuf.email>
date	Wed, 25 Mar 2015 13:23:29 -0600
parents
children