changeset 0:6411ca16916e default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:23:29 -0600
parents
children
files AssociatePhenotypes.xml associate_variant_phenotypes filter_by_gene_ontology_pipe filter_by_human_phenotype_ontology_pipe filter_by_index_gamma filter_by_mouse_knockout_pipe filter_by_susceptibility_loci_pipe tool-data/associate_phenotypes.loc
diffstat 8 files changed, 1769 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/AssociatePhenotypes.xml	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,128 @@
+<?xml version="1.0"?>
+
+<tool id="hgvs_assoc_phenos" name="Associate phenotypes to an HGVS table">
+  <description>based on the medical literature</description>
+  <version_string>echo 1.0.0</version_string>
+  <command interpreter="perl">associate_variant_phenotypes $__tool_data_path__ pheno $input_hgvs_table 
+	## Handle preselected gene list of interest
+	#if $preselectedGenesSource.source == "file":
+		$preselectedGenesSource.file_of_genenames
+	#else:
+		#set $glist = str($preselectedGenesSource.genename_list).replace("__cr____cn__", " or ")
+		"$glist"
+	#end if
+
+	## Handle human literature terms
+	#if $litQuerySource.source == "file":
+		$litQuerySource.file_of_phenotypes 
+	#else:
+		#set $qlist = str($litQuerySource.phenotype_list).replace("__cr____cn__", " or ")
+		"$qlist"
+	#end if
+
+        ## Handle human phenotype ontology query
+	##if $hpQuerySource.source == "file":
+		##$hpQuerySource.file_of_mpterms
+	##else:
+		##set $hplist = str($hpQuerySource.autocomplete_OLS_HP).replace(";", " or ")
+		##"$hplist"
+	##end if
+
+        ## Handle mouse knockout query (Mammalian Phenotype Ontology)
+	#if $mpQuerySource.source == "file":
+		$mpQuerySource.file_of_mpterms
+	#else:
+		#set $mplist = str($mpQuerySource.autocomplete_OLS_MP).replace(";", " or ")
+		"$mplist"
+	#end if
+
+	## Handle gene ontology terms
+	#if $goQuerySource.source == "file":
+		$goQuerySource.file_of_goterms
+        #else:
+	#set $golist = str($goQuerySource.autocomplete_OLS_GO).replace(";", " or ")
+		"$golist"
+	#end if
+  </command>
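+  <!-- Illustrative only (a hedged sketch, not something Galaxy itself emits): with pasted gene,
+       phenotype, Mammalian Phenotype and Gene Ontology lists, the Cheetah template above expands
+       to a command line along the lines of
+         associate_variant_phenotypes <tool data path> pheno <input HGVS table>
+           "ADH1 or ADH1B" "developmental delay or hypotonia" "abnormal gait" "synapse"
+       i.e. the output prefix "pheno", the input table, then the preselected genes, literature
+       terms, Mammalian Phenotype terms and Gene Ontology terms, each pasted list joined with
+       " or ". The gene names and query terms here are made-up examples. -->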
+
+  <inputs>
+    <param name="outfiles_prefix" type="text" label="Prefix for output file names"/>
+    <param format="achri_annotated_snp_table" name="input_hgvs_table" type="data" label="Basic or functionally annotated HGVS variant table"/>
+    <conditional name="litQuerySource">
+       <param name="source" type="select" label="How would you like to specify the phenotypes of interest?">
+          <option value="list">A list</option>
+          <option value="file">A file</option>
+       </param>
+       <when value="file">
+         <param format="text" name="file_of_phenotypes" type="data" label="Text file with one phenotype per line, from most to least important" help="Phenotypes can have boolean operators to allow word order swaps. e.g. 'Develop and delay' will match both 'delayed development' and 'developmental delay'."/>
+       </when>
+       <when value="list">
+         <param name="phenotype_list" type="text" area="True" label="One phenotype per line, from most to least important" help="Phenotypes can have boolean operators to allow word order swaps. e.g. 'Develop AND delay' will match both 'delayed development' and 'developmental delay'."/>
+       </when>
+    </conditional>
+
+    <conditional name="preselectedGenesSource">
+       <param name="source" type="select" label="How would you like to specify preselected genes of interest?">
+          <option value="list">A list</option>
+          <option value="file">A file</option>
+       </param>
+       <when value="file">
+         <param format="text" name="file_of_genenames" type="data" label="Text file with one upper case gene name per line" help="It is recommended to include gene name synonyms to maximize the chance of reference recovery"/>
+       </when>
+       <when value="list">
+         <param name="genename_list" type="text" area="True" label="One upper case gene name per line, e.g. ADH1" help="It is recommended to include gene name synonyms to maximize the chance of reference recovery"/>
+       </when>
+     </conditional>
+
+    <!--<conditional name="hpQuerySource">
+	<param name="source" type="select" label="How would you like to specify Human Phenotype Ontology terms of interest?">
+		<option value="list">A list</option>
+		<option value="file">A file</option>
+	</param>
+	<when value="file">
+		<param format="text" name="file_of_hpterms" type="data" label="Text file with one Human Phenotype term (text) per line"/>
+	</when>
+        <when value="list">
+		<param name="autocomplete_OLS_HP" type="text" label="Semi-colon separated list of HP terms (with autocomplete)" help="For better search results, do not type punctuation or symbols. For example, if you are looking for 4'-(L-tryptophan), try typing 4 L tryp"/>
+	</when> 
+    </conditional>-->
+
+    <conditional name="mpQuerySource">
+	<param name="source" type="select" label="How would you like to specify Mammalian Phenotype terms of interest?">
+		<option value="list">A list</option>
+		<option value="file">A file</option>
+	</param>
+	<when value="file">
+		<param format="text" name="file_of_mpterms" type="data" label="Text file with one Mammalian Phenotype term (text) per line"/>
+	</when>
+        <when value="list">
+		<param name="autocomplete_OLS_MP" type="text" label="Semi-colon separated list of MP terms (with autocomplete)" help="For better search results, do not type punctuation or symbols. For example, if you are looking for 4'-(L-tryptophan), try typing 4 L tryp"/>
+	</when> 
+    </conditional>
+
+    <conditional name="goQuerySource">
+	<param name="source" type="select" label="How would you like to specify Gene Ontology terms of interest?">
+		<option value="list">A list</option>
+		<option value="file">A file</option>
+	</param>
+	<when value="file">
+		<param format="text" name="file_of_goterms" type="data" label="Text file with one gene ontology term (text) per line"/>
+	</when>
+        <when value="list">
+		<param name="autocomplete_OLS_GO" type="text" label="Semi-colon separated list of GO terms (with autocomplete)" help="For better search results, do not type punctuation or symbols. For example, if you are looking for 4'-(L-tryptophan), try typing 4 L tryp"/>
+	</when> 
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="achri_annotated_snp_table" name="out_hgvs_table" type="data" label="${outfiles_prefix} HGVS all variants table with geno-pheno correlates" from_work_dir="pheno.common.hgvs.txt"/>
+  </outputs>
+
+  <tests>
+  </tests>
+
+  <help>
+  This tool adds columns to an HGVS table that include all of the literature references from OMIM, PubMed, ClinVar, the Human Phenotype
+  Ontology, the Mouse Knockout Phenotypes, and Gene Ontology that match a given set of clinical phenotype query terms. A combined
+  probability of gene-phenotype association is calculated to help the user rank potentially causative genes for presumed genetic disorders.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/associate_variant_phenotypes	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,366 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Math::CDF qw(pchisq); # chi square for calculating Fisher's method of combining p-values
+use File::Basename;
+my $dirname = dirname(__FILE__);
+
+# configuration file stuff
+my %config;
+my $tool_data = shift @ARGV;
+if(not -e "$tool_data/associate_phenotypes.loc" ){
+  system("cp $dirname/tool-data/associate_phenotypes.loc $tool_data/associate_phenotypes.loc") >> 8 and die "Could not create config file: $!\n";
+}
+open CONFIG, '<', "$tool_data/associate_phenotypes.loc"
+  or die "Cannot open $tool_data/associate_phenotypes.loc for reading: $!\n";
+while(<CONFIG>){
+  next if $_ =~ /^#/;
+  (my $key, my $value) = split(/\s+/,$_);
+  $config{$key} = $value;
+}
+close CONFIG;
+my $dbs_dir = $config{"dbs_dir"};
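+# The .loc file is a simple whitespace-delimited key/value list; the only key read here is
+# "dbs_dir", e.g. a line such as (the path below is just a placeholder):
+#   dbs_dir	/path/to/phenotype_databases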
+
+@ARGV == 6 or die "Usage: $0 <outfiles prefix> <annotated input> <preselected gene list of interest> <human literature terms> <mouse knockout terms> <gene ontology terms>\n";
+
+my $final_confident_outfiles_prefix = shift @ARGV;
+my $confident_input = shift @ARGV;
+my $preselected_genes_file = shift @ARGV;
+my $human_lit_file = shift @ARGV;
+my $mouse_knockout_file = shift @ARGV;
+my $gene_ontology_file = shift @ARGV;
+
+my @genes;
+if(-e $preselected_genes_file){
+  open(GENES, $preselected_genes_file)
+    or die "Cannot open $preselected_genes_file for reading: $!\n";
+  while(<GENES>){
+    chomp;
+    next if /^#/ or not /\S/;
+    s/^\s+|\s+$//g; # get rid of trailing or leading spaces
+    push @genes, $_;
+  }
+  close(GENES);
+}
+else{
+  @genes = split / or /, $preselected_genes_file;
+}
+my %genes;
+for my $g (@genes){
+  $genes{lc($g)} = 1;
+}
+
+my @human_lit_query;
+if(-e $human_lit_file){
+  open(HUMAN, $human_lit_file)
+    or die "Cannot open $human_lit_file for reading: $!\n";
+  while(<HUMAN>){
+    chomp;
+    next if /^#/ or not /\S/;
+    s/^\s+|\s+$//g; # get rid of trailing or leading spaces
+    s/\s+/ /g; # normalize any other whitespace
+    # to do: stem query terms? exclude stop words?
+    push @human_lit_query, $_;
+  }
+  close(HUMAN);
+}
+else{
+  @human_lit_query = split / or /, $human_lit_file;
+}
+my $human_lit_query = join(" or ", @human_lit_query, @genes);
+
+my @mouse_knockout_query;
+if(-e $mouse_knockout_file){
+  open(MOUSE, $mouse_knockout_file)
+    or die "Cannot open $mouse_knockout_file for reading: $!\n";
+  while(<MOUSE>){
+    chomp;
+    next if /^#/ or not /\S/;
+    s/^\s+|\s+$//g; # get rid of trailing or leading spaces
+    s/\s+/ /g; # normalize any other whitespace
+    # to do: stem query terms? exclude stop words?
+    push @mouse_knockout_query, $_;
+  }
+  close(MOUSE);
+}
+else{
+  @mouse_knockout_query = split / or /, $mouse_knockout_file;
+}
+my $mouse_knockout_query = join(" or ", @mouse_knockout_query);
+
+my @go_query;
+if(-e $gene_ontology_file){
+  open(GO, $gene_ontology_file)
+    or die "Cannot open $gene_ontology_file for reading: $!\n";
+  while(<GO>){
+    chomp;
+    next if /^#/ or not /\S/;
+    s/^\s+|\s+$//g; # get rid of trailing or leading spaces
+    s/\s+/ /g; # normalize any other whitespace
+    # to do: stem query terms? exclude stop words?
+    push @go_query, $_;
+  }
+  close(GO);
+}
+else{
+  @go_query = split / or /, $gene_ontology_file;
+}
+my $go_query = join(" or ", @go_query);
+
+my @cmds;
+
+if($human_lit_query){
+  # do PubMed first because it may have to download references from the internet; better to do this with just a couple of concurrent requests rather than many, which would stress the remote iHOP server
+  push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/IHOP/ PubMed $confident_input - '$human_lit_query'";
+  push @cmds, "$dirname/filter_by_susceptibility_loci_pipe $dbs_dir/GWAS/gwascatalog.txt - - '$human_lit_query'";
+  push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/OMIM/omim.txt. OMIM - - '$human_lit_query'";
+  push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/ClinVar/ClinVarFullRelease.xml. ClinVar - - '$human_lit_query'";
+  push @cmds, "$dirname/filter_by_human_phenotype_ontology_pipe $dbs_dir/HPO - - '$human_lit_query'";
+}
+
+if($mouse_knockout_query or $human_lit_query){
+  if($mouse_knockout_query){
+    if($human_lit_query){
+      $mouse_knockout_query .= " or $human_lit_query";
+    }
+  }
+  else{
+    $mouse_knockout_query = $human_lit_query;
+  }
+  if($human_lit_query){
+    push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 - - '$mouse_knockout_query'";
+  }
+  else{
+    push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 $confident_input - '$mouse_knockout_query'"
+  }
+}
+
+if($go_query or $human_lit_query){
+  if($go_query){
+    if(@human_lit_query){
+      $go_query .= " or ".join(" or ", @human_lit_query);
+    }
+  }
+  else{
+    $go_query = join(" or ", @human_lit_query);
+  }
+  if($mouse_knockout_query or $human_lit_query){
+    push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA - - '$go_query'";
+  }
+  else{
+    push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA $confident_input - '$go_query'";
+  }
+}
+
+&print_final_output($final_confident_outfiles_prefix, @cmds);
+
+# Use Fisher's Method to combine p-values from various phenotype sources into a single score for ranking
+# This is an okay method to use (rather than something more complicated like Brown's method), because our
+# experience with real queries is that there is surprisingly little correlation (Spearman's rank or Kendall's tau) between
+# the p-values for different sources (primary or curated secondary).
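+# Worked example (illustrative numbers only): combining k=3 p-values of 0.05, 0.2 and 0.5 gives
+# X2 = -2*(ln(0.05)+ln(0.2)+ln(0.5)) ≈ 10.6, which under a chi-square distribution with 2k=6
+# degrees of freedom corresponds to a combined p-value of roughly 0.10, i.e. 1-pchisq(10.6, 6)
+# with the Math::CDF pchisq() call used below.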
+sub print_final_output{
+  my ($final_output_prefix, @cmds) = @_;
+
+  my $cmd = join("|", @cmds). "|"; # pipe output so we read the stream in the handle below
+  open(ORIG, $cmd)
+    or die "Could not run '$cmd': $!\n";
+  my $header = <ORIG>;
+  chomp $header;
+  my @orig_header = split /\t/, $header;  
+  my ($chr_column, $pos_column, $gene_column, $hgvs_aa_column, $maf_column, $srcs_column, @pvalue_columns, @pheno_match_columns);
+  for(my $i = 0; $i <= $#orig_header; $i++){
+    if($orig_header[$i] eq "Chr"){
+      $chr_column = $i;
+    }
+    elsif($orig_header[$i] eq "DNA From"){
+      $pos_column = $i;
+    }
+    elsif($orig_header[$i] eq "Gene Name"){
+      $gene_column = $i;
+    }
+    elsif($orig_header[$i] eq "Protein HGVS"){
+      $hgvs_aa_column = $i;
+    }
+    elsif($orig_header[$i] eq "Pop. freq."){
+      $maf_column = $i;
+    }
+    elsif($orig_header[$i] eq "Sources"){
+      $srcs_column = $i;
+    }
+    elsif($orig_header[$i] =~ /p-value/){ # columns of pheno association with a stat
+       push @pvalue_columns, $i;
+    }
+    elsif($orig_header[$i] =~ /\(matching/){
+       push @pheno_match_columns, $i;
+    }
+  }
+  if(not defined $chr_column){
+    die "Could not find the 'Chr' column in the header, aborting ($header)\n";
+  }
+  elsif(not defined $pos_column){
+    die "Could not find the 'DNA From' column in the header, aborting ($header)\n";
+  }
+  elsif(not defined $hgvs_aa_column){
+    die "Could not find the 'Protein HGVS' column in the header, aborting ($header)\n";
+  }
+  elsif(not defined $maf_column){
+    die "Could not find the 'Pop. freq.' column in the header, aborting ($header)\n";	
+  }
+  # Sources is optional
+
+  # all other headers from other output files generated will be appended to the original ones
+  my @final_header = (@orig_header, "Combined phenotype relevance P-value");
+  if(@genes){
+    push @final_header, "Targeted Gene?";
+  }
+  my %lines; # chr -> position -> [dataline1, dataline2, ...]
+  my %source; # no. lines per variant source
+  while(<ORIG>){
+      chomp;
+      next unless /\S/; # ignore blank lines
+      my @F = split /\t/, $_, -1; # keep trailing blank fields
+      my $chr = $F[$chr_column];
+      $chr =~ s/^chr//; # helps for sorting purposes
+      my $pos = $F[$pos_column];
+      $pos =~ s/-.*$//; # CNVs have a range
+      $lines{$chr} = {} if not exists $lines{$chr};
+      $lines{$chr}->{$pos} = [] if not exists $lines{$chr}->{$pos};
+      my @final_dataline = @F;  # fields that are the same in all files since they were in the original
+      for(my $i = 0; $i <= $#final_dataline; $i++){
+        $final_dataline[$i] = "" if not defined $final_dataline[$i];
+      }
+      # Create aggregate phenotype relevance score using Fisher's method
+      # A combined p-value for k p-values (P1...Pk) is calculated using a chi-square value (with 2k degrees of freedom) derived by -2*sum(ln(Pi), i=1..k)
+      my $chi_sq = 0;
+      my $num_pvalues = 0;
+      my $last_pvalue = 1;
+      for my $pvalue_index (@pvalue_columns){
+        next if $F[$pvalue_index] eq "";
+        $last_pvalue = $F[$pvalue_index];
+        $F[$pvalue_index] = 0.00001 if not $F[$pvalue_index]; # avoids log(0) issue
+        $num_pvalues++;
+        $chi_sq += log($F[$pvalue_index]);
+      }
+      my $fisher_pvalue = 1;
+      if($num_pvalues > 1){
+        $chi_sq *= -2;
+        my $p = pchisq($chi_sq, 2*$num_pvalues);
+        if(not defined $p){
+          print STDERR "($num_pvalues) No X2 test value for $chi_sq (";
+          for my $pvalue_index (@pvalue_columns){
+            if($F[$pvalue_index] eq ""){print STDERR "NA "}
+            else{print STDERR $F[$pvalue_index], " "}
+          }
+          print STDERR ")\n$_\n";
+        }
+        $fisher_pvalue = 1-$p;
+      }
+      elsif($num_pvalues == 1){
+        $fisher_pvalue = $last_pvalue; # no multiple testing correction
+      }
+      else{
+        for my $match_column (@pheno_match_columns){
+          next if $F[$match_column] eq ""; # give a token amount of positive score to ontology term matches
+          for my $match (split /\/\/|;/, $F[$match_column]){
+            last if $fisher_pvalue <= 0.001; # only improve the score if it is not already really close to zero
+            $fisher_pvalue -= 0.001;
+          }
+        }
+      }
+      push @final_dataline, abs($fisher_pvalue);
+      if(@genes){
+        push @final_dataline, (grep({exists $genes{$_}} split(/; /, lc($F[$gene_column]))) ? "yes" : "no");
+      }
+      push @{$lines{$chr}->{$pos}}, \@final_dataline;
+
+      next unless defined $srcs_column and $F[$srcs_column] =~ /(?:^|\+| )(\S+?)(?=;|$)/;
+      $source{$1}++;
+  }
+
+  my @outfiles = ("$final_output_prefix.novel.hgvs.txt", "$final_output_prefix.very_rare.hgvs.txt", "$final_output_prefix.rare.hgvs.txt", "$final_output_prefix.common.hgvs.txt");
+  open(OUT_NOVEL, ">$outfiles[0]")
+    or die "Cannot open $outfiles[0] for writing: $!\n";
+  open(OUT_VERY_RARE, ">$outfiles[1]")
+    or die "Cannot open $outfiles[1] for writing: $!\n";
+  open(OUT_RARE, ">$outfiles[2]")
+    or die "Cannot open $outfiles[2] for writing: $!\n";
+  open(OUT_COMMON, ">$outfiles[3]")
+    or die "Cannot open $outfiles[3] for writing: $!\n";
+  print OUT_NOVEL join("\t", @final_header), "\n";
+  print OUT_VERY_RARE join("\t", @final_header), "\n";
+  print OUT_RARE join("\t", @final_header), "\n";
+  print OUT_COMMON join("\t", @final_header), "\n";
+  my @sorted_chrs = sort {$a =~ /^\d+$/ and $b =~ /^\d+$/ and $a <=> $b or $a cmp $b} keys %lines;
+  for my $chr (@sorted_chrs){
+    for my $pos (sort {$a <=> $b} keys %{$lines{$chr}}){
+      my $datalines_ref = $lines{$chr}->{$pos};
+      # The following sorting puts all protein coding effects for a variant before non-coding ones
+      my @sorted_dataline_refs = sort {$a->[$hgvs_aa_column] ne "NA" and $b->[$hgvs_aa_column] ne "NA" and $a->[$hgvs_aa_column] cmp $b->[$hgvs_aa_column] or $b->[$hgvs_aa_column] cmp $a->[$hgvs_aa_column]} @$datalines_ref;
+      for my $dataline_ref (@sorted_dataline_refs){
+        next unless defined $dataline_ref;
+        my $maf = $dataline_ref->[$maf_column];
+        my $tot_line_length = 0;
+        for(my $i = 0; $i < $#{$dataline_ref}; $i++){
+          if(not defined $dataline_ref->[$i]){
+            $dataline_ref->[$i] = ""; # so we don't get crappy warnings of undefined values
+          }
+          else{
+            $tot_line_length += length($dataline_ref->[$i]);
+          }
+          $tot_line_length++; # the tab
+        }
+        if($tot_line_length > 32000){ # Excel limit of 32767 characters in a cell. Also, an undocumented bug means an entire import line cannot exceed this length.
+                                      # If we don't truncate, the rest of the line (including the entire contents of cells to the right) is unceremoniously dropped.
+                                      # Note that personal experience has shown the limit is actually a bit below this, so round down to the nearest 1000 for safety.
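+                                      # Illustrative arithmetic (made-up sizes): for a 40000-character line with two large cells
+                                      # of 15000 and 18000 characters, $sum_of_large_cells below is 33000 and $num_large_cells is 2,
+                                      # so $cell_max_alloc = int((32000-(40000-33000))/2) = 12500 and each large cell is truncated
+                                      # to fit, bringing the whole line back under the 32000-character budget.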
+          my $overage = $tot_line_length - 32000;
+          my $sum_of_large_cells = 0;
+          my $num_large_cells = 0;
+          for(my $i = 0; $i <= $#{$dataline_ref}; $i++){ # remove contents from the largest cells
+            if(length($dataline_ref->[$i]) > 3200){
+              $sum_of_large_cells += length($dataline_ref->[$i]); # all cells that are at least 10% of the max
+              $num_large_cells++;
+            }
+          }
+          my $cell_max_alloc = int((32000-($tot_line_length-$sum_of_large_cells))/$num_large_cells);
+          for(my $i = 0; $i <= $#{$dataline_ref}; $i++){ # truncate the bigger than average ones
+            if(length($dataline_ref->[$i]) > $cell_max_alloc){
+              $dataline_ref->[$i] = substr($dataline_ref->[$i], 0, $cell_max_alloc-37)."[...remainder truncated for length]";
+            }
+          }
+        }
+        if($maf eq "NA"){
+          print OUT_NOVEL join("\t", @$dataline_ref), "\n";
+        }
+        if($maf eq "NA" or $maf < 0.005){
+          print OUT_VERY_RARE join("\t", @$dataline_ref), "\n";
+        }
+        if($maf eq "NA" or $maf < 0.05){
+          print OUT_RARE join("\t", @$dataline_ref), "\n";
+        }
+        print OUT_COMMON join("\t", @$dataline_ref), "\n";
+      }
+    }
+  }
+  close(OUT_NOVEL);
+  close(OUT_VERY_RARE);
+  close(OUT_RARE);
+  close(OUT_COMMON);
+
+  # Print per-source tables (e.g. for each patient in a cohort)
+  for my $src (keys %source){
+    for my $outfile (@outfiles){
+      open(IN, $outfile)
+        or die "cannot open $outfile for reading: $!\n";
+      my $src_outfile = $outfile;
+      $src_outfile =~ s/$final_output_prefix/$final_output_prefix-$src/;
+      open(OUT, ">$src_outfile")
+        or die "Cannot open $src_outfile for writing: $!\n";
+      print OUT scalar(<IN>); # header line
+      while(<IN>){
+        print OUT $_ if /(?:^|\+| )($src)(?=;|$)/; 
+      }
+      close(OUT);
+    }
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_gene_ontology_pipe	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,196 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+@ARGV == 4 or die "Usage: $0 [-q(uiet)] <GO data dir> <hgvs_annotated.txt> <output.txt> <query>\n",
+                  "Where query has the format \"this or that\", \"this and that\", etc.\n",
+                  "GOA and OBO files must be found in the GO data dir\n";
+
+my $go_dir = shift @ARGV;
+my $obo_file = "$go_dir/gene_ontology.1_2.obo";
+my $hgvs_file = shift @ARGV;
+my $out_file = shift @ARGV;
+my $query = shift @ARGV;
+
+# Terms with meronyms and hyponyms in their free text descriptions, causing overgeneralization problems
+my %problematic_terms = ("GO:0033647" => "host intracellular organelle",
+                         "GO:0006996" => "organelle organization",
+                         "GO:0043226" => "organelle",
+                         "GO:0043227" => "membrane-bounded organelle",
+                         "GO:0043229" => "intracellular organelle",
+                         "GO:0043231" => "intracellular membrane-bounded organelle",
+                         "GO:0044384" => "host cell outer membrane",
+                         "GO:0044422" => "organelle part",
+                         "GO:0044446" => "intracellular organelle part",
+                         "GO:0045202" => "synapse",
+                         "GO:0033648" => "host intracellular membrane-bounded organelle");
+
+#$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals
+
+# convert the query to a regex
+my $orig_query = $query;
+my $and_query = 0;
+$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg;
+if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){
+  $and_query = 1;
+}
+$query =~ s/\s+or\s+/|/gi;
+$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow a title-case match for lower-case (letters-only) words; otherwise keep the match case-sensitive, assuming it is a gene name
+#print STDERR "Query regex for GO is $query\n" unless $quiet;
+
+open(OBO, $obo_file)
+  or die "Cannot open $obo_file for reading: $!\n";
+my %matched_go_ids;
+my %go_id2subtypes;
+my %go_id2name;
+my $record_count;
+$/ = "\n[Term]\n";
+<OBO>; # chuck header
+while(<OBO>){
+  next unless /^id:\s*(GO:\d+)/s;
+  my $id = $1;
+  next unless /\nname:\s*(.+?)\s*\n/s;
+  my $name = $1;
+  $go_id2name{$id} = $name;
+  $record_count++;
+  while(/\nis_a:\s*(GO:\d+)/g){
+    my $parent_id = $1;
+    $go_id2subtypes{$parent_id} = [] unless exists $go_id2subtypes{$parent_id};
+    push @{$go_id2subtypes{$parent_id}}, $id;
+  }
+  if(exists $problematic_terms{$id}){
+    if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text
+      my $match = $1;
+      $match =~ tr/\t\n/  /;
+      $match =~ s/ {2,}/ /g;
+      if(not exists $matched_go_ids{$id}){
+        $matched_go_ids{$id} = $match;
+      }
+      elsif($matched_go_ids{$id} !~ /$match/){
+        $matched_go_ids{$id} .= "; $match";
+      }
+    }
+  }
+  elsif(/\b($query)/o){
+    my $match = $1;
+    $match =~ tr/\t\n/  /;
+    $match =~ s/ {2,}/ /g;
+    if(not exists $matched_go_ids{$id}){
+      $matched_go_ids{$id} = $match;
+    }
+    elsif($matched_go_ids{$id} !~ /$match/){
+      $matched_go_ids{$id} .= "; $match";
+    }
+    #print STDERR "Found match $match for $_\n";
+  }
+}
+close(OBO);
+#print STDERR "Found ", scalar(keys %matched_go_ids), "/$record_count go ontology terms matching the query\n";
+
+open(OUT, ">$out_file")
+  or die "Cannot open $out_file for writing: $!\n";
+
+# Implements term subsumption
+my @matched_go_ids = keys %matched_go_ids;
+for(my $i = 0; $i <= $#matched_go_ids; $i++){
+  my $go_id = $matched_go_ids[$i];
+  next unless exists $go_id2subtypes{$go_id};
+  for my $sub_type_id (@{$go_id2subtypes{$go_id}}){
+    if(not exists $matched_go_ids{$sub_type_id}){
+      $matched_go_ids{$sub_type_id} = $matched_go_ids{$go_id};
+      push @matched_go_ids, $sub_type_id;
+    }
+  }
+}
+
+$/="\n"; # record separator
+my %gene2go_ids;
+opendir(GOADIR, $go_dir)
+  or die "Cannot read directory $go_dir: $!\n";
+while($_ = readdir(GOADIR)){
+  next if /^\./; # hidden
+  next unless /\.goa$/; # not a goa formatted file
+  my $goa_file = $_;
+  open(GOA, "$go_dir/$goa_file")
+    or die "Cannot open $go_dir/$goa_file for reading: $!\n";
+  #print STDERR "Processing file $goa_file\n";
+  # example line:
+  # UniProtKB	A0ASJ9			GO:0001664	GO_REF:0000033	ISS	PANTHER:PTN000026392	F	G protein alpha subunit AgGq6	A0ASJ9_ANOGA	protein	taxon:7165	20110125	RefGenome
+  while(<GOA>){
+    next if /^!/;  # comment
+    chomp;
+    my @F = split /\t/, $_;
+    next unless $F[2] and $#F > 3; # does it have the gene name and go id fields?
+    my $genename = uc($F[2]); # standardize gene names to upper case
+    $genename =~ s/-//g;
+    my $go_id = $F[4];
+    $gene2go_ids{$genename} = [] unless exists $gene2go_ids{$genename};
+    push @{$gene2go_ids{$genename}}, $go_id;
+  }
+  close(GOA);
+}
+close(GOADIR);
+#print STDERR "Found ", scalar(keys %gene2go_ids), " total genes\n";
+
+# remove genes if they don't have a matching go term
+for my $genename (keys %gene2go_ids){
+  my $keep = 0;
+  for my $go_id (@{$gene2go_ids{$genename}}){
+    if(exists $matched_go_ids{$go_id}){
+      $keep = 1;
+      last;
+    }
+  }
+  delete $gene2go_ids{$genename} unless $keep;
+}
+#print STDERR "Found ", scalar(keys %gene2go_ids), " genes with gene ontology terms matching the query\n" unless $quiet;
+
+$/ = "\n"; # one line at, a time from the HGVS file please!
+open(HGVS, $hgvs_file)
+  or die "Cannot open $hgvs_file for reading: $!\n";
+my $header = <HGVS>;
+chomp $header;
+my @header_columns = split /\t/, $header;
+my $gene_name_column;
+for(my $i = 0; $i <= $#header_columns; $i++){
+  if($header_columns[$i] eq "Gene Name"){
+    $gene_name_column = $i;
+  }
+}
+if(not defined $gene_name_column){
+  die "Could not find 'Gene Name' column in the input header, aborting\n";
+}
+
+print OUT "$header\tQuickGO Gene Ontology Terms (matching $orig_query)\tQuickGO Gene Ontology Terms (other)\n";
+# Check if any of the variants in the annotated HGVS table are in genes whose Gene Ontology annotations match the target GO term list
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  my (@target_gos, @other_gos);
+  for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
+    next unless exists $gene2go_ids{$gene_name};
+    for my $id (@{$gene2go_ids{$gene_name}}){
+      if(exists $matched_go_ids{$id}){
+        push @target_gos, $go_id2name{$id}."($matched_go_ids{$id})";
+      }
+      else{
+        push @other_gos, $go_id2name{$id};
+      }
+    }
+  }
+  if(@target_gos){
+    my (%t,%o);
+    # print unique terms
+    print OUT join("\t", @F, join("; ", sort grep {not $t{$_}++} @target_gos), join("; ", sort grep {not $o{$_}++} @other_gos)), "\n";
+  }
+  else{
+    print OUT join("\t", @F, "", ""), "\n";
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_human_phenotype_ontology_pipe	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,210 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+@ARGV == 4 or die "Usage: $0 [-q(uiet)] <human_phenotype_ontology_dir> <hgvs_annotated.txt> <output.txt> <query>\n",
+                  "Where query has the format \"this or that\", \"this and that\", etc.\n",
+                  "Human phenotype files are available from http://compbio.charite.de/svn/hpo/trunk/src/ontology/human-phenotype-ontology.obo and genes_to_phenotype.txt\n";
+
+my $hpo_dir = shift @ARGV;
+my $obo_file = "$hpo_dir/human-phenotype-ontology.obo";
+my $gene_pheno_file = "$hpo_dir/genes_to_phenotype.txt";
+my $hgvs_file = shift @ARGV;
+my $out_file = shift @ARGV;
+my $query = shift @ARGV;
+
+# Below is a list of ontology terms whose definitions include hyponyms and meronyms, so match only the term title so as not to generate too many false positives
+my %problematic = qw(HP:0003271 Visceromegaly
+                     HP:0000246 Sinusitis);
+# Ontology terms with HPO in the free text, leading to nasty overmatching when searching for gene HPO
+my %self_referencing = (
+       "HP:0000118" => "Phenotypic abnormality",
+       "HP:0000455" => "Broad nasal tip",
+       "HP:0000968" => "Ectodermal dysplasia",
+       "HP:0002652" => "Skeletal dysplasia",
+       "HP:0003812" => "Phenotypic variability",
+       "HP:0003828" => "Variable expressivity",
+       "HP:0003829" => "Incomplete penetrance",
+       "HP:0004472" => "Mandibular hyperostosis",
+       "HP:0004495" => "Thin anteverted nares",
+       "HP:0005286" => "Hypoplastic, notched nares",
+       "HP:0005321" => "Mandibulofacial dysostosis",
+       "HP:0005871" => "Metaphyseal chondrodysplasia",
+       "HP:0007589" => "Aplasia cutis congenita on trunk or limbs",
+       "HP:0007819" => "Presenile cataracts");
+
+# Ignore metadata field words, so we can properly match gene names like HPO :-)
+my %stop_words = ("name:" => 1,
+                  "HPO:" => 1,
+                  "alt_id:" => 1,
+                  "def:" => 1,
+                  "synonym:" => 1,
+                  "EXACT" => 1,
+                  "xref:" => 1,
+                  "UMLS:" => 1,
+                  "is_a:" => 1,
+                  "created_by:" => 1,
+                  "comment:" => 1,
+                  "creation_date:" => 1);
+
+# convert the query to a regex
+my $orig_query = $query;
+my $and_query = 0;
+$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg;
+if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){
+  $and_query = 1;
+}
+$query =~ s/\s+or\s+/|/gi;
+$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow a title-case match for lower-case (letters-only) words; otherwise keep the match case-sensitive, assuming it is a gene name
+#print STDERR "Query regex is $query\n" unless $quiet;
+
+
+open(OBO, $obo_file)
+  or die "Cannot open $obo_file for reading: $!\n";
+my %matched_pheno_ids;
+my %pheno_id2subtypes;
+my %pheno_id2name;
+my $record_count;
+$/ = "\n[Term]\n";
+<OBO>; # chuck header
+while(<OBO>){
+  next unless /^id:\s*(HP:\d+)/s;
+  my $id = $1;
+  next unless /\nname:\s*(.+?)\s*\n/s;
+  my $name = $1;
+  $pheno_id2name{$id} = $name;
+  $record_count++;
+  while(/\nis_a:\s*(HP:\d+)/g){
+    my $parent_id = $1;
+    $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id};
+    push @{$pheno_id2subtypes{$parent_id}}, $id;
+  }
+  s/(UMLS:\S+\s+")(.+?)(?=")/$1.lc($2)/eg;
+  if(exists $problematic{$id}){ # for terms that overmatch because of their descriptions (hyponyms and meronyms included in
+                                # the parent entry), only match the title so as not to generate too many false positives
+    while($name =~ /\b($query)(\S*?:?)/go){
+      next if exists $stop_words{$1.$2};
+      my $match = $1;
+      $match =~ tr/\t\n/  /;
+      $match =~ s/\s{2,}/ /g;
+      if(not exists $matched_pheno_ids{$id}){
+        $matched_pheno_ids{$id} = $match;
+      } 
+      elsif($matched_pheno_ids{$id} !~ /$match/){
+        $matched_pheno_ids{$id} .= "; $match";
+      }
+    }
+  }
+  else{ # normally, match anywhere in the entry
+    while(/\b($query)(\S*?:?)/go){
+      next if defined $2 and exists $stop_words{$1.$2} or $1 eq "HPO" and $self_referencing{$id};
+      my $match = $1;
+      $match =~ tr/\t\n/  /;
+      $match =~ s/\s{2,}/ /g;
+      if(not exists $matched_pheno_ids{$id}){
+        $matched_pheno_ids{$id} = $match;
+      } 
+      elsif($matched_pheno_ids{$id} !~ /\Q$match\E/){
+        $matched_pheno_ids{$id} .= "; $match";
+      }
+      #print STDERR "Match $match for $_\n";
+    }
+  }
+}
+close(OBO);
+#print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n";
+
+
+# Implements term subsumption
+my @matched_pheno_ids = keys %matched_pheno_ids;
+for(my $i = 0; $i <= $#matched_pheno_ids; $i++){
+  my $pheno_id = $matched_pheno_ids[$i];
+  next unless exists $pheno_id2subtypes{$pheno_id};
+  for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){
+    if(not exists $matched_pheno_ids{$sub_type_id}){
+      $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id};
+      push @matched_pheno_ids, $sub_type_id;
+    }
+  }
+}
+
+$/="\n"; # record separator
+my %gene2pheno_ids;
+# Format: entrez-gene-id<tab>entrez-gene-symbol<tab>HPO-Term-Name<tab>HPO-Term-ID
+open(PHENO, $gene_pheno_file)
+  or die "Cannot open $gene_pheno_file for reading: $!\n";
+while(<PHENO>){
+  chomp;
+  my @F = split /\t/, $_;
+  next unless $#F > 2; # does it have the phenotype id field?
+  my $gene = $F[1];
+  my $pheno_id = $F[3];
+  $gene2pheno_ids{$gene} = [] unless exists $gene2pheno_ids{$gene};
+  push @{$gene2pheno_ids{$gene}}, $pheno_id;
+}
+
+# remove genes if they don't have a matching phenotype
+for my $gene (keys %gene2pheno_ids){
+  my $keep = 0;
+  for my $pheno_id (@{$gene2pheno_ids{$gene}}){
+    if(exists $matched_pheno_ids{$pheno_id}){
+      $keep = 1;
+      last;
+    }
+  }
+  delete $gene2pheno_ids{$gene} unless $keep;
+}
+#print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with human phenotype ontology terms matching the query\n" unless $quiet;
+
+$/ = "\n"; # one line at, a time from the HGVS file please!
+open(HGVS, $hgvs_file)
+  or die "Cannot open $hgvs_file for reading: $!\n";
+my $header = <HGVS>;
+chomp $header;
+my @header_columns = split /\t/, $header;
+my $gene_name_column;
+for(my $i = 0; $i <= $#header_columns; $i++){
+  if($header_columns[$i] eq "Gene Name"){
+    $gene_name_column = $i;
+  }
+}
+if(not defined $gene_name_column){
+  die "Could not find 'Gene Name' column in the input header, aborting\n";
+}
+open(OUT, ">$out_file")
+  or die "Cannot open $out_file for writing: $!\n";
+print OUT "$header\tHuman Phenotypes (matching $orig_query)\tHuman Phenotypes (other)\n";
+
+# Check if any of the variants in the annotated HGVS table are in genes whose Human Phenotype Ontology annotations match the target phenotype list
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  my (@target_phenos, @other_phenos);
+  for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
+    next unless exists $gene2pheno_ids{$gene_name};
+    for my $id (@{$gene2pheno_ids{$gene_name}}){
+      next unless exists $pheno_id2name{$id};
+      if(exists $matched_pheno_ids{$id}){
+        push @target_phenos, $pheno_id2name{$id}."($matched_pheno_ids{$id})";
+      }
+      else{
+        push @other_phenos, $pheno_id2name{$id};
+      }
+    }
+  }
+  if(@target_phenos){
+    print OUT join("\t", @F, join("; ", @target_phenos), join("; ", @other_phenos)), "\n";
+  }
+  else{
+    print OUT join("\t", @F, "", ""), "\n";
+  }
+}
+close(OUT);
+close(HGVS);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_index_gamma	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,492 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use DB_File;
+use Parse::BooleanLogic;
+use Math::CDF qw(pgamma qgamma); # relevance score -> gamma p-value
+use PDL qw(pdl);
+use PDL::Stats::Distr qw(mme_gamma); # gamma dist parameter estimates
+use vars qw($parser %cached_sentences %sentence_index);
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+@ARGV == 5 or die "Usage: $0 [-q(uiet)] <index filename base> <db name> <hgvs_annotated.txt> <output.txt> <query>\nWhere query has the format \"this or that\", \"this and that\", etc.\n";
+
+my $signal_p = 0.95; # signal is top 5% of scores
+my $index_filename_base = shift @ARGV;
+my $db_name = shift @ARGV;
+my $hgvs_file = shift @ARGV;
+my $out_file = shift @ARGV;
+my $orig_query = shift @ARGV;
+
+$parser = new Parse::BooleanLogic(operators => ['and', 'or']);
+my $query_tree = $parser->as_array($orig_query, error_cb => sub {die "Could not parse query: @_\n"});
+# For simplicity, turn the tree into a base set of or statements (which means expanding "A and (B or C)" into "A and B or A and C") a.k.a. "sum of products/minterms"
+my @query_terms = flatten_query($query_tree);
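+# e.g. (illustrative) the query "(hypotonia or seizures) and ataxia" flattens to the two OR'd
+# terms "hypotonia and ataxia" and "seizures and ataxia", each evaluated independently against
+# the word index below.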
+
+my $df_index_filename = $index_filename_base."df_index";
+my %df_index;
+my $df_index_handle = tie %df_index, "DB_File", $df_index_filename, O_RDONLY, 0400, $DB_BTREE 
+			or die "Cannot open $df_index_filename: $!\n";
+my $gene_record_count = $df_index{"__DOC_COUNT__"};
+
+my $sentence_index_filename = $index_filename_base."sentence_index";
+my $sentence_index_handle = tie %sentence_index, "DB_File", $sentence_index_filename, O_RDONLY, 0400, $DB_HASH
+			or die "Cannot open $sentence_index_filename: $!\n";
+
+# Get the list of gene symbols we'll need
+open(HGVS, $hgvs_file)
+  or die "Cannot open $hgvs_file for reading: $!\n";
+my $header = <HGVS>;
+chomp $header;
+my @header_columns = split /\t/, $header;
+my ($gene_name_column, $chr_column, $from_column, $to_column);
+for(my $i = 0; $i <= $#header_columns; $i++){
+  if($header_columns[$i] eq "Gene Name"){
+    $gene_name_column = $i;
+  }
+  elsif($header_columns[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+  elsif($header_columns[$i] eq "DNA From"){
+    $from_column = $i;
+  }
+  elsif($header_columns[$i] eq "DNA To"){
+    $to_column = $i;
+  }
+}
+my $blank_query = not @query_terms;
+# Special case of empty query means print all info for variant ranges listed in the input HGVS file (assuming the DB was indexed to include chr:pos keys)
+if($blank_query){
+  #print STDERR "Running blank query\n";
+  if(not defined $chr_column){
+    die "Could not find 'Chr' column in the input header, aborting\n";
+  }
+  if(not defined $from_column){
+    die "Could not find 'DNA From' column in the input header, aborting\n";
+  }
+  if(not defined $to_column){
+    die "Could not find 'DNA To' column in the input header, aborting\n";
+  }
+  # Build the list of locations that will need to be searched in the index
+
+  open(OUT, ">$out_file")
+    or die "Cannot open $out_file for writing: $!\n";
+  print OUT $header, "\t$db_name Text Matches\n";
+
+  while(<HGVS>){
+    chomp;
+    my @F = split /\t/, $_, -1;
+    my @pos_data;
+    for my $pos ($F[$from_column]..$F[$to_column]){ # for each position in the range
+      my $pos_match_data = fetch_sentence("$F[$chr_column]:$pos", -1); # fetch all data for this position
+      push @pos_data, "*$F[$chr_column]:$pos* ".$pos_match_data if defined $pos_match_data;
+    }
+    print OUT join("\t", @F, join(" // ", @pos_data)),"\n";
+  }
+  close(OUT);
+  exit;
+}
+elsif(not defined $gene_name_column){
+  die "Could not find 'Gene Name' column in the input header, aborting\n"; 
+}
+#print STDERR "Query terms: " , scalar(@query_terms), "\n";
+my %gene_to_query_match_ranges;
+# Determine the set of genes that might match the query, based on the word index
+for my $query_term (@query_terms){
+  #print STDERR "Query term $query_term\n";
+  my %doc_hits; # how many needed words match the document? 
+  my $contiguous = 1; #by default multiword queries must be contiguous
+  # Unless it's an AND query
+  if($query_term =~ s/ and / /g){
+    $contiguous = 0;
+  }
+
+  my @words = split /\s+/, $query_term; # can be multi-word term like "mental retardation"
+  for(my $i = 0; $i <= $#words; $i++){
+    my $word = mc($words[$i]); # can be a stem word, like hypoton 
+    #print STDERR "Checking word $word...";
+    if($i == 0){
+      my $first_word_docs = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem
+      #print STDERR scalar(keys %$first_word_docs), " documents found\n";
+      for my $doc (keys %$first_word_docs){
+        $doc_hits{$doc} = $first_word_docs->{$doc}; # populate initial hit list that'll be whittled down in subsequent outer loops of multiword phrase members
+      }
+      next;
+    }
+    my @candidate_docs = keys %doc_hits;
+    last if not @candidate_docs; # short circuit searches guaranteed to fail
+
+    # each additional word must directly follow an existing match
+    my $word_doc_offsets_ref = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem
+    #print STDERR scalar(keys %$word_doc_offsets_ref), " documents found\n";
+    for my $doc (@candidate_docs){
+      my $num_matches = 0;
+      if(not exists $word_doc_offsets_ref->{$doc}){ # required word missing, eliminate doc from consideration
+        delete $doc_hits{$doc};
+        next;
+      }
+      # see if any of the instances of the additional words directly follow the last word we successfully matched
+      my $so_far_matches_ref = $doc_hits{$doc};
+      my $next_word_matches_ref = $word_doc_offsets_ref->{$doc};
+      for (my $j=0; $j <= $#{$so_far_matches_ref}; $j++){
+        my $existing_match_extended = 0;
+        next unless defined $so_far_matches_ref->[$j]->[2]; # every once in a while there is no article id parsed
+        for (my $k=0; $k <= $#{$next_word_matches_ref}; $k++){
+          # Same article?
+          next unless defined $next_word_matches_ref->[$k]->[2] and $next_word_matches_ref->[$k]->[2] eq $so_far_matches_ref->[$j]->[2];
+          if(not $contiguous){
+            $so_far_matches_ref->[$j]->[4] .= " AND ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too 
+            if(ref $so_far_matches_ref->[$j]->[3] ne "ARRAY"){ # match does not yet span multiple sentences
+              last if $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3]; # same sentence
+              $so_far_matches_ref->[$j]->[3] = [$so_far_matches_ref->[$j]->[3], $next_word_matches_ref->[$k]->[3]]; # change from scalar to array (of sentence numbers)
+            }
+            elsif(not grep {$_ eq $next_word_matches_ref->[$k]->[3]} @{$so_far_matches_ref->[$j]->[3]}){
+              push @{$so_far_matches_ref->[$j]->[3]}, $next_word_matches_ref->[$k]->[3]; # add to the spanning sentences list if not already there
+            }
+          }
+          # else contiguous word occurences required. 
+          # Same sentence?
+          next unless $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3];
+
+          my $space_between_match_words = $next_word_matches_ref->[$k]->[0] - $so_far_matches_ref->[$j]->[1];
+          if($space_between_match_words <= 2){
+            $existing_match_extended = 1;
+            $so_far_matches_ref->[$j]->[1] = $next_word_matches_ref->[$k]->[1]; # move the match cursor to include the new extending word
+            $so_far_matches_ref->[$j]->[4] .= " ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too
+            last;
+          }
+          elsif($space_between_match_words > 2){ # more than two typographical symbols between words, consider it non-contiguous
+            last;  # since the offsets are in order, any further k would only yield a larger spacing, so short-circuit
+          }
+        }
+        if(not $existing_match_extended){
+          splice(@$so_far_matches_ref, $j, 1);
+          $j--;
+        }
+        else{
+          $num_matches++;
+        }
+      }
+      if(not $num_matches){
+        delete $doc_hits{$doc};
+      }
+    }
+  }
+  # the only keys that get to this point should be those that match all terms
+  for my $doc (keys %doc_hits){
+    $gene_to_query_match_ranges{$doc} = [] if not exists $gene_to_query_match_ranges{$doc};
+    push @{$gene_to_query_match_ranges{$doc}}, [$query_term, @{$doc_hits{$doc}}];
+  }
+}
+
+my @matched_genes = keys %gene_to_query_match_ranges;
+#print STDERR "Found ", scalar(@matched_genes), "/$gene_record_count records in cached iHOP matching the query\n" unless $quiet;
+my %query_gene_counts;
+my %ntf;
+for my $gene (keys %gene_to_query_match_ranges){
+  my $max_doc_word_count = $df_index{"__DOC_MAX_WC_$gene"};
+  for my $count_record (@{$gene_to_query_match_ranges{$gene}}){
+    my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record;
+    # next if $query_term eq $gene; # slightly controversial? exclude references to genes from the score if the gene is the record being talked about (obviously it will be highly scored)
+    # allows us to find first degree interactors (i.e. points for "A interacts with B", in the record describing A) without creating crazy high score for doc describing gene B if B was in the original query without any phenotype query terms
+    $query_gene_counts{$query_term}++;
+
+    $ntf{$gene} = {} unless exists $ntf{$gene};
+    # atypical use of log in order to weigh heavy use of a common term less than occasional use of a rare term
+    $ntf{$gene}->{$query_term} = log($#query_term_match_ranges_in_this_gene+2)/log($max_doc_word_count+1); 
+  }
+  #print STDERR "Doc max word count is $max_doc_word_count for $gene, ntf keys = ", keys %{$ntf{$gene}}, "\n";
+}
+
+my %idf;
+for my $query_term (@query_terms){ # convert %idf values from documents-with-the-query-term-count to actual IDF
+  next unless exists $query_gene_counts{$query_term}; # query not in the document collection
+  $idf{$query_term} = log($gene_record_count/$query_gene_counts{$query_term});
+  #print STDERR "$query_term IDF is $idf{$query_term}\n";
+}
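+# e.g. (illustrative) a query term found in 10 of 20000 indexed gene records gets
+# IDF = log(20000/10) ≈ 7.6, while one found in half of the records gets only log(2) ≈ 0.7.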
+
+# Create a relevance score using a normalized term frequency - inverse document frequency summation
+my %relevance_score;
+my %matched_query_terms;
+for my $gene_symbol (keys %gene_to_query_match_ranges){
+  my $relevance_score = 0;
+  # Hmm, take average, sum or max of TF-IDFs? 
+  my $max_query_score = 0;
+  my @matched_query_terms;
+  my $query_score = 0;
+  for (my $i = 0; $i <= $#query_terms; $i++){
+    my $query_term = $query_terms[$i];
+    next unless exists $idf{$query_term};
+    next unless exists $ntf{$gene_symbol}->{$query_term};
+    $query_score += $ntf{$gene_symbol}->{$query_term}*$idf{$query_term};
+    push @matched_query_terms, $query_term;
+    $query_score *= 1-$i/scalar(@query_terms)/2 if scalar(@query_terms) > 2;# adjust the query score so the first terms are weighted more heavily if a bunch of terms are being searched
+    $max_query_score = $query_score if $query_score > $max_query_score;
+    $relevance_score += $query_score;
+  }
+  # this square root trick will not affect the score of a single term query, but will penalize a high total score that is made up of a bunch of low value individual term scores
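+  # For instance (made-up numbers): a total of 3.0 built from several small contributions whose
+  # maximum is 1.0 yields sqrt(3.0*1.0) ≈ 1.7, whereas a total of 3.0 dominated by a single
+  # contribution of 3.0 yields sqrt(3.0*3.0) = 3.0.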
+  $relevance_score{$gene_symbol} = sqrt($relevance_score*$max_query_score); 
+  #print STDERR "Relevance score for $gene_symbol is $relevance_score{$gene_symbol}\n";
+  $matched_query_terms{$gene_symbol} = \@matched_query_terms;
+}
+ 
+# Characterize relevance score as a gamma statistical distribution and convert to probability
+my $max_relevance_score = 0;
+for my $relevance_score (values %relevance_score){
+  $max_relevance_score = $relevance_score if $relevance_score > $max_relevance_score;
+}
+# Remove top end scores as signal, characterize the rest as noise.
+# Iterative estimation of gamma parameters, removing data in the range where the CDF exceeds the signal cutoff ($signal_p, i.e. CDF > 95%)
+my $noise_data = pdl(values %relevance_score);
+my ($shape, $scale) = $noise_data->mme_gamma();
+#print STDERR "Initial gamma distribution estimates: $shape, $scale (max observation $max_relevance_score)\n";
+my $signal_cutoff = qgamma($signal_p, $shape, 1/$scale);
+my @noise_data;
+for my $gene_symbol (keys %relevance_score){
+  my $score = $relevance_score{$gene_symbol};
+  push @noise_data, $score if $score < $signal_cutoff;
+}
+$noise_data = pdl(@noise_data);
+($shape, $scale) = $noise_data->mme_gamma();
+#print STDERR "Revised gamma distribution estimates (noise estimate at $signal_cutoff (CDF $signal_p)): $shape, $scale\n";
+# Convert scores to probabilities
+for my $gene_symbol (keys %relevance_score){
+  $relevance_score{$gene_symbol} = 1-pgamma($relevance_score{$gene_symbol}, $shape, 1/$scale);
+}
+
+#TODO: create summary stats for each query term so the user gets an idea of each's contribution?
+
+my %pubmed_matches;
+for my $gene_symbol (keys %gene_to_query_match_ranges){
+  my $query_match_ranges_ref = $gene_to_query_match_ranges{$gene_symbol};
+  my %matching_sentences;
+  for my $count_record (@$query_match_ranges_ref){
+    my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record;  
+    for my $occ_info (@query_term_match_ranges_in_this_gene){ 
+      my $id = $occ_info->[2];
+      my $sentence_number = $occ_info->[3];
+      my $query_match_word = $occ_info->[4];
+      # Fetch the preparsed sentence from the sentence index based on id and sentence number
+      # Will automatically *HIGHLIGHT* the query terms fetched in the sentence over the course of this script
+      if(ref $sentence_number eq "ARRAY"){ # match spans multiple sentences
+        for my $s (@$sentence_number){
+          for my $word (split / AND /, $query_match_word){
+            #print STDERR "Highlighting $word in $id #$s for query term $query_term (multisentence match)\n";
+            $matching_sentences{fetch_sentence_key($id, $s, $word)}++;
+          }
+        }
+      }
+      else{ # single sentence match
+        #print STDERR "Highlighting $query_match_word in $id #$sentence_number for query term $query_term\n";
+        $matching_sentences{fetch_sentence_key($id, $sentence_number, $query_match_word)}++;
+      }
+    }
+  }
+  $gene_symbol =~ s/_/\//; # forward slashes in gene names were replaced with underscores for disk caching purposes, so restore them here
+  if(keys %matching_sentences){
+    $pubmed_matches{$gene_symbol} = [] unless exists $pubmed_matches{$gene_symbol};
+    for my $new_match_ref (keys %matching_sentences){
+      push @{$pubmed_matches{$gene_symbol}}, $new_match_ref unless grep {$_ eq $new_match_ref} @{$pubmed_matches{$gene_symbol}}; # only put in new sentences, no need to dup
+    }
+  }
+}
+
+$orig_query =~ s/\s+/ /g; # normalize whitespace
+$orig_query =~ s/ and / and /gi; # lowercase any " AND " boolean operators
+my @orig_query_terms = split /\s+or\s+/, $orig_query;
+
+open(OUT, ">$out_file")
+  or die "Cannot open $out_file for writing: $!\n";
+my $new_header = $header;
+$new_header .= "\t$db_name p-value (log normalized TF-IDF score, gamma dist)\t$db_name Matching Terms ($orig_query)\t$db_name Text Matches";
+print OUT $new_header, "\n";
+
+# Check if any of the variants in the annotated HGVS table are in genes from the OMIM match list
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  # order the ids from highest number of sentence matches to lowest, from highest ranked term to least
+  my (%id2match_count, %id2sentences);
+  my @matched_genes;
+  my $relevance_score_final = 1;
+  my @matched_query_terms;
+  for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
+    next unless exists $pubmed_matches{$gene_name};
+    push @matched_genes, $gene_name;
+    for my $sentence_ref (@{$pubmed_matches{$gene_name}}){ # 0 == always fetch the title which is stored in sentence index 0
+      my $pubmed_record = fetch_sentence($sentence_ref);
+      $id2match_count{$pubmed_record->[0]}++; # key = id
+      if(not exists $id2sentences{$pubmed_record->[0]}){
+        $id2sentences{$pubmed_record->[0]} = {};
+        my $title_record = fetch_sentence(fetch_sentence_key($pubmed_record->[0], 0, ""));
+        next unless $title_record->[0];
+        print STDERR "No $index_filename_base sentence number for ", $title_record->[0], "\n" if not defined $title_record->[1];
+        print STDERR "No $index_filename_base sentence text for ", $title_record->[0], " sentence #", $title_record->[1], "\n" if not defined $title_record->[2];
+        $id2sentences{$title_record->[0]}->{$title_record->[2]} = $title_record->[1];
+      }
+      # Only print sentences that match a query term other than the gene name for the record key, if that gene name is part of the query
+      my $non_self_query_ref = 0;
+      while($pubmed_record->[2] =~ /\*(.+?)\*/g){
+        if($1 ne $gene_name){
+          $non_self_query_ref = 1;
+          last;
+        }
+      }
+      #print STDERR "rejected $gene_name self-only sentence ",$pubmed_record->[2],"\n" unless $non_self_query_ref;
+      next unless $non_self_query_ref;
+      $id2sentences{$pubmed_record->[0]}->{$pubmed_record->[2]} = $pubmed_record->[1]; # value = sentence order within pubmed text
+    }
+    $relevance_score_final *= $relevance_score{$gene_name};
+    push @matched_query_terms, @{$matched_query_terms{$gene_name}};
+  }
+
+  # If there were no matches for this line, the extra columns printed below are simply left empty
+  my @ordered_ids = sort {$id2match_count{$b} <=> $id2match_count{$a}} keys %id2match_count;
+
+  # print sentences in each id in order, with ellipsis if not contiguous
+  my %h;
+  print OUT join("\t", @F, ($relevance_score_final != 1 ? $relevance_score_final : ""), (@matched_query_terms ? join("; ", sort grep {not $h{$_}++} @matched_query_terms) : "")), "\t"; 
+  my $first_record = 1;
+  for my $id (@ordered_ids){
+    my $sentence2order = $id2sentences{$id};
+    my @ordered_sentences = sort {$sentence2order->{$a} <=> $sentence2order->{$b}} keys %$sentence2order;
+    next if scalar(@ordered_sentences) == 1; # due to self-gene only referencing filter above, we may have no matching sentences in a record.  Skip in this case.
+    if($first_record){
+      $first_record = 0;
+    }
+    else{
+      print OUT " // ";
+    }
+    my $title = shift(@ordered_sentences);
+    print OUT "$db_name $id",(defined $title ? " $title": ""),":"; # first sentence is always the record title
+    my $last_ordinal = 0;
+    for my $s (@ordered_sentences){
+      if($last_ordinal and $sentence2order->{$s} != $last_ordinal+1){
+        print OUT "..";
+      }
+      print OUT " ",$s;
+      $last_ordinal = $sentence2order->{$s};
+    }
+  }
+  print OUT "\n";
+}
+
+sub get_doc_offsets{
+  my ($db_handle, $word_stem) = @_;
+  my %doc2offsets;
+
+  my $is_uc = $word_stem =~ /^[A-Z0-9]+$/;
+  my $has_wildcard = $word_stem =~ s/\*$//;
+  my $value = 0;
+  my $cursor_key = $word_stem;
+  # retrieve the first key at or after the stem with the B-tree cursor, then walk forward while keys still start with the stem
+  for(my $status = $db_handle->seq($cursor_key, $value, R_CURSOR);
+      $status == 0;
+      $status = $db_handle->seq($cursor_key, $value, R_NEXT)){
+    if(CORE::index($cursor_key,$word_stem) != 0){
+      last; # outside the records that have the requested stem now
+    }
+    for my $record (split /\n/s, $value){
+      my ($doc, @occ_infos) = split /:/, $record;
+      $doc2offsets{$doc} = [] if not exists $doc2offsets{$doc};
+      for my $occ_info (@occ_infos){
+        my ($term_offset, $id, $sentence_number) = split /,/, $occ_info, -1;
+        # record start and end of word to facilitate partial key consecutive word matching algorithm used in this script
+        push @{$doc2offsets{$doc}}, [$term_offset, $term_offset+length($cursor_key), $id, $sentence_number, $cursor_key]; 
+      }
+    }
+    last if $is_uc and not $has_wildcard; # only exact matches for upper case words like gene names
+  }
+  return \%doc2offsets;
+}
+
+sub mc{
+  if($_[0] =~ /^[A-Z][a-z]+$/){
+    return lc($_[0]); # sentence case normalization to lower case for regular words
+  }
+  else{
+    return $_[0]; # as-is for gene names, etc
+  }
+}
+
+sub fetch_sentence_key{
+  my ($id, $sentence_number, $query_term) = @_;
+
+  $sentence_number = 0 if not defined $sentence_number;
+  return ":$sentence_number" if not $id;
+  my $key = "$id:$sentence_number";
+  if(not exists $cached_sentences{$key}){
+    my @sentences = split /\n/, $sentence_index{$id};
+    $cached_sentences{$key} = $sentences[$sentence_number];
+  }
+  $cached_sentences{$key} =~ s/\b\Q$query_term\E\b(?!\*)/"*".uc($query_term)."*"/ge unless $query_term eq "";
+  #print STDERR "Highlighted $query_term in $cached_sentences{$key}\n" if $query_term =~ /cirrhosis/;
+  return $key;
+}
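+# The returned key has the form "<record id>:<sentence number>" (e.g. "12345678:3", illustrative),
+# and as a side effect the cached sentence has each occurrence of the query term highlighted as *TERM*.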
+
+sub fetch_sentence{
+  if(@_ == 1){ # from cache
+    return [split(/:/, $_[0]), $cached_sentences{$_[0]}];
+  }
+  else{ # with two args, read directly from the sentence index: first arg is the record key, second is the sentence number (negative requests all sentences)
+    return undef if not exists $sentence_index{$_[0]};
+    my @sentences = split /\n/, $sentence_index{$_[0]};
+    if($_[1] < 0){ # all sentences request
+      return join("; ", @sentences);
+    }
+    return $sentences[$_[1]];
+  }
+}
+
+
+# Flatten a Boolean operator tree into a flat, single-depth list of "or" queries (AND terms are expanded by distribution)
+sub flatten_query{
+  my $tree = shift @_;
+  my @or_queries;
+
+  # Base case: the tree is just a leaf (denoted by a hash reference). Return value of the operand it represents.
+  if(ref $tree eq "HASH"){
+    return ($tree->{"operand"});
+  }
+
+  elsif(not ref $tree){
+    return $tree;
+  }
+
+  # Otherwise it's an operation array
+  if(ref $tree ne "ARRAY"){
+    die "Could not parse $tree, logic error in the query parser\n";
+  }
+ 
+  # Deal with AND first since it has higher precedence
+  for (my $i = 1; $i < $#{$tree}; $i++){
+    if($tree->[$i] eq "and"){
+      my @expanded_term;
+      my @t1_terms = flatten_query($tree->[$i-1]);
+      my @t2_terms = flatten_query($tree->[$i+1]);
+      #print STDERR "need to expand ", $tree->[$i-1], "(@t1_terms) AND ", $tree->[$i+1], "(@t2_terms)\n";
+      for my $term1 (@t1_terms){
+        for my $term2 (@t2_terms){
+          #print STDERR "Expanding to $term1 and $term2\n";
+          push @expanded_term, "$term1 and $term2";
+        }
+      }
+      splice(@$tree, $i-1, 3, @expanded_term);
+      $i--; # list has been shortened
+    }
+  }
+  # Should be only "OR" ops left
+  # Resolve any OR subtrees
+  for(my $i = 0; $i <= $#{$tree}; $i++){
+    next if $tree->[$i] eq "or";
+    push @or_queries, flatten_query($tree->[$i]); # otherwise recursive parse
+  }
+
+  return @or_queries;
+}
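+# Illustrative example (the leaf/operator tree below is hypothetical, mirroring the structure this
+# sub expects): a parse of "delay and (speech or motor)", i.e.
+#   [ {operand => "delay"}, "and", [ {operand => "speech"}, "or", {operand => "motor"} ] ]
+# flattens to the two OR'able queries ("delay and speech", "delay and motor").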
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_mouse_knockout_pipe	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,202 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+@ARGV == 4 or die "Usage: $0 [-q(uiet)] <MGI knockout pheno data dir> <hgvs_annotated.txt> <output.txt> <query>\n",
+                  "Where query has the format \"this or that\", \"this and that\", etc.\n",
+                  "Knockout files are available from ftp://ftp.informatics.jax.org/pub/reports/MPheno_OBO.ontology and ftp://ftp.informatics.jax.org/pub/reports/HMD_HumanPhenotype.rpt\n";
+
+my $mgi_dir = shift @ARGV;
+my $obo_file = "$mgi_dir/MPheno_OBO.ontology";
+my $human_mouse_file = "$mgi_dir/HMD_HumanPhenotype.rpt";
+my $geno_pheno_file = "$mgi_dir/MGI_PhenoGenoMP.rpt";
+my $hgvs_file = shift @ARGV;
+my $out_file = shift @ARGV;
+my $query = shift @ARGV;
+
+#$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals
+my %problematic_terms = (); # MP term IDs whose free-text definitions cause spurious matches (name-only matching is used for these); currently empty
+
+# convert the query to a regex
+my $orig_query = $query;
+my $and_query = 0;
+$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; # within parentheses, turn " or " into regex alternation
+if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ # "X and Y" requires both words, in either order
+  $and_query = 1;
+}
+$query =~ s/\s+or\s+/|/gi; # remaining top-level " or " becomes alternation
+$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # let all-lowercase words also match in Title Case; anything else (assumed to be a gene name) stays case sensitive
+#print STDERR "Query regex is $query\n" unless $quiet;
+
+open(OBO, $obo_file)
+  or die "Cannot open $obo_file for reading: $!\n";
+my %matched_pheno_ids;
+my %pheno_id2subtypes;
+my %pheno_id2name;
+my $record_count;
+$/ = "\n[Term]\n";
+<OBO>; # chuck header
+while(<OBO>){
+  next unless /^id:\s*(MP:\d+)/s;
+  my $id = $1;
+  next unless /\nname:\s*(.+?)\s*\n/s;
+  my $name = $1;
+  $pheno_id2name{$id} = $name;
+  $record_count++;
+  while(/\nis_a:\s*(MP:\d+)/g){
+    my $parent_id = $1;
+    $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id};
+    push @{$pheno_id2subtypes{$parent_id}}, $id;
+  }
+  if(exists $problematic_terms{$id}){
+    if($name =~ /\b($query)/o){ # for problematic entries, match the query against the term name only, not the full free-text record
+      my $match = $1;
+      $match =~ tr/\t\n/  /;
+      $match =~ s/ {2,}/ /g;
+      if(not exists $matched_pheno_ids{$id}){
+        $matched_pheno_ids{$id} = $match;
+      }
+      elsif($matched_pheno_ids{$id} !~ /$match/){
+        $matched_pheno_ids{$id} .= "; $match";
+      }
+    }
+  }
+  elsif(/\b($query)/o){
+    my $match = $1;
+    $match =~ tr/\t\n/  /;
+    $match =~ s/ {2,}/ /g;
+    if(not exists $matched_pheno_ids{$id}){
+      $matched_pheno_ids{$id} = $match;
+    }
+    elsif($matched_pheno_ids{$id} !~ /$match/){
+      $matched_pheno_ids{$id} .= "; $match";
+    }
+  }
+}
+close(OBO);
+#print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n";
+
+open(OUT, ">$out_file")
+  or die "Cannot open $out_file for writing: $!\n";
+
+# Implements term subsumption
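+# (each matched term is propagated to all of its is_a descendants: newly added subtype IDs are
+# appended to @matched_pheno_ids and processed in turn, so a general query term also selects every
+# more specific phenotype beneath it)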
+my @matched_pheno_ids = keys %matched_pheno_ids;
+for(my $i = 0; $i <= $#matched_pheno_ids; $i++){
+  my $pheno_id = $matched_pheno_ids[$i];
+  next unless exists $pheno_id2subtypes{$pheno_id};
+  for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){
+    if(not exists $matched_pheno_ids{$sub_type_id}){
+      $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id};
+      push @matched_pheno_ids, $sub_type_id;
+    }
+  }
+}
+
+$/="\n"; # record separator
+my %human2mouse;
+# example line:
+# WNT3A	  89780	Wnt3a	  MGI:98956	  MP:0003012 MP:0003631 ... MP:0010768
+open(HUMAN2MOUSE, $human_mouse_file)
+  or die "Cannot open $human_mouse_file for reading: $!\n";
+while(<HUMAN2MOUSE>){
+  my @F = split /\t/, $_;
+  $human2mouse{$F[0]} = $F[2];
+}
+close(HUMAN2MOUSE);
+
+my %gene2pheno_ids;
+# example line:
+# Rbpj<tm1Kyo>/Rbpj<tm1Kyo>	Rbpj<tm1Kyo>	involves: 129S2/SvPas * C57BL/6	MP:0001614	15466160	MGI:96522
+open(PHENO, $geno_pheno_file)
+  or die "Cannot open $geno_pheno_file for reading: $!\n";
+while(<PHENO>){
+  chomp;
+  my @F = split /\t/, $_;
+  next unless $#F > 2; # does it have the phenotype id field?
+  my $knockout = $F[0];
+  next if $knockout =~ /,/; # ignore double knockouts etc.
+  $knockout =~ s/^(\S+?)<.*/$1/; # keep only the gene symbol portion of the allele description (e.g. "Rbpj<tm1Kyo>" -> "Rbpj")
+  my $pheno_id = $F[3];
+  $gene2pheno_ids{$knockout} = [] unless exists $gene2pheno_ids{$knockout};
+  push @{$gene2pheno_ids{$knockout}}, [$pheno_id,$F[4]];
+}
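+# At this point %gene2pheno_ids maps a mouse gene symbol to its [MP id, PubMed id] pairs; for the
+# example line above that would include something like $gene2pheno_ids{"Rbpj"} = [["MP:0001614", "15466160"], ...].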
+
+# remove genes if they don't have a matching phenotype
+for my $gene (keys %gene2pheno_ids){
+  my $keep = 0;
+  for my $pheno_id (@{$gene2pheno_ids{$gene}}){
+    if(exists $matched_pheno_ids{$pheno_id->[0]}){
+      $keep = 1;
+      last;
+    }
+  }
+  delete $gene2pheno_ids{$gene} unless $keep;
+}
+#print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with knockout phenotype ontology terms matching the query\n" unless $quiet;
+
+$/ = "\n"; # one line at, a time from the HGVS file please!
+open(HGVS, $hgvs_file)
+  or die "Cannot open $hgvs_file for reading: $!\n";
+my $header = <HGVS>;
+chomp $header;
+my @header_columns = split /\t/, $header;
+my $gene_name_column;
+for(my $i = 0; $i <= $#header_columns; $i++){
+  if($header_columns[$i] eq "Gene Name"){
+    $gene_name_column = $i;
+  }
+}
+if(not defined $gene_name_column){
+  die "Could not find 'Gene Name' column in the input header, aborting\n";
+}
+print OUT "$header\tMouse Knockout Phenotypes (matching $orig_query)\tMouse Phenotypes (other)\n";
+
+# Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  my (%target_phenos, %other_phenos);
+  for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
+    next unless exists $human2mouse{$gene_name};
+    next unless exists $gene2pheno_ids{$human2mouse{$gene_name}};
+    for my $pheno_id (@{$gene2pheno_ids{$human2mouse{$gene_name}}}){
+      my ($id, $pmid) = @$pheno_id;
+      if(exists $matched_pheno_ids{$id}){
+        $target_phenos{$pmid} = [] unless exists $target_phenos{$pmid};
+        push @{$target_phenos{$pmid}}, $pheno_id2name{$id}."($matched_pheno_ids{$id})";
+      }
+      else{
+        $other_phenos{$pmid} = [] unless exists $other_phenos{$pmid};
+        push @{$other_phenos{$pmid}}, $pheno_id2name{$id};
+      }
+    }
+  }
+  if(%target_phenos){
+    print OUT join("\t", @F);
+    print OUT "\t";
+    my $count = 0;
+    for my $pmid (keys %target_phenos){
+       print OUT " // " if $count++;
+       print OUT "PubMed $pmid: ", join("; ", @{$target_phenos{$pmid}});
+    }
+    print OUT "\t";
+    $count = 0;
+    for my $pmid (keys %other_phenos){
+       print OUT " // " if $count++;
+       print OUT "PubMed $pmid: ", join("; ", @{$other_phenos{$pmid}});
+    }
+    print OUT "\n";
+  }
+  else{
+    print OUT join("\t", @F, "", ""), "\n";
+  }
+}
+close(HGVS);
+close(OUT);
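+# Illustrative invocation (paths and query below are hypothetical):
+#   ./filter_by_mouse_knockout_pipe /path/to/mgi_reports annotated.hgvs.txt knockout_filtered.txt "tremor or ataxia"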
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_susceptibility_loci_pipe	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,174 @@
+#!/usr/bin/env perl
+
+# Reports SNPs associated with susceptibility loci determined by GWAS
+use strict;
+use warnings;
+
+@ARGV == 4 or die "Usage: $0 <GWAS db.txt> <input.annotated.hgvs.txt> <output.txt> <pheno query>\n";
+my $gwas_file = shift @ARGV;
+my $hgvs_file = shift @ARGV;
+my $outfile = shift @ARGV;
+my $pheno_query = shift @ARGV;
+
+$pheno_query =~ s/^\s+|\s+$//g; # trim leading and trailing whitespace
+my $s;
+$pheno_query =~ s/\(.*?\)/$s=$&; $s=~s#\s+or\s+#|#g; $s/eg;
+my @or_terms = split /\s+or\s+/i, $pheno_query;
+for my $term (@or_terms){
+  $term = and_query($term); # recursively convert Boolean AND to equivalent regex
+}
+
+# Date Added to Catalog	PUBMEDID	First Author	Date	Journal	Link	Study	Disease/Trait	Initial Sample Size	Replication Sample Size	Region	Chr_id	Chr_pos	Reported Gene(s)	Mapped_gene	Upstream_gene_id	Downstream_gene_id	Snp_gene_ids	Upstream_gene_distance	Downstream_gene_distance	Strongest SNP-Risk Allele	SNPs	Merged	Snp_id_current	Context	Intergenic	Risk Allele Frequency	p-Value	Pvalue_mlog	p-Value (text)	OR or beta	95% CI (text)	Platform [SNPs passing QC]	CNV
+open(GWAS, $gwas_file)
+  or die "Cannot open GWAS file for reading: $!\n";
+my $header = <GWAS>;
+chomp $header;
+my @columns = split /\t/, $header;
+my $trait_column_index = -1;
+my $study_column_index = -1;
+my $chr_column_index = -1;
+my $pos_column_index = -1;
+my $allele_column_index = -1;
+my $context_column_index = -1;
+my $odds_column_index = -1;
+my $pubmed_column_index = -1;
+for(my $i = 0; $i <= $#columns; $i++){ # <= so the last header column is checked too
+  if($columns[$i] eq "Disease/Trait"){
+    $trait_column_index = $i;
+  }
+  elsif($columns[$i] eq "Study"){
+    $study_column_index = $i;
+  }
+  elsif($columns[$i] eq "Chr_id"){
+    $chr_column_index = $i;
+  }
+  elsif($columns[$i] eq "Chr_pos"){
+    $pos_column_index = $i;
+  }
+  elsif($columns[$i] eq "Strongest SNP-Risk Allele"){
+    $allele_column_index = $i;
+  }
+  elsif($columns[$i] eq "p-Value (text)"){
+    $context_column_index = $i;
+  }
+  elsif($columns[$i] eq "OR or beta"){
+    $odds_column_index = $i;
+  }
+  elsif($columns[$i] eq "PUBMEDID"){
+    $pubmed_column_index = $i;
+  }
+} 
+if($trait_column_index == -1){
+  die "Could not find Trait column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($study_column_index == -1){
+  die "Could not find Study column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($chr_column_index == -1){
+  die "Could not find chromosome name column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($pos_column_index == -1){
+  die "Could not find chromosomal position column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($allele_column_index == -1){
+  die "Could not find risk allele column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($context_column_index == -1){
+  die "Could not find p-value context column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($odds_column_index == -1){
+  die "Could not find odds ratio column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($pubmed_column_index == -1){
+  die "Could not find PubMed ID column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+
+my %snp2desc;
+while(<GWAS>){
+  chomp;
+  next unless /\S/;
+  my @F = split /\t/, $_;
+  my $chr = $F[$chr_column_index];
+  $chr =~ s/^chr//; # drop prefix if present
+  my $strongest_risk_allele = $F[$allele_column_index];
+  $strongest_risk_allele =~ s/^.*-//; # initially looks like rs4345897-G, strip to just letter
+  $snp2desc{$chr.":".$F[$pos_column_index].":".$strongest_risk_allele} = [$F[$trait_column_index], $F[$context_column_index], $F[$study_column_index], $F[$odds_column_index], $F[$pubmed_column_index]];
+}
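+# %snp2desc keys have the form "<chr>:<pos>:<risk allele>" with any leading "chr" stripped,
+# e.g. "12:1023400:G" (illustrative coordinates); values hold [trait, p-value context, study, OR/beta, PubMed ID].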
+close(GWAS);
+
+open(MATCHOUT, ">$outfile")
+  or die "Cannot open $outfile for writing: $!\n";
+
+open(HGVS, $hgvs_file)
+  or die "Cannot open $hgvs_file for reading: $!\n";
+$header = <HGVS>;
+chomp $header;
+my @header_columns = split /\t/, $header;
+my ($chr_column, $obs_column, $pos_column);
+for(my $i = 0; $i <= $#header_columns; $i++){
+  if($header_columns[$i] eq "DNA From"){
+    $pos_column = $i;
+  }
+  elsif($header_columns[$i] eq "Obs base"){
+    $obs_column = $i;
+  }
+  elsif($header_columns[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+}
+if(not defined $chr_column){
+  die "Could not find 'Chr' column in the input header, aborting\n";
+}
+if(not defined $pos_column){
+  die "Could not find 'DNA From' column in the input header, aborting\n";
+}
+if(not defined $obs_column){
+  die "Could not find 'Obs base' column in the input header, aborting\n";
+}
+
+print MATCHOUT $header, "\tPhenotype GWAS Catalog text matches (matching $pheno_query)\tGWAS odds ratio\tGWAS susceptibility description for this SNP\n";
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  my $allele = $F[$obs_column];
+  $allele =~ s(/.*$)(); # ignore ref allele in het calls
+  my $chr = $F[$chr_column];
+  $chr =~ s/^chr//; # remove chr name prefix if present
+  my $key = $chr.":".$F[$pos_column].":".$allele;
+  #print STDERR "$key\n";
+  if(not exists $snp2desc{$key}){
+    print MATCHOUT join("\t", @F, "", "", ""), "\n";
+    next;
+  }
+  my @matches;
+  for my $term (@or_terms){
+    next if grep {$_ eq $term} @matches;
+    #print STDERR "Checking match for term $term...\n";
+    if($snp2desc{$key}->[0] =~ /\b$term/i or # trait
+       $snp2desc{$key}->[1] =~ /\b$term/i or # p-value context
+       $snp2desc{$key}->[2] =~ /\b$term/i){  # study name
+      push @matches, $term;
+      last;
+    }
+  }
+  chomp $F[$#F];
+  # print all GWAS, whether they match query or not
+  print MATCHOUT join("\t", @F, join("; ", sort @matches), $snp2desc{$key}->[3], "PubMedID ".$snp2desc{$key}->[4].": trait '".$snp2desc{$key}->[0]."'".($snp2desc{$key}->[1] =~ /\S/ ? " in context of '".$snp2desc{$key}->[1]."'" : "").". Study: ".$snp2desc{$key}->[2]), "\n";
+}
+close(HGVS);
+close(MATCHOUT);
+
+sub and_query{
+  my ($query) = @_;
+  if($query =~ /^(.+)\s+and\s+(.+)$/){
+    my $t1 = $1;
+    my $t2 = $2;
+    my $term1 = and_query($t1);
+    my $term2 = and_query($t2);
+    return "($term1.*$term2|$term2.*$term1)";
+  }
+  else{
+    return $query; # base case: single term
+  }
+}
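+# e.g. and_query("tremor and ataxia") yields "(tremor.*ataxia|ataxia.*tremor)", so both word orders
+# are accepted when the term is later applied case-insensitively.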
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/associate_phenotypes.loc	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,1 @@
+dbs_dir	/export/achri_galaxy/dbs/