Mercurial > repos > yusuf > associate_phenotypes
diff associate_variant_phenotypes @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/associate_variant_phenotypes Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,366 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Math::CDF qw(pchisq); # chi square for calculating Fisher's method of combining p-values +use File::Basename; +my $dirname = dirname(__FILE__); + +# configuration file stuff +my %config; +my $tool_data = shift @ARGV; +if(not -e "$tool_data/associate_phenotypes.loc" ){ + system("cp $dirname/tool-data/associate_phenotypes.loc $tool_data/associate_phenotypes.loc") >> 8 and die "Could not create config file: $!\n"; +} +open CONFIG, '<', "$tool_data/associate_phenotypes.loc"; +while(<CONFIG>){ + next if $_ =~ /^#/; + (my $key, my $value) = split(/\s+/,$_); + $config{$key} = $value; +} +close CONFIG; +my $dbs_dir = $config{"dbs_dir"}; + +@ARGV == 6 or die "Usage: $0 <outfiles prefix> <annotated input> <preselected gene list of interest> <human literature terms> <mouse knockout terms> <gene ontology terms>\n"; + +my $final_confident_outfiles_prefix = shift @ARGV; +my $confident_input = shift @ARGV; +my $preselected_genes_file = shift @ARGV; +my $human_lit_file = shift @ARGV; +my $mouse_knockout_file = shift @ARGV; +my $gene_ontology_file = shift @ARGV; + +my @genes; +if(-e $preselected_genes_file){ + open(GENES, $preselected_genes_file) + or die "Cannot open $preselected_genes_file for reading: $!\n"; + while(<GENES>){ + chomp; + next if /^#/ or not /\S/; + s/^\s+|\s+$//g; # get rid of trailing or leading spaces + push @genes, $_; + } + close(GENES); +} +else{ + @genes = split / or /, $preselected_genes_file; +} +my %genes; +for my $g (@genes){ + $genes{lc($g)} = 1; +} + +my @human_lit_query; +if(-e $human_lit_file){ + open(HUMAN, $human_lit_file) + or die "Cannot open $human_lit_file for reading: $!\n"; + while(<HUMAN>){ + chomp; + next if /^#/ or not /\S/; + s/^\s+|\s+$//g; # get rid of trailing or leading spaces + s/\s+/ /g; # normalize any other whitespace + # to do: stem query terms? exclude stop words? + push @human_lit_query, $_; + } + close(HUMAN); +} +else{ + @human_lit_query = split / or /, $human_lit_file; +} +my $human_lit_query = join(" or ", @human_lit_query, @genes); + +my @mouse_knockout_query; +if(-e $mouse_knockout_file){ + open(MOUSE, $mouse_knockout_file) + or die "Cannot open $mouse_knockout_file for reading: $!\n"; + while(<MOUSE>){ + chomp; + next if /^#/ or not /\S/; + s/^\s+|\s+$//g; # get rid of trailing or leading spaces + s/\s+/ /g; # normalize any other whitespace + # to do: stem query terms? exclude stop words? + push @mouse_knockout_query, $_; + } + close(MOUSE); +} +else{ + @mouse_knockout_query = split / or /, $mouse_knockout_file; +} +my $mouse_knockout_query = join(" or ", @mouse_knockout_query); + +my @go_query; +if(-e $gene_ontology_file){ + open(GO, $gene_ontology_file) + or die "Cannot open $gene_ontology_file for reading: $!\n"; + while(<GO>){ + chomp; + next if /^#/ or not /\S/; + s/^\s+|\s+$//g; # get rid of trailing or leading spaces + s/\s+/ /g; # normalize any other whitespace + # to do: stem query terms? exclude stop words? + push @go_query, $_; + } +} +else{ + @go_query = split / or /, $gene_ontology_file; +} +my $go_query = join(" or ", @go_query); +close(GO); + +my @cmds; + +if($human_lit_query){ + # do pubmed first because it has to potentially download references from the internet, so better to do this with just a couple concurrent rather than a lot, which would stress the remote iHOP server + push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/IHOP/ PubMed $confident_input - '$human_lit_query'"; + push @cmds, "$dirname/filter_by_susceptibility_loci_pipe $dbs_dir/GWAS/gwascatalog.txt - - '$human_lit_query'"; + push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/OMIM/omim.txt. OMIM - - '$human_lit_query'"; + push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/ClinVar/ClinVarFullRelease.xml. ClinVar - - '$human_lit_query'"; + push @cmds, "$dirname/filter_by_human_phenotype_ontology_pipe $dbs_dir/HPO - - '$human_lit_query'"; +} + +if($mouse_knockout_query or $human_lit_query){ + if($mouse_knockout_query){ + if($human_lit_query){ + $mouse_knockout_query .= " or $human_lit_query"; + } + } + else{ + $mouse_knockout_query = $human_lit_query; + } + if($human_lit_query){ + push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 - - '$mouse_knockout_query'"; + } + else{ + push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 $confident_input - '$mouse_knockout_query'" + } +} + +if($go_query or $human_lit_query){ + if($go_query){ + if(@human_lit_query){ + $go_query .= " or ".join(" or ", @human_lit_query); + } + } + else{ + $go_query = join(" or ", @human_lit_query); + } + if($mouse_knockout_query or $human_lit_query){ + push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA - - '$go_query'"; + } + else{ + push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA $confident_input - '$go_query'"; + } +} + +&print_final_output($final_confident_outfiles_prefix, @cmds); + +# Use Fisher's Method to combine p-values from various phenotype sources into a single score for ranking +# This is an okay method to use (rather than something more complicated like Brown's method), because our +# experience with real queries is that there is surprsingly little correlation (Spearman's rank or Kendall's tau) between +# the p-values for different sources (primary or curated secondary). +sub print_final_output{ + my ($final_output_prefix, @cmds) = @_; + + my $cmd = join("|", @cmds). "|"; # pipe output so we read the stream in the handle below + open(ORIG, $cmd) + or die "Could not run '$cmd': $!\n"; + my $header = <ORIG>; + chomp $header; + my @orig_header = split /\t/, $header; + my ($chr_column, $pos_column, $gene_column, $hgvs_aa_column, $maf_column, $srcs_column, @pvalue_columns, @pheno_match_columns); + for(my $i = 0; $i <= $#orig_header; $i++){ + if($orig_header[$i] eq "Chr"){ + $chr_column = $i; + } + elsif($orig_header[$i] eq "DNA From"){ + $pos_column = $i; + } + elsif($orig_header[$i] eq "Gene Name"){ + $gene_column = $i; + } + elsif($orig_header[$i] eq "Protein HGVS"){ + $hgvs_aa_column = $i; + } + elsif($orig_header[$i] eq "Pop. freq."){ + $maf_column = $i; + } + elsif($orig_header[$i] eq "Sources"){ + $srcs_column = $i; + } + elsif($orig_header[$i] =~ /p-value/){ # columns of pheno association with a stat + push @pvalue_columns, $i; + } + elsif($orig_header[$i] =~ /\(matching/){ + push @pheno_match_columns, $i; + } + } + if(not defined $chr_column){ + die "Could not find the 'Chr' column in the header, aborting ($header)\n"; + } + elsif(not defined $pos_column){ + die "Could not find the 'DNA From' column in the header, aborting ($header)\n"; + } + elsif(not defined $hgvs_aa_column){ + die "Could not find the 'Protein HGVS' column in the header, aborting ($header)\n"; + } + elsif(not defined $maf_column){ + die "Could not find the 'Pop. freq.' column in the header, aborting ($header)\n"; + } + # Sources is optional + + # all other headers from other output files generated will be appended to the original ones + my @final_header = (@orig_header, "Combined phenotype relevance P-value"); + if(@genes){ + push @final_header, "Targeted Gene?"; + } + my %lines; # chr -> position -> [dataline1, dataline2, ...] + my %source; # no. lines per variant source + while(<ORIG>){ + chomp; + next unless /\S/; # ignore blank lines + my @F = split /\t/, $_, -1; # keep trailing blank fields + my $chr = $F[$chr_column]; + $chr =~ s/^chr//; # helps for sorting purposes + my $pos = $F[$pos_column]; + $pos =~ s/-.*$//; # CNVs have a range + $lines{$chr} = {} if not exists $lines{$F[$chr_column]}; + $lines{$chr}->{$pos} = [] if not exists $lines{$F[$chr_column]}->{$pos}; + my @final_dataline = @F; # fields that are the same in all files since they were in the original + for(my $i = 0; $i < $#final_dataline; $i++){ + $final_dataline[$i] = "" if not defined $final_dataline[$i]; + } + # Create aggregate phenotype relevance score using Fisher's method + # A combined p-value for k p-values (P1...Pk) is calculated using a chi-square value (with 2k degrees of freedom) derived by -2*sum(ln(Pi), i=1..k) + my $chi_sq = 0; + my $num_pvalues = 0; + my $last_pvalue = 1; + for my $pvalue_index (@pvalue_columns){ + next if $F[$pvalue_index] eq ""; + $last_pvalue = $F[$pvalue_index]; + $F[$pvalue_index] = 0.00001 if not $F[$pvalue_index]; # avoids log(0) issue + $num_pvalues++; + $chi_sq += log($F[$pvalue_index]); + } + my $fisher_pvalue = 1; + if($num_pvalues > 1){ + $chi_sq *= -2; + my $p = pchisq($chi_sq, 2*scalar(@pvalue_columns)); + if(not defined $p){ + print STDERR "($num_pvalues) No X2 test value for $chi_sq ("; + for my $pvalue_index (@pvalue_columns){ + if($F[$pvalue_index] eq ""){print STDERR "NA "} + else{print STDERR $F[$pvalue_index], " "} + } + print STDERR ")\n$_\n"; + } + $fisher_pvalue = 1-$p; + } + elsif($num_pvalues == 1){ + $fisher_pvalue = $last_pvalue; # no multiple testing correction + } + else{ + for my $match_column (@pheno_match_columns){ + next if $F[$match_column] eq ""; # give a token amount of positive score to ontology term matches + for my $match (split /\/\/|;/, $F[$match_column]){ + last if $fisher_pvalue <= 0.001; # only make better if not realy close to zero anyway + $fisher_pvalue -= 0.001; + } + } + } + push @final_dataline, abs($fisher_pvalue); + if(@genes){ + push @final_dataline, (grep({exists $genes{$_}} split(/; /, lc($F[$gene_column]))) ? "yes" : "no"); + } + push @{$lines{$chr}->{$pos}}, \@final_dataline; + + next unless defined $srcs_column and $F[$srcs_column] =~ /(?:^|\+| )(\S+?)(?=;|$)/; + $source{$1}++; + } + + my @outfiles = ("$final_output_prefix.novel.hgvs.txt", "$final_output_prefix.very_rare.hgvs.txt", "$final_output_prefix.rare.hgvs.txt", "$final_output_prefix.common.hgvs.txt"); + open(OUT_NOVEL, ">$outfiles[0]") + or die "Cannot open $outfiles[0] for writing: $!\n"; + open(OUT_VERY_RARE, ">$outfiles[1]") + or die "Cannot open $outfiles[1] for writing: $!\n"; + open(OUT_RARE, ">$outfiles[2]") + or die "Cannot open $outfiles[2] for writing: $!\n"; + open(OUT_COMMON, ">$outfiles[3]") + or die "Cannot open $outfiles[3] for writing: $!\n"; + print OUT_NOVEL join("\t", @final_header), "\n"; + print OUT_VERY_RARE join("\t", @final_header), "\n"; + print OUT_RARE join("\t", @final_header), "\n"; + print OUT_COMMON join("\t", @final_header), "\n"; + my @sorted_chrs = sort {$a =~ /^\d+$/ and $b =~ /^\d+$/ and $a <=> $b or $a cmp $b} keys %lines; + for my $chr (@sorted_chrs){ + for my $pos (sort {$a <=> $b} keys %{$lines{$chr}}){ + my $datalines_ref = $lines{$chr}->{$pos}; + # The following sorting puts all protein coding effect for a variant before non-coding ones + my @sorted_dataline_refs = sort {$a ne "NA" and $b ne "NA" and $a->[$hgvs_aa_column] cmp $a->[$hgvs_aa_column] or $b cmp $a} @$datalines_ref; + for my $dataline_ref (@sorted_dataline_refs){ + next unless defined $dataline_ref; + my $maf = $dataline_ref->[$maf_column]; + my $tot_line_length = 0; + for(my $i = 0; $i < $#{$dataline_ref}; $i++){ + if(not defined $dataline_ref->[$i]){ + $dataline_ref->[$i] = ""; # so we don't get crappy warnings of undefined values + } + else{ + $tot_line_length += length($dataline_ref->[$i]); + } + $tot_line_length++; # the tab + } + if($tot_line_length > 32000){ # Excel limit of 32767 characters in a cell. Also, undocumented bug that entire import line cannot exceeed this length. + # If we don't truncate, the rest of the line (including entire contents of cells to the right) are unceremoniously dumped. + # Note that personal experience has shown that the limit is actually a bit below this, so rounding down to the nearest 1000 for safety + my $overage = $tot_line_length - 32000; + my $sum_of_large_cells = 0; + my $num_large_cells = 0; + for(my $i = 0; $i <= $#{$dataline_ref}; $i++){ # remove contents from the largest cells + if(length($dataline_ref->[$i]) > 3200){ + $sum_of_large_cells += length($dataline_ref->[$i]); # all cells that are at least 10% of the max + $num_large_cells++; + } + } + my $cell_max_alloc = int((32000-($tot_line_length-$sum_of_large_cells))/$num_large_cells); + for(my $i = 0; $i <= $#{$dataline_ref}; $i++){ # truncate the bigger than average ones + if(length($dataline_ref->[$i]) > $cell_max_alloc){ + $dataline_ref->[$i] = substr($dataline_ref->[$i], 0, $cell_max_alloc-37)."[...remainder truncated for length]"; + } + } + } + if($maf eq "NA"){ + print OUT_NOVEL join("\t", @$dataline_ref), "\n"; + } + if($maf eq "NA" or $maf < 0.005){ + print OUT_VERY_RARE join("\t", @$dataline_ref), "\n"; + } + if($maf eq "NA" or $maf < 0.05){ + print OUT_RARE join("\t", @$dataline_ref), "\n"; + } + print OUT_COMMON join("\t", @$dataline_ref), "\n"; + } + } + } + close(OUT_NOVEL); + close(OUT_VERY_RARE); + close(OUT_RARE); + close(OUT_COMMON); + + # Print per-source tables (e.g. for each patient in a cohort) + for my $src (keys %source){ + for my $outfile (@outfiles){ + open(IN, $outfile) + or die "cannot open $outfile for reading: $!\n"; + my $src_outfile = $outfile; + $src_outfile =~ s/$final_output_prefix/$final_output_prefix-$src/; + open(OUT, ">$src_outfile") + or die "Cannot open $src_outfile for writing: $!\n"; + print OUT scalar(<IN>); # header line + while(<IN>){ + print OUT $_ if /(?:^|\+| )($src)(?=;|$)/; + } + close(OUT); + } + } +} +