Mercurial > repos > yusuf > associate_phenotypes
view filter_by_gene_ontology_pipe @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; my $quiet = 0; if(@ARGV and $ARGV[0] =~ /^-q/){ $quiet = 1; shift @ARGV; } @ARGV == 4 or die "Usage: $0 [-q(uiet)] <GO data dir> <hgvs_annotated.txt> <output.txt> <query>\n", "Where query has the format \"this or that\", \"this and that\", etc.\n", "GOA and OBO files must be found in the GO data dir\n"; my $go_dir = shift @ARGV; my $obo_file = "$go_dir/gene_ontology.1_2.obo"; my $hgvs_file = shift @ARGV; my $out_file = shift @ARGV; my $query = shift @ARGV; # Terms with meronyms and hyponyms in their free text descriptions, causing overgeneralization problems my %problematic_terms = ("GO:0033647" => "host intracellular organelle", "GO:0006996" => "organelle organization", "GO:0043226" => "organelle", "GO:0043227" => "membrane-bounded organelle", "GO:0043229" => "intracellular organelle", "GO:0043231" => "intracellular membrane-bounded organelle", "GO:0044384" => "host cell outer membrane", "GO:0044422" => "organelle part", "GO:0044446" => "intracellular organelle part", "GO:0045202" => "synapse", "GO:0033648" => "host intracellular membrane-bounded organelle"); #$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals # convert the query to a regex my $orig_query = $query; my $and_query = 0; $query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ $and_query = 1; } $query =~ s/\s+or\s+/|/gi; $query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name #print STDERR "Query regex for GO is $query\n" unless $quiet; open(OBO, $obo_file) or die "Cannot open $obo_file for reading: $!\n"; my %matched_go_ids; my %go_id2subtypes; my %go_id2name; my $record_count; $/ = "\n[Term]\n"; <OBO>; # chuck header while(<OBO>){ next unless /^id:\s*(GO:\d+)/s; my $id = $1; next unless /\nname:\s*(.+?)\s*\n/s; my $name = $1; $go_id2name{$id} = $name; $record_count++; while(/\nis_a:\s*(GO:\d+)/g){ my $parent_id = $1; $go_id2subtypes{$parent_id} = [] unless exists $go_id2subtypes{$parent_id}; push @{$go_id2subtypes{$parent_id}}, $id; } if(exists $problematic_terms{$id}){ if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text my $match = $1; $match =~ tr/\t\n/ /; $match =~ s/ {2,}/ /g; if(not exists $matched_go_ids{$id}){ $matched_go_ids{$id} = $match; } elsif($matched_go_ids{$id} !~ /$match/){ $matched_go_ids{$id} .= "; $match"; } } } elsif(/\b($query)/o){ my $match = $1; $match =~ tr/\t\n/ /; $match =~ s/ {2,}/ /g; if(not exists $matched_go_ids{$id}){ $matched_go_ids{$id} = $match; } elsif($matched_go_ids{$id} !~ /$match/){ $matched_go_ids{$id} .= "; $match"; } #print STDERR "Found match $match for $_\n"; } } close(OBO); #print STDERR "Found ", scalar(keys %matched_go_ids), "/$record_count go ontology terms matching the query\n"; open(OUT, ">$out_file") or die "Cannot open $out_file for writing: $!\n"; # Implements term subsumption my @matched_go_ids = keys %matched_go_ids; for(my $i = 0; $i <= $#matched_go_ids; $i++){ my $go_id = $matched_go_ids[$i]; next unless exists $go_id2subtypes{$go_id}; for my $sub_type_id (@{$go_id2subtypes{$go_id}}){ if(not exists $matched_go_ids{$sub_type_id}){ $matched_go_ids{$sub_type_id} = $matched_go_ids{$go_id}; push @matched_go_ids, $sub_type_id; } } } $/="\n"; # record separator my %gene2go_ids; opendir(GOADIR, $go_dir) or die "Cannot read directory $go_dir: $!\n"; while($_ = readdir(GOADIR)){ next if /^\./; # hidden next unless /\.goa$/; # not a goa formatted file my $goa_file = $_; open(GOA, "$go_dir/$goa_file") or die "Cannot open $go_dir/$goa_file for reading: $!\n"; #print STDERR "Processing file $goa_file\n"; # example line: # UniProtKB A0ASJ9 GO:0001664 GO_REF:0000033 ISS PANTHER:PTN000026392 F G protein alpha subunit AgGq6 A0ASJ9_ANOGA protein taxon:7165 20110125 RefGenome while(<GOA>){ next if /^!/; # comment chomp; my @F = split /\t/, $_; next unless $F[2] and $#F > 3; # does it have the gene name and go id fields? my $genename = uc($F[2]); # standardize gene names to upper case $genename =~ s/-//g; my $go_id = $F[4]; $gene2go_ids{$genename} = [] unless exists $gene2go_ids{$genename}; push @{$gene2go_ids{$genename}}, $go_id; } close(GOA); } close(GOADIR); #print STDERR "Found ", scalar(keys %gene2go_ids), " total genes\n"; # remove genes if they don't have a matching go term for my $genename (keys %gene2go_ids){ my $keep = 0; for my $go_id (@{$gene2go_ids{$genename}}){ if(exists $matched_go_ids{$go_id}){ $keep = 1; last; } } delete $gene2go_ids{$genename} unless $keep; } #print STDERR "Found ", scalar(keys %gene2go_ids), " genes with gene ontology terms matching the query\n" unless $quiet; $/ = "\n"; # one line at, a time from the HGVS file please! open(HGVS, $hgvs_file) or die "Cannot open $hgvs_file for reading: $!\n"; my $header = <HGVS>; chomp $header; my @header_columns = split /\t/, $header; my $gene_name_column; for(my $i = 0; $i <= $#header_columns; $i++){ if($header_columns[$i] eq "Gene Name"){ $gene_name_column = $i; } } if(not defined $gene_name_column){ die "Could not find 'Gene Name' column in the input header, aborting\n"; } print OUT "$header\tQuickGO Gene Ontology Terms (matching $orig_query)\tQuickGO Gene Ontology Terms (other)\n"; # Check if any of the variants in the annotated HGVS table are in knockout genes matching the target go term list while(<HGVS>){ chomp; my @F = split /\t/, $_, -1; my (@target_gos, @other_gos); for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ next unless exists $gene2go_ids{$gene_name}; for my $id (@{$gene2go_ids{$gene_name}}){ if(exists $matched_go_ids{$id}){ push @target_gos, $go_id2name{$id}."($matched_go_ids{$id})"; } else{ push @other_gos, $go_id2name{$id}; } } } if(@target_gos){ my (%t,%o); # print unique terms print OUT join("\t", @F, join("; ", sort grep {not $t{$_}++} @target_gos), join("; ", sort grep {not $o{$_}++} @other_gos)), "\n"; } else{ print OUT join("\t", @F, "", ""), "\n"; } }