0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5
|
|
6 my $quiet = 0;
|
|
7 if(@ARGV and $ARGV[0] =~ /^-q/){
|
|
8 $quiet = 1;
|
|
9 shift @ARGV;
|
|
10 }
|
|
11
|
|
12 @ARGV == 4 or die "Usage: $0 [-q(uiet)] <GO data dir> <hgvs_annotated.txt> <output.txt> <query>\n",
|
|
13 "Where query has the format \"this or that\", \"this and that\", etc.\n",
|
|
14 "GOA and OBO files must be found in the GO data dir\n";
|
|
15
|
|
16 my $go_dir = shift @ARGV;
|
|
17 my $obo_file = "$go_dir/gene_ontology.1_2.obo";
|
|
18 my $hgvs_file = shift @ARGV;
|
|
19 my $out_file = shift @ARGV;
|
|
20 my $query = shift @ARGV;
|
|
21
|
|
22 # Terms with meronyms and hyponyms in their free text descriptions, causing overgeneralization problems
|
|
23 my %problematic_terms = ("GO:0033647" => "host intracellular organelle",
|
|
24 "GO:0006996" => "organelle organization",
|
|
25 "GO:0043226" => "organelle",
|
|
26 "GO:0043227" => "membrane-bounded organelle",
|
|
27 "GO:0043229" => "intracellular organelle",
|
|
28 "GO:0043231" => "intracellular membrane-bounded organelle",
|
|
29 "GO:0044384" => "host cell outer membrane",
|
|
30 "GO:0044422" => "organelle part",
|
|
31 "GO:0044446" => "intracellular organelle part",
|
|
32 "GO:0045202" => "synapse",
|
|
33 "GO:0033648" => "host intracellular membrane-bounded organelle");
|
|
34
|
|
35 #$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals
|
|
36
|
|
37 # convert the query to a regex
|
|
38 my $orig_query = $query;
|
|
39 my $and_query = 0;
|
|
40 $query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg;
|
|
41 if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){
|
|
42 $and_query = 1;
|
|
43 }
|
|
44 $query =~ s/\s+or\s+/|/gi;
|
|
45 $query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name
|
|
46 #print STDERR "Query regex for GO is $query\n" unless $quiet;
|
|
47
|
|
48 open(OBO, $obo_file)
|
|
49 or die "Cannot open $obo_file for reading: $!\n";
|
|
50 my %matched_go_ids;
|
|
51 my %go_id2subtypes;
|
|
52 my %go_id2name;
|
|
53 my $record_count;
|
|
54 $/ = "\n[Term]\n";
|
|
55 <OBO>; # chuck header
|
|
56 while(<OBO>){
|
|
57 next unless /^id:\s*(GO:\d+)/s;
|
|
58 my $id = $1;
|
|
59 next unless /\nname:\s*(.+?)\s*\n/s;
|
|
60 my $name = $1;
|
|
61 $go_id2name{$id} = $name;
|
|
62 $record_count++;
|
|
63 while(/\nis_a:\s*(GO:\d+)/g){
|
|
64 my $parent_id = $1;
|
|
65 $go_id2subtypes{$parent_id} = [] unless exists $go_id2subtypes{$parent_id};
|
|
66 push @{$go_id2subtypes{$parent_id}}, $id;
|
|
67 }
|
|
68 if(exists $problematic_terms{$id}){
|
|
69 if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text
|
|
70 my $match = $1;
|
|
71 $match =~ tr/\t\n/ /;
|
|
72 $match =~ s/ {2,}/ /g;
|
|
73 if(not exists $matched_go_ids{$id}){
|
|
74 $matched_go_ids{$id} = $match;
|
|
75 }
|
|
76 elsif($matched_go_ids{$id} !~ /$match/){
|
|
77 $matched_go_ids{$id} .= "; $match";
|
|
78 }
|
|
79 }
|
|
80 }
|
|
81 elsif(/\b($query)/o){
|
|
82 my $match = $1;
|
|
83 $match =~ tr/\t\n/ /;
|
|
84 $match =~ s/ {2,}/ /g;
|
|
85 if(not exists $matched_go_ids{$id}){
|
|
86 $matched_go_ids{$id} = $match;
|
|
87 }
|
|
88 elsif($matched_go_ids{$id} !~ /$match/){
|
|
89 $matched_go_ids{$id} .= "; $match";
|
|
90 }
|
|
91 #print STDERR "Found match $match for $_\n";
|
|
92 }
|
|
93 }
|
|
94 close(OBO);
|
|
95 #print STDERR "Found ", scalar(keys %matched_go_ids), "/$record_count go ontology terms matching the query\n";
|
|
96
|
|
97 open(OUT, ">$out_file")
|
|
98 or die "Cannot open $out_file for writing: $!\n";
|
|
99
|
|
100 # Implements term subsumption
|
|
101 my @matched_go_ids = keys %matched_go_ids;
|
|
102 for(my $i = 0; $i <= $#matched_go_ids; $i++){
|
|
103 my $go_id = $matched_go_ids[$i];
|
|
104 next unless exists $go_id2subtypes{$go_id};
|
|
105 for my $sub_type_id (@{$go_id2subtypes{$go_id}}){
|
|
106 if(not exists $matched_go_ids{$sub_type_id}){
|
|
107 $matched_go_ids{$sub_type_id} = $matched_go_ids{$go_id};
|
|
108 push @matched_go_ids, $sub_type_id;
|
|
109 }
|
|
110 }
|
|
111 }
|
|
112
|
|
113 $/="\n"; # record separator
|
|
114 my %gene2go_ids;
|
|
115 opendir(GOADIR, $go_dir)
|
|
116 or die "Cannot read directory $go_dir: $!\n";
|
|
117 while($_ = readdir(GOADIR)){
|
|
118 next if /^\./; # hidden
|
|
119 next unless /\.goa$/; # not a goa formatted file
|
|
120 my $goa_file = $_;
|
|
121 open(GOA, "$go_dir/$goa_file")
|
|
122 or die "Cannot open $go_dir/$goa_file for reading: $!\n";
|
|
123 #print STDERR "Processing file $goa_file\n";
|
|
124 # example line:
|
|
125 # UniProtKB A0ASJ9 GO:0001664 GO_REF:0000033 ISS PANTHER:PTN000026392 F G protein alpha subunit AgGq6 A0ASJ9_ANOGA protein taxon:7165 20110125 RefGenome
|
|
126 while(<GOA>){
|
|
127 next if /^!/; # comment
|
|
128 chomp;
|
|
129 my @F = split /\t/, $_;
|
|
130 next unless $F[2] and $#F > 3; # does it have the gene name and go id fields?
|
|
131 my $genename = uc($F[2]); # standardize gene names to upper case
|
|
132 $genename =~ s/-//g;
|
|
133 my $go_id = $F[4];
|
|
134 $gene2go_ids{$genename} = [] unless exists $gene2go_ids{$genename};
|
|
135 push @{$gene2go_ids{$genename}}, $go_id;
|
|
136 }
|
|
137 close(GOA);
|
|
138 }
|
|
139 close(GOADIR);
|
|
140 #print STDERR "Found ", scalar(keys %gene2go_ids), " total genes\n";
|
|
141
|
|
142 # remove genes if they don't have a matching go term
|
|
143 for my $genename (keys %gene2go_ids){
|
|
144 my $keep = 0;
|
|
145 for my $go_id (@{$gene2go_ids{$genename}}){
|
|
146 if(exists $matched_go_ids{$go_id}){
|
|
147 $keep = 1;
|
|
148 last;
|
|
149 }
|
|
150 }
|
|
151 delete $gene2go_ids{$genename} unless $keep;
|
|
152 }
|
|
153 #print STDERR "Found ", scalar(keys %gene2go_ids), " genes with gene ontology terms matching the query\n" unless $quiet;
|
|
154
|
|
155 $/ = "\n"; # one line at, a time from the HGVS file please!
|
|
156 open(HGVS, $hgvs_file)
|
|
157 or die "Cannot open $hgvs_file for reading: $!\n";
|
|
158 my $header = <HGVS>;
|
|
159 chomp $header;
|
|
160 my @header_columns = split /\t/, $header;
|
|
161 my $gene_name_column;
|
|
162 for(my $i = 0; $i <= $#header_columns; $i++){
|
|
163 if($header_columns[$i] eq "Gene Name"){
|
|
164 $gene_name_column = $i;
|
|
165 }
|
|
166 }
|
|
167 if(not defined $gene_name_column){
|
|
168 die "Could not find 'Gene Name' column in the input header, aborting\n";
|
|
169 }
|
|
170
|
|
171 print OUT "$header\tQuickGO Gene Ontology Terms (matching $orig_query)\tQuickGO Gene Ontology Terms (other)\n";
|
|
172 # Check if any of the variants in the annotated HGVS table are in knockout genes matching the target go term list
|
|
173 while(<HGVS>){
|
|
174 chomp;
|
|
175 my @F = split /\t/, $_, -1;
|
|
176 my (@target_gos, @other_gos);
|
|
177 for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
|
|
178 next unless exists $gene2go_ids{$gene_name};
|
|
179 for my $id (@{$gene2go_ids{$gene_name}}){
|
|
180 if(exists $matched_go_ids{$id}){
|
|
181 push @target_gos, $go_id2name{$id}."($matched_go_ids{$id})";
|
|
182 }
|
|
183 else{
|
|
184 push @other_gos, $go_id2name{$id};
|
|
185 }
|
|
186 }
|
|
187 }
|
|
188 if(@target_gos){
|
|
189 my (%t,%o);
|
|
190 # print unique terms
|
|
191 print OUT join("\t", @F, join("; ", sort grep {not $t{$_}++} @target_gos), join("; ", sort grep {not $o{$_}++} @other_gos)), "\n";
|
|
192 }
|
|
193 else{
|
|
194 print OUT join("\t", @F, "", ""), "\n";
|
|
195 }
|
|
196 }
|