Mercurial > repos > yusuf > associate_phenotypes
comparison filter_by_gene_ontology_pipe @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6411ca16916e |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 my $quiet = 0; | |
7 if(@ARGV and $ARGV[0] =~ /^-q/){ | |
8 $quiet = 1; | |
9 shift @ARGV; | |
10 } | |
11 | |
12 @ARGV == 4 or die "Usage: $0 [-q(uiet)] <GO data dir> <hgvs_annotated.txt> <output.txt> <query>\n", | |
13 "Where query has the format \"this or that\", \"this and that\", etc.\n", | |
14 "GOA and OBO files must be found in the GO data dir\n"; | |
15 | |
16 my $go_dir = shift @ARGV; | |
17 my $obo_file = "$go_dir/gene_ontology.1_2.obo"; | |
18 my $hgvs_file = shift @ARGV; | |
19 my $out_file = shift @ARGV; | |
20 my $query = shift @ARGV; | |
21 | |
22 # Terms with meronyms and hyponyms in their free text descriptions, causing overgeneralization problems | |
23 my %problematic_terms = ("GO:0033647" => "host intracellular organelle", | |
24 "GO:0006996" => "organelle organization", | |
25 "GO:0043226" => "organelle", | |
26 "GO:0043227" => "membrane-bounded organelle", | |
27 "GO:0043229" => "intracellular organelle", | |
28 "GO:0043231" => "intracellular membrane-bounded organelle", | |
29 "GO:0044384" => "host cell outer membrane", | |
30 "GO:0044422" => "organelle part", | |
31 "GO:0044446" => "intracellular organelle part", | |
32 "GO:0045202" => "synapse", | |
33 "GO:0033648" => "host intracellular membrane-bounded organelle"); | |
34 | |
35 #$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals | |
36 | |
37 # convert the query to a regex | |
38 my $orig_query = $query; | |
39 my $and_query = 0; | |
40 $query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; | |
41 if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ | |
42 $and_query = 1; | |
43 } | |
44 $query =~ s/\s+or\s+/|/gi; | |
45 $query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name | |
46 #print STDERR "Query regex for GO is $query\n" unless $quiet; | |
47 | |
48 open(OBO, $obo_file) | |
49 or die "Cannot open $obo_file for reading: $!\n"; | |
50 my %matched_go_ids; | |
51 my %go_id2subtypes; | |
52 my %go_id2name; | |
53 my $record_count; | |
54 $/ = "\n[Term]\n"; | |
55 <OBO>; # chuck header | |
56 while(<OBO>){ | |
57 next unless /^id:\s*(GO:\d+)/s; | |
58 my $id = $1; | |
59 next unless /\nname:\s*(.+?)\s*\n/s; | |
60 my $name = $1; | |
61 $go_id2name{$id} = $name; | |
62 $record_count++; | |
63 while(/\nis_a:\s*(GO:\d+)/g){ | |
64 my $parent_id = $1; | |
65 $go_id2subtypes{$parent_id} = [] unless exists $go_id2subtypes{$parent_id}; | |
66 push @{$go_id2subtypes{$parent_id}}, $id; | |
67 } | |
68 if(exists $problematic_terms{$id}){ | |
69 if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text | |
70 my $match = $1; | |
71 $match =~ tr/\t\n/ /; | |
72 $match =~ s/ {2,}/ /g; | |
73 if(not exists $matched_go_ids{$id}){ | |
74 $matched_go_ids{$id} = $match; | |
75 } | |
76 elsif($matched_go_ids{$id} !~ /$match/){ | |
77 $matched_go_ids{$id} .= "; $match"; | |
78 } | |
79 } | |
80 } | |
81 elsif(/\b($query)/o){ | |
82 my $match = $1; | |
83 $match =~ tr/\t\n/ /; | |
84 $match =~ s/ {2,}/ /g; | |
85 if(not exists $matched_go_ids{$id}){ | |
86 $matched_go_ids{$id} = $match; | |
87 } | |
88 elsif($matched_go_ids{$id} !~ /$match/){ | |
89 $matched_go_ids{$id} .= "; $match"; | |
90 } | |
91 #print STDERR "Found match $match for $_\n"; | |
92 } | |
93 } | |
94 close(OBO); | |
95 #print STDERR "Found ", scalar(keys %matched_go_ids), "/$record_count go ontology terms matching the query\n"; | |
96 | |
97 open(OUT, ">$out_file") | |
98 or die "Cannot open $out_file for writing: $!\n"; | |
99 | |
100 # Implements term subsumption | |
101 my @matched_go_ids = keys %matched_go_ids; | |
102 for(my $i = 0; $i <= $#matched_go_ids; $i++){ | |
103 my $go_id = $matched_go_ids[$i]; | |
104 next unless exists $go_id2subtypes{$go_id}; | |
105 for my $sub_type_id (@{$go_id2subtypes{$go_id}}){ | |
106 if(not exists $matched_go_ids{$sub_type_id}){ | |
107 $matched_go_ids{$sub_type_id} = $matched_go_ids{$go_id}; | |
108 push @matched_go_ids, $sub_type_id; | |
109 } | |
110 } | |
111 } | |
112 | |
113 $/="\n"; # record separator | |
114 my %gene2go_ids; | |
115 opendir(GOADIR, $go_dir) | |
116 or die "Cannot read directory $go_dir: $!\n"; | |
117 while($_ = readdir(GOADIR)){ | |
118 next if /^\./; # hidden | |
119 next unless /\.goa$/; # not a goa formatted file | |
120 my $goa_file = $_; | |
121 open(GOA, "$go_dir/$goa_file") | |
122 or die "Cannot open $go_dir/$goa_file for reading: $!\n"; | |
123 #print STDERR "Processing file $goa_file\n"; | |
124 # example line: | |
125 # UniProtKB A0ASJ9 GO:0001664 GO_REF:0000033 ISS PANTHER:PTN000026392 F G protein alpha subunit AgGq6 A0ASJ9_ANOGA protein taxon:7165 20110125 RefGenome | |
126 while(<GOA>){ | |
127 next if /^!/; # comment | |
128 chomp; | |
129 my @F = split /\t/, $_; | |
130 next unless $F[2] and $#F > 3; # does it have the gene name and go id fields? | |
131 my $genename = uc($F[2]); # standardize gene names to upper case | |
132 $genename =~ s/-//g; | |
133 my $go_id = $F[4]; | |
134 $gene2go_ids{$genename} = [] unless exists $gene2go_ids{$genename}; | |
135 push @{$gene2go_ids{$genename}}, $go_id; | |
136 } | |
137 close(GOA); | |
138 } | |
139 close(GOADIR); | |
140 #print STDERR "Found ", scalar(keys %gene2go_ids), " total genes\n"; | |
141 | |
142 # remove genes if they don't have a matching go term | |
143 for my $genename (keys %gene2go_ids){ | |
144 my $keep = 0; | |
145 for my $go_id (@{$gene2go_ids{$genename}}){ | |
146 if(exists $matched_go_ids{$go_id}){ | |
147 $keep = 1; | |
148 last; | |
149 } | |
150 } | |
151 delete $gene2go_ids{$genename} unless $keep; | |
152 } | |
153 #print STDERR "Found ", scalar(keys %gene2go_ids), " genes with gene ontology terms matching the query\n" unless $quiet; | |
154 | |
155 $/ = "\n"; # one line at, a time from the HGVS file please! | |
156 open(HGVS, $hgvs_file) | |
157 or die "Cannot open $hgvs_file for reading: $!\n"; | |
158 my $header = <HGVS>; | |
159 chomp $header; | |
160 my @header_columns = split /\t/, $header; | |
161 my $gene_name_column; | |
162 for(my $i = 0; $i <= $#header_columns; $i++){ | |
163 if($header_columns[$i] eq "Gene Name"){ | |
164 $gene_name_column = $i; | |
165 } | |
166 } | |
167 if(not defined $gene_name_column){ | |
168 die "Could not find 'Gene Name' column in the input header, aborting\n"; | |
169 } | |
170 | |
171 print OUT "$header\tQuickGO Gene Ontology Terms (matching $orig_query)\tQuickGO Gene Ontology Terms (other)\n"; | |
172 # Check if any of the variants in the annotated HGVS table are in knockout genes matching the target go term list | |
173 while(<HGVS>){ | |
174 chomp; | |
175 my @F = split /\t/, $_, -1; | |
176 my (@target_gos, @other_gos); | |
177 for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ | |
178 next unless exists $gene2go_ids{$gene_name}; | |
179 for my $id (@{$gene2go_ids{$gene_name}}){ | |
180 if(exists $matched_go_ids{$id}){ | |
181 push @target_gos, $go_id2name{$id}."($matched_go_ids{$id})"; | |
182 } | |
183 else{ | |
184 push @other_gos, $go_id2name{$id}; | |
185 } | |
186 } | |
187 } | |
188 if(@target_gos){ | |
189 my (%t,%o); | |
190 # print unique terms | |
191 print OUT join("\t", @F, join("; ", sort grep {not $t{$_}++} @target_gos), join("; ", sort grep {not $o{$_}++} @other_gos)), "\n"; | |
192 } | |
193 else{ | |
194 print OUT join("\t", @F, "", ""), "\n"; | |
195 } | |
196 } |