Mercurial > repos > yusuf > associate_phenotypes
comparison filter_by_human_phenotype_ontology_pipe @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6411ca16916e |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 my $quiet = 0; | |
7 if(@ARGV and $ARGV[0] =~ /^-q/){ | |
8 $quiet = 1; | |
9 shift @ARGV; | |
10 } | |
11 | |
12 @ARGV == 4 or die "Usage: $0 [-q(uiet)] <human_phenotype_ontology_dir> <hgvs_annotated.txt> <output.txt> <query>\n", | |
13 "Where query has the format \"this or that\", \"this and that\", etc.\n", | |
14 "Human phenotype files are available from http://compbio.charite.de/svn/hpo/trunk/src/ontology/human-phenotype-ontology.obo and genes_to_phenotype.txt\n"; | |
15 | |
16 my $hpo_dir = shift @ARGV; | |
17 my $obo_file = "$hpo_dir/human-phenotype-ontology.obo"; | |
18 my $gene_pheno_file = "$hpo_dir/genes_to_phenotype.txt"; | |
19 my $hgvs_file = shift @ARGV; | |
20 my $out_file = shift @ARGV; | |
21 my $query = shift @ARGV; | |
22 | |
23 # Below is a list of ontology terms with definitions including hyponyms and meronyms, so only match text on title so as not to get too many false positives | |
24 my %problematic = qw(HP:0003271 Visceromegaly | |
25 HP:0000246 Sinusitis); | |
26 # Ontology terms with HPO in the free text, leading to nasty overmatching when searching for gene HPO | |
27 my %self_referencing = ( | |
28 "HP:0000118" => "Phenotypic abnormality", | |
29 "HP:0000455" => "Broad nasal tip", | |
30 "HP:0000968" => "Ectodermal dysplasia", | |
31 "HP:0002652" => "Skeletal dysplasia", | |
32 "HP:0003812" => "Phenotypic variability", | |
33 "HP:0003828" => "Variable expressivity", | |
34 "HP:0003829" => "Incomplete penetrance", | |
35 "HP:0004472" => "Mandibular hyperostosis", | |
36 "HP:0004495" => "Thin anteverted nares", | |
37 "HP:0005286" => "Hypoplastic, notched nares", | |
38 "HP:0005321" => "Mandibulofacial dysostosis", | |
39 "HP:0005871" => "Metaphyseal chondrodysplasia", | |
40 "HP:0007589" => "Aplasia cutis congenita on trunk or limbs", | |
41 "HP:0007819" => "Presenile cataracts"); | |
42 | |
43 # Ignore metadata field words, so we can properly match gene names like HPO :-) | |
44 my %stop_words = ("name:" => 1, | |
45 "HPO:" => 1, | |
46 "alt_id:" => 1, | |
47 "def:" => 1, | |
48 "synonym:" => 1, | |
49 "EXACT" => 1, | |
50 "xref:" => 1, | |
51 "UMLS:" => 1, | |
52 "is_a:" => 1, | |
53 "created_by:" => 1, | |
54 "comment:" => 1, | |
55 "creation_date:" => 1); | |
56 | |
57 # convert the query to a regex | |
58 my $orig_query = $query; | |
59 my $and_query = 0; | |
60 $query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; | |
61 if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ | |
62 $and_query = 1; | |
63 } | |
64 $query =~ s/\s+or\s+/|/gi; | |
65 $query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name | |
66 #print STDERR "Query regex is $query\n" unless $quiet; | |
67 | |
68 | |
69 open(OBO, $obo_file) | |
70 or die "Cannot open $obo_file for reading: $!\n"; | |
71 my %matched_pheno_ids; | |
72 my %pheno_id2subtypes; | |
73 my %pheno_id2name; | |
74 my $record_count; | |
75 $/ = "\n[Term]\n"; | |
76 <OBO>; # chuck header | |
77 while(<OBO>){ | |
78 next unless /^id:\s*(HP:\d+)/s; | |
79 my $id = $1; | |
80 next unless /\nname:\s*(.+?)\s*\n/s; | |
81 my $name = $1; | |
82 $pheno_id2name{$id} = $name; | |
83 $record_count++; | |
84 while(/\nis_a:\s*(HP:\d+)/g){ | |
85 my $parent_id = $1; | |
86 $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id}; | |
87 push @{$pheno_id2subtypes{$parent_id}}, $id; | |
88 } | |
89 s/(UMLS:\S+\s+")(.+?)(?=")/$1.lc($2)/eg; | |
90 if(exists $problematic{$id}){ # for overmatching terms due to their descriptions (hyponyms and meronyms included in | |
91 # parent entry), only match the title to not generate too many false poositives | |
92 while($name =~ /\b($query)(\S*?:?)/go){ | |
93 next if exists $stop_words{$1.$2}; | |
94 my $match = $1; | |
95 $match =~ tr/\t\n/ /; | |
96 $match =~ s/\s{2,}/ /g; | |
97 if(not exists $matched_pheno_ids{$id}){ | |
98 $matched_pheno_ids{$id} = $match; | |
99 } | |
100 elsif($matched_pheno_ids{$id} !~ /$match/){ | |
101 $matched_pheno_ids{$id} .= "; $match"; | |
102 } | |
103 } | |
104 } | |
105 else{ # normally, match anywhere in the entry | |
106 while(/\b($query)(\S*?:?)/go){ | |
107 next if defined $2 and exists $stop_words{$1.$2} or $1 eq "HPO" and $self_referencing{$id}; | |
108 my $match = $1; | |
109 $match =~ tr/\t\n/ /; | |
110 $match =~ s/\s{2,}/ /g; | |
111 if(not exists $matched_pheno_ids{$id}){ | |
112 $matched_pheno_ids{$id} = $match; | |
113 } | |
114 elsif($matched_pheno_ids{$id} !~ /\Q$match\E/){ | |
115 $matched_pheno_ids{$id} .= "; $match"; | |
116 } | |
117 #print STDERR "Match $match for $_\n"; | |
118 } | |
119 } | |
120 } | |
121 close(OBO); | |
122 #print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n"; | |
123 | |
124 | |
125 # Implements term subsumption | |
126 my @matched_pheno_ids = keys %matched_pheno_ids; | |
127 for(my $i = 0; $i <= $#matched_pheno_ids; $i++){ | |
128 my $pheno_id = $matched_pheno_ids[$i]; | |
129 next unless exists $pheno_id2subtypes{$pheno_id}; | |
130 for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){ | |
131 if(not exists $matched_pheno_ids{$sub_type_id}){ | |
132 $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id}; | |
133 push @matched_pheno_ids, $sub_type_id; | |
134 } | |
135 } | |
136 } | |
137 | |
138 $/="\n"; # record separator | |
139 my %gene2pheno_ids; | |
140 # Format: entrez-gene-id<tab>entrez-gene-symbol<tab>HPO-Term-Name<tab>HPO-Term-ID | |
141 open(PHENO, $gene_pheno_file) | |
142 or die "Cannot open $gene_pheno_file for reading: $!\n"; | |
143 while(<PHENO>){ | |
144 chomp; | |
145 my @F = split /\t/, $_; | |
146 next unless $#F > 2; # does it have the phenotype id field? | |
147 my $gene = $F[1]; | |
148 my $pheno_id = $F[3]; | |
149 $gene2pheno_ids{$gene} = [] unless exists $gene2pheno_ids{$gene}; | |
150 push @{$gene2pheno_ids{$gene}}, $pheno_id; | |
151 } | |
152 | |
153 # remove genes if they don't have a matching phenotype | |
154 for my $gene (keys %gene2pheno_ids){ | |
155 my $keep = 0; | |
156 for my $pheno_id (@{$gene2pheno_ids{$gene}}){ | |
157 if(exists $matched_pheno_ids{$pheno_id}){ | |
158 $keep = 1; | |
159 last; | |
160 } | |
161 } | |
162 delete $gene2pheno_ids{$gene} unless $keep; | |
163 } | |
164 #print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with human phenotype ontology terms matching the query\n" unless $quiet; | |
165 | |
166 $/ = "\n"; # one line at, a time from the HGVS file please! | |
167 open(HGVS, $hgvs_file) | |
168 or die "Cannot open $hgvs_file for reading: $!\n"; | |
169 my $header = <HGVS>; | |
170 chomp $header; | |
171 my @header_columns = split /\t/, $header; | |
172 my $gene_name_column; | |
173 for(my $i = 0; $i <= $#header_columns; $i++){ | |
174 if($header_columns[$i] eq "Gene Name"){ | |
175 $gene_name_column = $i; | |
176 } | |
177 } | |
178 if(not defined $gene_name_column){ | |
179 die "Could not find 'Gene Name' column in the input header, aborting\n"; | |
180 } | |
181 open(OUT, ">$out_file") | |
182 or die "Cannot open $out_file for writing: $!\n"; | |
183 print OUT "$header\tHuman Phenotypes (matching $orig_query)\tHuman Phenotypes (other)\n"; | |
184 | |
185 # Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list | |
186 while(<HGVS>){ | |
187 chomp; | |
188 my @F = split /\t/, $_, -1; | |
189 my (@target_phenos, @other_phenos); | |
190 for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ | |
191 next unless exists $gene2pheno_ids{$gene_name}; | |
192 for my $id (@{$gene2pheno_ids{$gene_name}}){ | |
193 next unless exists $pheno_id2name{$id}; | |
194 if(exists $matched_pheno_ids{$id}){ | |
195 push @target_phenos, $pheno_id2name{$id}."($matched_pheno_ids{$id})"; | |
196 } | |
197 else{ | |
198 push @other_phenos, $pheno_id2name{$id}; | |
199 } | |
200 } | |
201 } | |
202 if(@target_phenos){ | |
203 print OUT join("\t", @F, join("; ", @target_phenos), join("; ", @other_phenos)), "\n"; | |
204 } | |
205 else{ | |
206 print OUT join("\t", @F, "", ""), "\n"; | |
207 } | |
208 } | |
209 close(OUT); | |
210 close(HGVS); |