Mercurial > repos > yusuf > associate_phenotypes
comparison associate_variant_phenotypes @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6411ca16916e |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use Math::CDF qw(pchisq); # chi square for calculating Fisher's method of combining p-values | |
6 use File::Basename; | |
my $dirname = dirname(__FILE__);

# Configuration file handling: the first command-line argument is the Galaxy
# tool-data directory; seed it with the shipped default .loc file on first run,
# then load key/value pairs (whitespace-separated, '#' comments skipped).
my %config;
my $tool_data = shift @ARGV;
if(not -e "$tool_data/associate_phenotypes.loc"){
	# List-form system() bypasses the shell, so paths containing spaces or
	# metacharacters cannot be misinterpreted or injected.
	system("cp", "$dirname/tool-data/associate_phenotypes.loc", "$tool_data/associate_phenotypes.loc") == 0
		or die "Could not create config file: $!\n";
}
open(my $config_fh, '<', "$tool_data/associate_phenotypes.loc")
	or die "Cannot open $tool_data/associate_phenotypes.loc for reading: $!\n";
while(my $line = <$config_fh>){
	next if $line =~ /^#/;
	my ($key, $value) = split /\s+/, $line;
	$config{$key} = $value;
}
close($config_fh);
my $dbs_dir = $config{"dbs_dir"}; # root directory of the phenotype databases
23 | |
@ARGV == 6 or die "Usage: $0 <outfiles prefix> <annotated input> <preselected gene list of interest> <human literature terms> <mouse knockout terms> <gene ontology terms>\n";

my $final_confident_outfiles_prefix = shift @ARGV;
my $confident_input = shift @ARGV;
my $preselected_genes_file = shift @ARGV;
my $human_lit_file = shift @ARGV;
my $mouse_knockout_file = shift @ARGV;
my $gene_ontology_file = shift @ARGV;

# Preselected genes of interest: either a file (one gene per line, '#'
# comments allowed) or a literal " or "-separated list passed on the
# command line instead of a file path.
my @genes;
if(-e $preselected_genes_file){
	open(my $genes_fh, '<', $preselected_genes_file)
		or die "Cannot open $preselected_genes_file for reading: $!\n";
	while(my $gene = <$genes_fh>){
		chomp $gene;
		next if $gene =~ /^#/ or $gene !~ /\S/;
		$gene =~ s/^\s+|\s+$//g; # get rid of trailing or leading spaces
		push @genes, $gene;
	}
	close($genes_fh);
}
else{
	@genes = split / or /, $preselected_genes_file;
}
# Case-insensitive membership lookup for the "Targeted Gene?" column later.
my %genes;
for my $g (@genes){
	$genes{lc($g)} = 1;
}
52 | |
# Human literature query terms: a file of terms (one per line) or a literal
# " or "-separated list. The final query string also appends the targeted
# genes so literature filters pick them up too.
my @human_lit_query;
if(-e $human_lit_file){
	open(my $human_fh, '<', $human_lit_file)
		or die "Cannot open $human_lit_file for reading: $!\n";
	while(my $term = <$human_fh>){
		chomp $term;
		next if $term =~ /^#/ or $term !~ /\S/;
		$term =~ s/^\s+|\s+$//g; # get rid of trailing or leading spaces
		$term =~ s/\s+/ /g; # normalize any other whitespace
		# to do: stem query terms? exclude stop words?
		push @human_lit_query, $term;
	}
	close($human_fh);
}
else{
	@human_lit_query = split / or /, $human_lit_file;
}
my $human_lit_query = join(" or ", @human_lit_query, @genes);
71 | |
# Mouse knockout phenotype query terms: same file-or-literal convention as
# the human literature terms above (genes are NOT appended here; that is
# handled later when the queries are merged).
my @mouse_knockout_query;
if(-e $mouse_knockout_file){
	open(my $mouse_fh, '<', $mouse_knockout_file)
		or die "Cannot open $mouse_knockout_file for reading: $!\n";
	while(my $term = <$mouse_fh>){
		chomp $term;
		next if $term =~ /^#/ or $term !~ /\S/;
		$term =~ s/^\s+|\s+$//g; # get rid of trailing or leading spaces
		$term =~ s/\s+/ /g; # normalize any other whitespace
		# to do: stem query terms? exclude stop words?
		push @mouse_knockout_query, $term;
	}
	close($mouse_fh);
}
else{
	@mouse_knockout_query = split / or /, $mouse_knockout_file;
}
my $mouse_knockout_query = join(" or ", @mouse_knockout_query);
90 | |
# Gene Ontology query terms: same file-or-literal convention as above.
my @go_query;
if(-e $gene_ontology_file){
	open(my $go_fh, '<', $gene_ontology_file)
		or die "Cannot open $gene_ontology_file for reading: $!\n";
	while(my $term = <$go_fh>){
		chomp $term;
		next if $term =~ /^#/ or $term !~ /\S/;
		$term =~ s/^\s+|\s+$//g; # get rid of trailing or leading spaces
		$term =~ s/\s+/ /g; # normalize any other whitespace
		# to do: stem query terms? exclude stop words?
		push @go_query, $term;
	}
	# BUG FIX: close() belongs inside this branch; it previously sat after
	# the if/else, closing a never-opened handle when the file didn't exist.
	close($go_fh);
}
else{
	@go_query = split / or /, $gene_ontology_file;
}
my $go_query = join(" or ", @go_query);
109 | |
# Assemble the chain of filter commands. The first command in the chain reads
# $confident_input directly; every later command reads the previous command's
# stdout ("-" as the input argument), so the whole list can be joined into a
# single shell pipeline by print_final_output().
my @cmds;

if($human_lit_query){
	# do pubmed first because it has to potentially download references from the internet, so better to do this with just a couple concurrent rather than a lot, which would stress the remote iHOP server
	push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/IHOP/ PubMed $confident_input - '$human_lit_query'";
	push @cmds, "$dirname/filter_by_susceptibility_loci_pipe $dbs_dir/GWAS/gwascatalog.txt - - '$human_lit_query'";
	push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/OMIM/omim.txt. OMIM - - '$human_lit_query'";
	push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/ClinVar/ClinVarFullRelease.xml. ClinVar - - '$human_lit_query'";
	push @cmds, "$dirname/filter_by_human_phenotype_ontology_pipe $dbs_dir/HPO - - '$human_lit_query'";
}

if($mouse_knockout_query or $human_lit_query){
	# Fold the human literature terms into the mouse knockout query so the
	# MGI filter also matches on them.
	if($mouse_knockout_query){
		if($human_lit_query){
			$mouse_knockout_query .= " or $human_lit_query";
		}
	}
	else{
		$mouse_knockout_query = $human_lit_query;
	}
	if($human_lit_query){
		# earlier commands exist: read from the pipe
		push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 - - '$mouse_knockout_query'";
	}
	else{
		# first command in the chain: read the annotated input file directly
		# (BUG FIX: the original statement was missing its terminating semicolon)
		push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 $confident_input - '$mouse_knockout_query'";
	}
}

if($go_query or $human_lit_query){
	# Fold the human literature terms (without the appended genes) into the
	# GO query as well.
	if($go_query){
		if(@human_lit_query){
			$go_query .= " or ".join(" or ", @human_lit_query);
		}
	}
	else{
		$go_query = join(" or ", @human_lit_query);
	}
	if($mouse_knockout_query or $human_lit_query){
		push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA - - '$go_query'";
	}
	else{
		push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA $confident_input - '$go_query'";
	}
}

print_final_output($final_confident_outfiles_prefix, @cmds);
156 | |
# Use Fisher's Method to combine p-values from various phenotype sources into a single score for ranking.
# This is an okay method to use (rather than something more complicated like Brown's method), because our
# experience with real queries is that there is surprisingly little correlation (Spearman's rank or
# Kendall's tau) between the p-values for different sources (primary or curated secondary).
#
# Args:    $final_output_prefix - path prefix for the four MAF-stratified output tables
#          @cmds                - shell filter commands, joined into one pipeline and read as a stream
# Output:  writes <prefix>.{novel,very_rare,rare,common}.hgvs.txt; when a "Sources" column is
#          present, also writes per-source copies named <prefix>-SRC.*
# Globals: reads @genes/%genes (targeted gene list) built by the main script.
sub print_final_output{
	my ($final_output_prefix, @cmds) = @_;

	my $cmd = join("|", @cmds)."|"; # trailing pipe so open() reads the pipeline's stdout stream
	open(my $orig_fh, $cmd)
		or die "Could not run '$cmd': $!\n";
	my $header = <$orig_fh>;
	chomp $header;
	my @orig_header = split /\t/, $header;
	# Locate the well-known columns by exact header name; any column mentioning
	# "p-value" contributes to the Fisher combination, and "(matching ...)"
	# columns are ontology-term match lists used as a fallback score.
	my ($chr_column, $pos_column, $gene_column, $hgvs_aa_column, $maf_column, $srcs_column, @pvalue_columns, @pheno_match_columns);
	for my $i (0 .. $#orig_header){
		if($orig_header[$i] eq "Chr"){
			$chr_column = $i;
		}
		elsif($orig_header[$i] eq "DNA From"){
			$pos_column = $i;
		}
		elsif($orig_header[$i] eq "Gene Name"){
			$gene_column = $i;
		}
		elsif($orig_header[$i] eq "Protein HGVS"){
			$hgvs_aa_column = $i;
		}
		elsif($orig_header[$i] eq "Pop. freq."){
			$maf_column = $i;
		}
		elsif($orig_header[$i] eq "Sources"){
			$srcs_column = $i;
		}
		elsif($orig_header[$i] =~ /p-value/){ # columns of pheno association with a stat
			push @pvalue_columns, $i;
		}
		elsif($orig_header[$i] =~ /\(matching/){
			push @pheno_match_columns, $i;
		}
	}
	if(not defined $chr_column){
		die "Could not find the 'Chr' column in the header, aborting ($header)\n";
	}
	elsif(not defined $pos_column){
		die "Could not find the 'DNA From' column in the header, aborting ($header)\n";
	}
	elsif(not defined $hgvs_aa_column){
		die "Could not find the 'Protein HGVS' column in the header, aborting ($header)\n";
	}
	elsif(not defined $maf_column){
		die "Could not find the 'Pop. freq.' column in the header, aborting ($header)\n";
	}
	elsif(@genes and not defined $gene_column){
		# needed below for the "Targeted Gene?" column; fail early with a clear message
		die "Could not find the 'Gene Name' column in the header, aborting ($header)\n";
	}
	# Sources is optional

	# all other headers from other output files generated will be appended to the original ones
	my @final_header = (@orig_header, "Combined phenotype relevance P-value");
	if(@genes){
		push @final_header, "Targeted Gene?";
	}
	my %lines;  # chr -> position -> [dataline1, dataline2, ...]
	my %source; # no. lines per variant source
	while(my $line = <$orig_fh>){
		chomp $line;
		next unless $line =~ /\S/; # ignore blank lines
		my @F = split /\t/, $line, -1; # keep trailing blank fields
		my $chr = $F[$chr_column];
		$chr =~ s/^chr//; # helps for sorting purposes
		my $pos = $F[$pos_column];
		$pos =~ s/-.*$//; # CNVs have a range
		my @final_dataline = @F; # fields that are the same in all files since they were in the original
		# BUG FIX: was "$i < $#final_dataline", which left the last field unchecked
		for my $i (0 .. $#final_dataline){
			$final_dataline[$i] = "" if not defined $final_dataline[$i];
		}
		# Create aggregate phenotype relevance score using Fisher's method.
		# A combined p-value for k p-values (P1...Pk) is calculated using a chi-square value
		# (with 2k degrees of freedom) derived by -2*sum(ln(Pi), i=1..k)
		my $chi_sq = 0;
		my $num_pvalues = 0;
		my $last_pvalue = 1;
		for my $pvalue_index (@pvalue_columns){
			next if $F[$pvalue_index] eq "";
			$last_pvalue = $F[$pvalue_index];
			$F[$pvalue_index] = 0.00001 if not $F[$pvalue_index]; # avoids log(0) issue
			$num_pvalues++;
			$chi_sq += log($F[$pvalue_index]);
		}
		my $fisher_pvalue = 1;
		if($num_pvalues > 1){
			$chi_sq *= -2;
			# BUG FIX: degrees of freedom are 2k for the k p-values actually combined,
			# not 2 * (total number of p-value columns in the header).
			my $p = pchisq($chi_sq, 2*$num_pvalues);
			if(defined $p){
				$fisher_pvalue = 1-$p; # upper tail probability of the chi-square
			}
			else{
				# BUG FIX: previously computed 1-undef after this warning;
				# keep the neutral p-value of 1 instead.
				print STDERR "($num_pvalues) No X2 test value for $chi_sq (";
				for my $pvalue_index (@pvalue_columns){
					if($F[$pvalue_index] eq ""){print STDERR "NA "}
					else{print STDERR $F[$pvalue_index], " "}
				}
				print STDERR ")\n$line\n";
			}
		}
		elsif($num_pvalues == 1){
			$fisher_pvalue = $last_pvalue; # no multiple testing correction
		}
		else{
			# no p-values at all: give a token amount of positive score per ontology term match
			for my $match_column (@pheno_match_columns){
				next if $F[$match_column] eq "";
				for my $match (split /\/\/|;/, $F[$match_column]){
					last if $fisher_pvalue <= 0.001; # only make better if not really close to zero anyway
					$fisher_pvalue -= 0.001;
				}
			}
		}
		push @final_dataline, abs($fisher_pvalue);
		if(@genes){
			push @final_dataline, (grep({exists $genes{$_}} split(/; /, lc($F[$gene_column]))) ? "yes" : "no");
		}
		# push autovivifies both %lines levels (the original's explicit exists
		# checks tested the un-stripped "chr..." key and were never effective)
		push @{$lines{$chr}->{$pos}}, \@final_dataline;

		next unless defined $srcs_column and $F[$srcs_column] =~ /(?:^|\+| )(\S+?)(?=;|$)/;
		$source{$1}++;
	}
	close($orig_fh);

	# Four MAF-stratified tables; each tier is a superset of the rarer ones.
	my @outfiles = ("$final_output_prefix.novel.hgvs.txt", "$final_output_prefix.very_rare.hgvs.txt", "$final_output_prefix.rare.hgvs.txt", "$final_output_prefix.common.hgvs.txt");
	my @out_fhs;
	for my $outfile (@outfiles){
		open(my $fh, '>', $outfile)
			or die "Cannot open $outfile for writing: $!\n";
		print {$fh} join("\t", @final_header), "\n";
		push @out_fhs, $fh;
	}
	my ($novel_fh, $very_rare_fh, $rare_fh, $common_fh) = @out_fhs;
	my @sorted_chrs = sort {$a =~ /^\d+$/ and $b =~ /^\d+$/ and $a <=> $b or $a cmp $b} keys %lines;
	for my $chr (@sorted_chrs){
		for my $pos (sort {$a <=> $b} keys %{$lines{$chr}}){
			my $datalines_ref = $lines{$chr}->{$pos};
			# The following sorting puts all protein coding effects for a variant before
			# non-coding ones. BUG FIX: the original comparator compared the Protein HGVS
			# of $a against itself and compared array references as strings, so the
			# resulting order was effectively arbitrary.
			my @sorted_dataline_refs = sort {
				my $hgvs_a = defined $a->[$hgvs_aa_column] ? $a->[$hgvs_aa_column] : "NA";
				my $hgvs_b = defined $b->[$hgvs_aa_column] ? $b->[$hgvs_aa_column] : "NA";
				($hgvs_a eq "NA") <=> ($hgvs_b eq "NA") or $hgvs_a cmp $hgvs_b
			} @$datalines_ref;
			for my $dataline_ref (@sorted_dataline_refs){
				next unless defined $dataline_ref;
				my $maf = $dataline_ref->[$maf_column];
				my $tot_line_length = 0;
				# BUG FIX: was "$i < $#...", which skipped the last cell entirely
				for my $i (0 .. $#{$dataline_ref}){
					if(not defined $dataline_ref->[$i]){
						$dataline_ref->[$i] = ""; # so we don't get crappy warnings of undefined values
					}
					else{
						$tot_line_length += length($dataline_ref->[$i]);
					}
					$tot_line_length++; # the tab
				}
				if($tot_line_length > 32000){ # Excel limit of 32767 characters in a cell. Also, undocumented bug that entire import line cannot exceed this length.
					# If we don't truncate, the rest of the line (including entire contents of cells to the right) is unceremoniously dumped.
					# Note that personal experience has shown that the limit is actually a bit below this, so rounding down to the nearest 1000 for safety.
					my $sum_of_large_cells = 0;
					my $num_large_cells = 0;
					for my $i (0 .. $#{$dataline_ref}){ # find the largest cells: at least 10% of the max
						if(length($dataline_ref->[$i]) > 3200){
							$sum_of_large_cells += length($dataline_ref->[$i]);
							$num_large_cells++;
						}
					}
					if($num_large_cells){ # BUG FIX: guard against division by zero when no single cell exceeds 3200
						my $cell_max_alloc = int((32000-($tot_line_length-$sum_of_large_cells))/$num_large_cells);
						for my $i (0 .. $#{$dataline_ref}){ # truncate the bigger than average ones
							if(length($dataline_ref->[$i]) > $cell_max_alloc){
								$dataline_ref->[$i] = substr($dataline_ref->[$i], 0, $cell_max_alloc-37)."[...remainder truncated for length]";
							}
						}
					}
				}
				if($maf eq "NA"){
					print {$novel_fh} join("\t", @$dataline_ref), "\n";
				}
				if($maf eq "NA" or $maf < 0.005){
					print {$very_rare_fh} join("\t", @$dataline_ref), "\n";
				}
				if($maf eq "NA" or $maf < 0.05){
					print {$rare_fh} join("\t", @$dataline_ref), "\n";
				}
				print {$common_fh} join("\t", @$dataline_ref), "\n";
			}
		}
	}
	close($_) for @out_fhs;

	# Print per-source tables (e.g. for each patient in a cohort)
	for my $src (keys %source){
		for my $outfile (@outfiles){
			open(my $in_fh, '<', $outfile)
				or die "cannot open $outfile for reading: $!\n";
			my $src_outfile = $outfile;
			# \Q..\E: the prefix and source names are literal strings, not patterns
			$src_outfile =~ s/\Q$final_output_prefix\E/$final_output_prefix-$src/;
			open(my $out_fh, '>', $src_outfile)
				or die "Cannot open $src_outfile for writing: $!\n";
			print {$out_fh} scalar(<$in_fh>); # header line
			while(my $row = <$in_fh>){
				print {$out_fh} $row if $row =~ /(?:^|\+| )(\Q$src\E)(?=;|$)/;
			}
			close($out_fh);
			close($in_fh);
		}
	}
}
366 |