Mercurial > repos > yusuf > associate_phenotypes
comparison filter_by_susceptibility_loci_pipe @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6411ca16916e |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 # Reports SNPs associated with susceptibility loci determined by GWAS | |
4 use strict; | |
5 use warnings; | |
6 | |
7 @ARGV == 4 or die "Usage: $0 <GWAS db.txt> <input.annotated.hgvs.txt> <output.txt> <pheno query>\n"; | |
8 my $gwas_file = shift @ARGV; | |
9 my $hgvs_file = shift @ARGV; | |
10 my $outfile = shift @ARGV; | |
11 my $pheno_query = shift @ARGV; | |
12 | |
13 $pheno_query =~ s/^\s+|\s+$//g; # trim leading and trailing whitespace | |
14 my $s; | |
15 $pheno_query =~ s/\(.*?\)/$s=$&; $s=~s#\s+or\s+#|#g; $s/eg; | |
16 my @or_terms = split /\s+or\s+/i, $pheno_query; | |
17 for my $term (@or_terms){ | |
18 $term = and_query($term); # recursively convert Boolean AND to equivalent regex | |
19 } | |
20 | |
21 # Date Added to Catalog PUBMEDID First Author Date Journal Link Study Disease/Trait Initial Sample Size Replication Sample Size Region Chr_id Chr_pos Reported Gene(s) Mapped_gene Upstream_gene_id Downstream_gene_id Snp_gene_ids Upstream_gene_distance Downstream_gene_distance Strongest SNP-Risk Allele SNPs Merged Snp_id_current Context Intergenic Risk Allele Frequency p-Value Pvalue_mlog p-Value (text) OR or beta 95% CI (text) Platform [SNPs passing QC] CNV | |
22 open(GWAS, $gwas_file) | |
23 or die "Cannot open GWAS file for reading: $!\n"; | |
24 my $header = <GWAS>; | |
25 chomp $header; | |
26 my @columns = split /\t/, $header; | |
27 my $trait_column_index = -1; | |
28 my $study_column_index = -1; | |
29 my $chr_column_index = -1; | |
30 my $pos_column_index = -1; | |
31 my $allele_column_index = -1; | |
32 my $context_column_index = -1; | |
33 my $odds_column_index = -1; | |
34 my $pubmed_column_index = -1; | |
35 for(my $i = 0; $i < $#columns; $i++){ | |
36 if($columns[$i] eq "Disease/Trait"){ | |
37 $trait_column_index = $i; | |
38 } | |
39 elsif($columns[$i] eq "Study"){ | |
40 $study_column_index = $i; | |
41 } | |
42 elsif($columns[$i] eq "Chr_id"){ | |
43 $chr_column_index = $i; | |
44 } | |
45 elsif($columns[$i] eq "Chr_pos"){ | |
46 $pos_column_index = $i; | |
47 } | |
48 elsif($columns[$i] eq "Strongest SNP-Risk Allele"){ | |
49 $allele_column_index = $i; | |
50 } | |
51 elsif($columns[$i] eq "p-Value (text)"){ | |
52 $context_column_index = $i; | |
53 } | |
54 elsif($columns[$i] eq "OR or beta"){ | |
55 $odds_column_index = $i; | |
56 } | |
57 elsif($columns[$i] eq "PUBMEDID"){ | |
58 $pubmed_column_index = $i; | |
59 } | |
60 } | |
61 if($trait_column_index == -1){ | |
62 die "Could not find Trait column header in provided GWAS catalog file $gwas_file, aborting\n"; | |
63 } | |
64 if($study_column_index == -1){ | |
65 die "Could not find Study column header in provided GWAS catalog file $gwas_file, aborting\n"; | |
66 } | |
67 if($chr_column_index == -1){ | |
68 die "Could not find chromosome name column header in provided GWAS catalog file $gwas_file, aborting\n"; | |
69 } | |
70 if($pos_column_index == -1){ | |
71 die "Could not find chromosomal position column header in provided GWAS catalog file $gwas_file, aborting\n"; | |
72 } | |
73 if($allele_column_index == -1){ | |
74 die "Could not find risk allele column header in provided GWAS catalog file $gwas_file, aborting\n"; | |
75 } | |
76 if($context_column_index == -1){ | |
77 die "Could not find p-value context column header in provided GWAS catalog file $gwas_file, aborting\n"; | |
78 } | |
79 if($odds_column_index == -1){ | |
80 die "Could not find odds ratio column header in provided GWAS catalog file $gwas_file, aborting\n"; | |
81 } | |
82 if($pubmed_column_index == -1){ | |
83 die "Could not find PubMed ID column header in provided GWAS catalog file $gwas_file, aborting\n"; | |
84 } | |
85 | |
86 my %snp2desc; | |
87 while(<GWAS>){ | |
88 chomp; | |
89 next unless /\S/; | |
90 my @F = split /\t/, $_; | |
91 my $chr = $F[$chr_column_index]; | |
92 $chr =~ s/^chr//; # drop prefix if present | |
93 my $strongest_risk_allele = $F[$allele_column_index]; | |
94 $strongest_risk_allele =~ s/^.*-//; # initially looks like rs4345897-G, strip to just letter | |
95 $snp2desc{$chr.":".$F[$pos_column_index].":".$strongest_risk_allele} = [$F[$trait_column_index], $F[$context_column_index], $F[$study_column_index], $F[$odds_column_index], $F[$pubmed_column_index]]; | |
96 } | |
97 close(GWAS); | |
98 | |
99 open(MATCHOUT, ">$outfile") | |
100 or die "Cannot open $outfile for writing: $!\n"; | |
101 | |
102 open(HGVS, $hgvs_file) | |
103 or die "Cannot open $hgvs_file for reading: $!\n"; | |
104 $header = <HGVS>; | |
105 chomp $header; | |
106 my @header_columns = split /\t/, $header; | |
107 my ($chr_column, $obs_column, $pos_column); | |
108 for(my $i = 0; $i <= $#header_columns; $i++){ | |
109 if($header_columns[$i] eq "DNA From"){ | |
110 $pos_column = $i; | |
111 } | |
112 elsif($header_columns[$i] eq "Obs base"){ | |
113 $obs_column = $i; | |
114 } | |
115 elsif($header_columns[$i] eq "Chr"){ | |
116 $chr_column = $i; | |
117 } | |
118 } | |
119 if(not defined $chr_column){ | |
120 die "Could not find 'Chr' column in the input header, aborting\n"; | |
121 } | |
122 if(not defined $pos_column){ | |
123 die "Could not find 'DNA From' column in the input header, aborting\n"; | |
124 } | |
125 if(not defined $obs_column){ | |
126 die "Could not find 'Obs base' column in the input header, aborting\n"; | |
127 } | |
128 | |
129 print MATCHOUT $header, "\tPhenotype GWAS Catalog text matches (matching $pheno_query)\tGWAS odds ratio\tGWAS susceptibility description for this SNP\n"; | |
130 while(<HGVS>){ | |
131 chomp; | |
132 my @F = split /\t/, $_, -1; | |
133 my $allele = $F[$obs_column]; | |
134 $allele =~ s(/.*$)(); # ignore ref allele in het calls | |
135 my $chr = $F[$chr_column]; | |
136 $chr =~ s/^chr//; # remove chr name prefix if present | |
137 my $key = $chr.":".$F[$pos_column].":".$allele; | |
138 #print STDERR "$key\n"; | |
139 if(not exists $snp2desc{$key}){ | |
140 print MATCHOUT join("\t", @F, "", "", ""), "\n"; | |
141 next; | |
142 } | |
143 my @matches; | |
144 for my $term (@or_terms){ | |
145 next if grep {$_ eq $term} @matches; | |
146 #print STDERR "Checking match for term $term...\n"; | |
147 if($snp2desc{$key}->[0] =~ /\b$term/i or # trait | |
148 $snp2desc{$key}->[1] =~ /\b$term/i or # p-value context | |
149 $snp2desc{$key}->[2] =~ /\b$term/i){ # study name | |
150 push @matches, $term; | |
151 last; | |
152 } | |
153 } | |
154 chomp $F[$#F]; | |
155 # print all GWAS, whether they match query or not | |
156 print MATCHOUT join("\t", @F, join("; ", sort @matches), $snp2desc{$key}->[3], "PubMedID ".$snp2desc{$key}->[4].": trait '".$snp2desc{$key}->[0]."'".($snp2desc{$key}->[1] =~ /\S/ ? " in context of '".$snp2desc{$key}->[1]."'" : "").". Study: ".$snp2desc{$key}->[2]), "\n"; | |
157 } | |
158 close(HGVS); | |
159 close(MATCHOUT); | |
160 | |
161 sub and_query{ | |
162 my ($query) = @_; | |
163 if($query =~ /^(.+)\s+and\s+(.+)$/){ | |
164 my $t1 = $1; | |
165 my $t2 = $2; | |
166 my $term1 = and_query($t1); | |
167 my $term2 = and_query($t2); | |
168 return "($term1.*$term2|$term2.*$term1)"; | |
169 } | |
170 else{ | |
171 return $query; # base case: single term | |
172 } | |
173 return | |
174 } |