0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 # Reports SNPs associated with susceptibility loci determined by GWAS
|
|
4 use strict;
|
|
5 use warnings;
|
|
6
|
|
7 @ARGV == 4 or die "Usage: $0 <GWAS db.txt> <input.annotated.hgvs.txt> <output.txt> <pheno query>\n";
|
|
8 my $gwas_file = shift @ARGV;
|
|
9 my $hgvs_file = shift @ARGV;
|
|
10 my $outfile = shift @ARGV;
|
|
11 my $pheno_query = shift @ARGV;
|
|
12
|
|
13 $pheno_query =~ s/^\s+|\s+$//g; # trim leading and trailing whitespace
|
|
14 my $s;
|
|
15 $pheno_query =~ s/\(.*?\)/$s=$&; $s=~s#\s+or\s+#|#g; $s/eg;
|
|
16 my @or_terms = split /\s+or\s+/i, $pheno_query;
|
|
17 for my $term (@or_terms){
|
|
18 $term = and_query($term); # recursively convert Boolean AND to equivalent regex
|
|
19 }
|
|
20
|
|
21 # Date Added to Catalog PUBMEDID First Author Date Journal Link Study Disease/Trait Initial Sample Size Replication Sample Size Region Chr_id Chr_pos Reported Gene(s) Mapped_gene Upstream_gene_id Downstream_gene_id Snp_gene_ids Upstream_gene_distance Downstream_gene_distance Strongest SNP-Risk Allele SNPs Merged Snp_id_current Context Intergenic Risk Allele Frequency p-Value Pvalue_mlog p-Value (text) OR or beta 95% CI (text) Platform [SNPs passing QC] CNV
|
|
22 open(GWAS, $gwas_file)
|
|
23 or die "Cannot open GWAS file for reading: $!\n";
|
|
24 my $header = <GWAS>;
|
|
25 chomp $header;
|
|
26 my @columns = split /\t/, $header;
|
|
27 my $trait_column_index = -1;
|
|
28 my $study_column_index = -1;
|
|
29 my $chr_column_index = -1;
|
|
30 my $pos_column_index = -1;
|
|
31 my $allele_column_index = -1;
|
|
32 my $context_column_index = -1;
|
|
33 my $odds_column_index = -1;
|
|
34 my $pubmed_column_index = -1;
|
|
35 for(my $i = 0; $i < $#columns; $i++){
|
|
36 if($columns[$i] eq "Disease/Trait"){
|
|
37 $trait_column_index = $i;
|
|
38 }
|
|
39 elsif($columns[$i] eq "Study"){
|
|
40 $study_column_index = $i;
|
|
41 }
|
|
42 elsif($columns[$i] eq "Chr_id"){
|
|
43 $chr_column_index = $i;
|
|
44 }
|
|
45 elsif($columns[$i] eq "Chr_pos"){
|
|
46 $pos_column_index = $i;
|
|
47 }
|
|
48 elsif($columns[$i] eq "Strongest SNP-Risk Allele"){
|
|
49 $allele_column_index = $i;
|
|
50 }
|
|
51 elsif($columns[$i] eq "p-Value (text)"){
|
|
52 $context_column_index = $i;
|
|
53 }
|
|
54 elsif($columns[$i] eq "OR or beta"){
|
|
55 $odds_column_index = $i;
|
|
56 }
|
|
57 elsif($columns[$i] eq "PUBMEDID"){
|
|
58 $pubmed_column_index = $i;
|
|
59 }
|
|
60 }
|
|
61 if($trait_column_index == -1){
|
|
62 die "Could not find Trait column header in provided GWAS catalog file $gwas_file, aborting\n";
|
|
63 }
|
|
64 if($study_column_index == -1){
|
|
65 die "Could not find Study column header in provided GWAS catalog file $gwas_file, aborting\n";
|
|
66 }
|
|
67 if($chr_column_index == -1){
|
|
68 die "Could not find chromosome name column header in provided GWAS catalog file $gwas_file, aborting\n";
|
|
69 }
|
|
70 if($pos_column_index == -1){
|
|
71 die "Could not find chromosomal position column header in provided GWAS catalog file $gwas_file, aborting\n";
|
|
72 }
|
|
73 if($allele_column_index == -1){
|
|
74 die "Could not find risk allele column header in provided GWAS catalog file $gwas_file, aborting\n";
|
|
75 }
|
|
76 if($context_column_index == -1){
|
|
77 die "Could not find p-value context column header in provided GWAS catalog file $gwas_file, aborting\n";
|
|
78 }
|
|
79 if($odds_column_index == -1){
|
|
80 die "Could not find odds ratio column header in provided GWAS catalog file $gwas_file, aborting\n";
|
|
81 }
|
|
82 if($pubmed_column_index == -1){
|
|
83 die "Could not find PubMed ID column header in provided GWAS catalog file $gwas_file, aborting\n";
|
|
84 }
|
|
85
|
|
86 my %snp2desc;
|
|
87 while(<GWAS>){
|
|
88 chomp;
|
|
89 next unless /\S/;
|
|
90 my @F = split /\t/, $_;
|
|
91 my $chr = $F[$chr_column_index];
|
|
92 $chr =~ s/^chr//; # drop prefix if present
|
|
93 my $strongest_risk_allele = $F[$allele_column_index];
|
|
94 $strongest_risk_allele =~ s/^.*-//; # initially looks like rs4345897-G, strip to just letter
|
|
95 $snp2desc{$chr.":".$F[$pos_column_index].":".$strongest_risk_allele} = [$F[$trait_column_index], $F[$context_column_index], $F[$study_column_index], $F[$odds_column_index], $F[$pubmed_column_index]];
|
|
96 }
|
|
97 close(GWAS);
|
|
98
|
|
99 open(MATCHOUT, ">$outfile")
|
|
100 or die "Cannot open $outfile for writing: $!\n";
|
|
101
|
|
102 open(HGVS, $hgvs_file)
|
|
103 or die "Cannot open $hgvs_file for reading: $!\n";
|
|
104 $header = <HGVS>;
|
|
105 chomp $header;
|
|
106 my @header_columns = split /\t/, $header;
|
|
107 my ($chr_column, $obs_column, $pos_column);
|
|
108 for(my $i = 0; $i <= $#header_columns; $i++){
|
|
109 if($header_columns[$i] eq "DNA From"){
|
|
110 $pos_column = $i;
|
|
111 }
|
|
112 elsif($header_columns[$i] eq "Obs base"){
|
|
113 $obs_column = $i;
|
|
114 }
|
|
115 elsif($header_columns[$i] eq "Chr"){
|
|
116 $chr_column = $i;
|
|
117 }
|
|
118 }
|
|
119 if(not defined $chr_column){
|
|
120 die "Could not find 'Chr' column in the input header, aborting\n";
|
|
121 }
|
|
122 if(not defined $pos_column){
|
|
123 die "Could not find 'DNA From' column in the input header, aborting\n";
|
|
124 }
|
|
125 if(not defined $obs_column){
|
|
126 die "Could not find 'Obs base' column in the input header, aborting\n";
|
|
127 }
|
|
128
|
|
129 print MATCHOUT $header, "\tPhenotype GWAS Catalog text matches (matching $pheno_query)\tGWAS odds ratio\tGWAS susceptibility description for this SNP\n";
|
|
130 while(<HGVS>){
|
|
131 chomp;
|
|
132 my @F = split /\t/, $_, -1;
|
|
133 my $allele = $F[$obs_column];
|
|
134 $allele =~ s(/.*$)(); # ignore ref allele in het calls
|
|
135 my $chr = $F[$chr_column];
|
|
136 $chr =~ s/^chr//; # remove chr name prefix if present
|
|
137 my $key = $chr.":".$F[$pos_column].":".$allele;
|
|
138 #print STDERR "$key\n";
|
|
139 if(not exists $snp2desc{$key}){
|
|
140 print MATCHOUT join("\t", @F, "", "", ""), "\n";
|
|
141 next;
|
|
142 }
|
|
143 my @matches;
|
|
144 for my $term (@or_terms){
|
|
145 next if grep {$_ eq $term} @matches;
|
|
146 #print STDERR "Checking match for term $term...\n";
|
|
147 if($snp2desc{$key}->[0] =~ /\b$term/i or # trait
|
|
148 $snp2desc{$key}->[1] =~ /\b$term/i or # p-value context
|
|
149 $snp2desc{$key}->[2] =~ /\b$term/i){ # study name
|
|
150 push @matches, $term;
|
|
151 last;
|
|
152 }
|
|
153 }
|
|
154 chomp $F[$#F];
|
|
155 # print all GWAS, whether they match query or not
|
|
156 print MATCHOUT join("\t", @F, join("; ", sort @matches), $snp2desc{$key}->[3], "PubMedID ".$snp2desc{$key}->[4].": trait '".$snp2desc{$key}->[0]."'".($snp2desc{$key}->[1] =~ /\S/ ? " in context of '".$snp2desc{$key}->[1]."'" : "").". Study: ".$snp2desc{$key}->[2]), "\n";
|
|
157 }
|
|
158 close(HGVS);
|
|
159 close(MATCHOUT);
|
|
160
|
|
161 sub and_query{
|
|
162 my ($query) = @_;
|
|
163 if($query =~ /^(.+)\s+and\s+(.+)$/){
|
|
164 my $t1 = $1;
|
|
165 my $t2 = $2;
|
|
166 my $term1 = and_query($t1);
|
|
167 my $term2 = and_query($t2);
|
|
168 return "($term1.*$term2|$term2.*$term1)";
|
|
169 }
|
|
170 else{
|
|
171 return $query; # base case: single term
|
|
172 }
|
|
173 return
|
|
174 }
|