comparison filter_by_susceptibility_loci_pipe @ 0:6411ca16916e default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:23:29 -0600
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6411ca16916e
1 #!/usr/bin/env perl
2
3 # Reports SNPs associated with susceptibility loci determined by GWAS
4 use strict;
5 use warnings;
6
7 @ARGV == 4 or die "Usage: $0 <GWAS db.txt> <input.annotated.hgvs.txt> <output.txt> <pheno query>\n";
8 my $gwas_file = shift @ARGV;
9 my $hgvs_file = shift @ARGV;
10 my $outfile = shift @ARGV;
11 my $pheno_query = shift @ARGV;
12
13 $pheno_query =~ s/^\s+|\s+$//g; # trim leading and trailing whitespace
14 my $s;
15 $pheno_query =~ s/\(.*?\)/$s=$&; $s=~s#\s+or\s+#|#g; $s/eg;
16 my @or_terms = split /\s+or\s+/i, $pheno_query;
17 for my $term (@or_terms){
18 $term = and_query($term); # recursively convert Boolean AND to equivalent regex
19 }
20
21 # Date Added to Catalog PUBMEDID First Author Date Journal Link Study Disease/Trait Initial Sample Size Replication Sample Size Region Chr_id Chr_pos Reported Gene(s) Mapped_gene Upstream_gene_id Downstream_gene_id Snp_gene_ids Upstream_gene_distance Downstream_gene_distance Strongest SNP-Risk Allele SNPs Merged Snp_id_current Context Intergenic Risk Allele Frequency p-Value Pvalue_mlog p-Value (text) OR or beta 95% CI (text) Platform [SNPs passing QC] CNV
22 open(GWAS, $gwas_file)
23 or die "Cannot open GWAS file for reading: $!\n";
24 my $header = <GWAS>;
25 chomp $header;
26 my @columns = split /\t/, $header;
27 my $trait_column_index = -1;
28 my $study_column_index = -1;
29 my $chr_column_index = -1;
30 my $pos_column_index = -1;
31 my $allele_column_index = -1;
32 my $context_column_index = -1;
33 my $odds_column_index = -1;
34 my $pubmed_column_index = -1;
35 for(my $i = 0; $i < $#columns; $i++){
36 if($columns[$i] eq "Disease/Trait"){
37 $trait_column_index = $i;
38 }
39 elsif($columns[$i] eq "Study"){
40 $study_column_index = $i;
41 }
42 elsif($columns[$i] eq "Chr_id"){
43 $chr_column_index = $i;
44 }
45 elsif($columns[$i] eq "Chr_pos"){
46 $pos_column_index = $i;
47 }
48 elsif($columns[$i] eq "Strongest SNP-Risk Allele"){
49 $allele_column_index = $i;
50 }
51 elsif($columns[$i] eq "p-Value (text)"){
52 $context_column_index = $i;
53 }
54 elsif($columns[$i] eq "OR or beta"){
55 $odds_column_index = $i;
56 }
57 elsif($columns[$i] eq "PUBMEDID"){
58 $pubmed_column_index = $i;
59 }
60 }
61 if($trait_column_index == -1){
62 die "Could not find Trait column header in provided GWAS catalog file $gwas_file, aborting\n";
63 }
64 if($study_column_index == -1){
65 die "Could not find Study column header in provided GWAS catalog file $gwas_file, aborting\n";
66 }
67 if($chr_column_index == -1){
68 die "Could not find chromosome name column header in provided GWAS catalog file $gwas_file, aborting\n";
69 }
70 if($pos_column_index == -1){
71 die "Could not find chromosomal position column header in provided GWAS catalog file $gwas_file, aborting\n";
72 }
73 if($allele_column_index == -1){
74 die "Could not find risk allele column header in provided GWAS catalog file $gwas_file, aborting\n";
75 }
76 if($context_column_index == -1){
77 die "Could not find p-value context column header in provided GWAS catalog file $gwas_file, aborting\n";
78 }
79 if($odds_column_index == -1){
80 die "Could not find odds ratio column header in provided GWAS catalog file $gwas_file, aborting\n";
81 }
82 if($pubmed_column_index == -1){
83 die "Could not find PubMed ID column header in provided GWAS catalog file $gwas_file, aborting\n";
84 }
85
86 my %snp2desc;
87 while(<GWAS>){
88 chomp;
89 next unless /\S/;
90 my @F = split /\t/, $_;
91 my $chr = $F[$chr_column_index];
92 $chr =~ s/^chr//; # drop prefix if present
93 my $strongest_risk_allele = $F[$allele_column_index];
94 $strongest_risk_allele =~ s/^.*-//; # initially looks like rs4345897-G, strip to just letter
95 $snp2desc{$chr.":".$F[$pos_column_index].":".$strongest_risk_allele} = [$F[$trait_column_index], $F[$context_column_index], $F[$study_column_index], $F[$odds_column_index], $F[$pubmed_column_index]];
96 }
97 close(GWAS);
98
99 open(MATCHOUT, ">$outfile")
100 or die "Cannot open $outfile for writing: $!\n";
101
102 open(HGVS, $hgvs_file)
103 or die "Cannot open $hgvs_file for reading: $!\n";
104 $header = <HGVS>;
105 chomp $header;
106 my @header_columns = split /\t/, $header;
107 my ($chr_column, $obs_column, $pos_column);
108 for(my $i = 0; $i <= $#header_columns; $i++){
109 if($header_columns[$i] eq "DNA From"){
110 $pos_column = $i;
111 }
112 elsif($header_columns[$i] eq "Obs base"){
113 $obs_column = $i;
114 }
115 elsif($header_columns[$i] eq "Chr"){
116 $chr_column = $i;
117 }
118 }
119 if(not defined $chr_column){
120 die "Could not find 'Chr' column in the input header, aborting\n";
121 }
122 if(not defined $pos_column){
123 die "Could not find 'DNA From' column in the input header, aborting\n";
124 }
125 if(not defined $obs_column){
126 die "Could not find 'Obs base' column in the input header, aborting\n";
127 }
128
129 print MATCHOUT $header, "\tPhenotype GWAS Catalog text matches (matching $pheno_query)\tGWAS odds ratio\tGWAS susceptibility description for this SNP\n";
130 while(<HGVS>){
131 chomp;
132 my @F = split /\t/, $_, -1;
133 my $allele = $F[$obs_column];
134 $allele =~ s(/.*$)(); # ignore ref allele in het calls
135 my $chr = $F[$chr_column];
136 $chr =~ s/^chr//; # remove chr name prefix if present
137 my $key = $chr.":".$F[$pos_column].":".$allele;
138 #print STDERR "$key\n";
139 if(not exists $snp2desc{$key}){
140 print MATCHOUT join("\t", @F, "", "", ""), "\n";
141 next;
142 }
143 my @matches;
144 for my $term (@or_terms){
145 next if grep {$_ eq $term} @matches;
146 #print STDERR "Checking match for term $term...\n";
147 if($snp2desc{$key}->[0] =~ /\b$term/i or # trait
148 $snp2desc{$key}->[1] =~ /\b$term/i or # p-value context
149 $snp2desc{$key}->[2] =~ /\b$term/i){ # study name
150 push @matches, $term;
151 last;
152 }
153 }
154 chomp $F[$#F];
155 # print all GWAS, whether they match query or not
156 print MATCHOUT join("\t", @F, join("; ", sort @matches), $snp2desc{$key}->[3], "PubMedID ".$snp2desc{$key}->[4].": trait '".$snp2desc{$key}->[0]."'".($snp2desc{$key}->[1] =~ /\S/ ? " in context of '".$snp2desc{$key}->[1]."'" : "").". Study: ".$snp2desc{$key}->[2]), "\n";
157 }
158 close(HGVS);
159 close(MATCHOUT);
160
161 sub and_query{
162 my ($query) = @_;
163 if($query =~ /^(.+)\s+and\s+(.+)$/){
164 my $t1 = $1;
165 my $t2 = $2;
166 my $term1 = and_query($t1);
167 my $term2 = and_query($t2);
168 return "($term1.*$term2|$term2.*$term1)";
169 }
170 else{
171 return $query; # base case: single term
172 }
173 return
174 }