annotate gfapts/gfap_r1.0_allvar_genomic_annotater.pl @ 0:f753b30013e6 draft

Uploaded
author rdaveau
date Fri, 29 Jun 2012 10:20:55 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1 #!/usr/bin/perl
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
3 use strict;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
4 use warnings FATAL => qw[ numeric uninitialized ];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
5 use File::Basename;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
6 use Getopt::Long;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
7
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# sepind($str, $sep)
#
# Return a reference to an array holding the 0-based offset of every
# occurrence of $sep inside $str, in ascending order. Overlapping
# occurrences are all reported, because the scan resumes one character
# after each hit.
#
# An undefined $str or an undefined/empty $sep yields a reference to an
# empty array.
sub sepind{
    my ($str, $sep) = @_;
    my @pos;

    # index() with an empty needle always "matches" and clamps the start
    # offset to the string length, so scanning would never terminate.
    return \@pos if !defined $str || !defined $sep || $sep eq '';

    my $pos = index($str, $sep);
    while ($pos >= 0) {
        push @pos, $pos;
        $pos = index($str, $sep, $pos + 1);
    }
    return \@pos;
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
19
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Script-wide state. The first ten scalars receive the command-line option
# values; the remaining scalars, arrays and hashes are working storage for
# the processing further down (%mid and %ogs are the lookup tables loaded
# from the RefSeq and COSMIC files).
my($varfile, $buildver, $refseq_dir, $cosmic_dir, $refseq_release, $cosmic_release, $annovar_release, $outdir, $noncoding, $coding, $cos, $ogs, $mid, $pid, $cno, $pno);
my(@buffer, @header, @ogs, @mid, @cno, @pno, @sep, %buffer, %mid, %ogs, %opts);

# Every option takes a string value.
my @option_names = qw(
    varfile buildver refseq_dir refseq_release cosmic_dir
    cosmic_release annovar_release outdir noncoding coding
);
GetOptions(\%opts, map { "$_=s" } @option_names);

# All ten values are interpolated into paths, commands or legend strings
# later on. With "uninitialized" warnings fatal, a missing one would abort
# the script anyway, but with a message that does not name the option.
my @missing = grep { !defined $opts{$_} } @option_names;
die "Missing required option(s): " . join(', ', map { "--$_" } @missing) . "\n"
    if @missing;

# The variable order below matches the order of @option_names.
($varfile, $buildver, $refseq_dir, $refseq_release, $cosmic_dir,
 $cosmic_release, $annovar_release, $outdir, $noncoding, $coding)
    = @opts{@option_names};
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
34
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Column legend: column name => one-line description. Each entry is written
# as a "#name = description" line at the top of the output files. The
# genome build and the COSMIC/RefSeq release labels are spliced into the
# descriptions that depend on them.
my $gpff_source = 'from human.protein.gpff ' . $refseq_release . ' release';
my %legend = (
    unk   => 'undefined column',
    chr   => 'chromosome identifier',
    start => $buildver . ' 1-based start position',
    end   => $buildver . ' 1-based end position',
    ref   => 'reference allele',
    alt   => 'alternate allele',
    annot => 'ig:intergenic; pp:1kb-upstream; 5|3u:UTR; in:intronic; ss:splice; nc:ncRNA',
    ogs   => 'official gene symbol(s)',
    cos   => 'gene listed in cosmic ' . $cosmic_release . ' release',
    mid   => 'RefSeq mRNA identifier(s) ' . $gpff_source,
    pid   => 'RefSeq protein identifier(s) ' . $gpff_source,
    'c.x' => 'ATG-based variant descriptor in mRNA',
    'p.x' => 'ATG-based variant descriptor in protein',
);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
50
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Locations of the ANNOVAR scripts and of the release-specific database,
# both relative to the current working directory.
my $annovar_src_dir = 'inc/annovar';
my $annovar_db_dir = "db/annovar/${annovar_release}";

# readlink() returns a false value when $varfile is not a symbolic link, in
# which case the path itself is used. Only the file name part is kept: the
# file is then looked up under $outdir.
my $fname = readlink($varfile) || $varfile;
$fname = basename($fname);

# Run annotate_variation.pl on ${outdir}/${fname}. Backticks are kept so the
# child's stdout is captured rather than passed through; stderr is discarded.
# Success is judged from the exit status in $?: the captured output says
# nothing about whether the command worked, and $! is not set by a child
# that exits with a non-zero status.
my $annovar_stdout = `${annovar_src_dir}/annotate_variation.pl -buildver $buildver ${outdir}/${fname} $annovar_db_dir 2> /dev/null`;
if ($? == -1) {
    die "Could not execute ${annovar_src_dir}/annotate_variation.pl: $!\n";
}
elsif ($? != 0) {
    die sprintf(
        "%s/annotate_variation.pl failed on %s/%s (wait status %d, exit code %d)%s\n",
        $annovar_src_dir, $outdir, $fname, $?, $? >> 8,
        (defined $annovar_stdout && length $annovar_stdout) ? ":\n$annovar_stdout" : ''
    );
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
58
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Load the mid2pid table into %mid: the first whitespace-delimited token of
# each line becomes the key and the second token its value. Lines starting
# with '#' are skipped.
my $mid2pid_file = "${refseq_dir}/mid2pid_${refseq_release}.txt";
open my $mid2pid_fh, '<', $mid2pid_file or die "Cannot open $mid2pid_file: $!\n";
while (my $line = <$mid2pid_fh>) {
    next if $line =~ /^#/;
    # Lines without two tokens (e.g. blank lines) are skipped; using $1/$2
    # after a failed match would reuse the captures of an earlier line.
    next unless $line =~ /^(\S+)\s+(\S+)/;
    $mid{$1} = $2;
}
close $mid2pid_fh;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
66
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Load the COSMIC gene list into %ogs, one key per line of the file. Only
# key existence is tested later; the stored value is an occurrence count.
my $cosmic_file = "${cosmic_dir}/${buildver}_cosmic_ogs_${cosmic_release}.txt";
open my $cosmic_fh, '<', $cosmic_file or die "Cannot open $cosmic_file: $!\n";
while (my $symbol = <$cosmic_fh>) {
    # chomp's return value is deliberately not tested: it is 0 for a final
    # line that lacks a trailing newline, and that entry must still be kept.
    chomp $symbol;
    $ogs{$symbol}++;
}
close $cosmic_fh;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
70
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Collect the column names of the input file into @header.
#
# Leading lines of the form "#name = description" each declare one column:
# the name is appended to @header and the description stored in %legend.
# Scanning stops at the first line that does not have that form.
my $variant_file = "${outdir}/${fname}";
open my $variant_fh, '<', $variant_file or die "Cannot open $variant_file: $!\n";
my $sample_line;
while (my $line = <$variant_fh>) {
    chomp $line;
    if ($line =~ /^#(\S+)\s{1}=\s{1}(.+)/) {
        push @header, $1;
        $legend{$1} = $2;
        next;
    }
    # Keep the line that ended the scan: it has already been consumed from
    # the handle and may be needed below to count columns.
    $sample_line = $line;
    last;
}
if (!@header) {
    # No legend lines: assume the five standard columns and label every
    # additional column 'unk'. The column count is taken from the first
    # line that is not a '#' comment; an empty file adds no extra columns.
    @header = ('chr', 'start', 'end', 'ref', 'alt');
    while (defined $sample_line && $sample_line =~ /^#/) {
        $sample_line = <$variant_fh>;
        chomp $sample_line if defined $sample_line;
    }
    if (defined $sample_line) {
        # Limit -1 keeps trailing empty fields so they count as columns.
        my @fields = split /\t/, $sample_line, -1;
        my $extra = @fields - 5;
        push @header, ('unk') x $extra if $extra > 0;
    }
}
close $variant_fh;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Write ${outdir}/${fname}.nc from ${outdir}/${fname}.variant_function.
# Every input line that does not contain "exonic" is kept. Its first two
# tab-separated fields (annotation, gene) are moved behind the remaining
# fields, and a final column tells whether the gene is a key of %ogs.
push @header, ('annot', 'ogs', 'cos');
my $nc_file = "${outdir}/${fname}.nc";
open my $nc_out, '>', $nc_file or die "Cannot create $nc_file: $!\n";
# One "#name = description" line per column, then the column-name row.
print {$nc_out} "#", join(' = ', $_, $legend{$_}), "\n" foreach @header;
print {$nc_out} "#", join("\t", @header), "\n";

my $vf_file = "${outdir}/${fname}.variant_function";
open my $vf_in, '<', $vf_file or die "Cannot open $vf_file: $!\n";
while (my $line = <$vf_in>) {
    next if $line =~ /exonic/;
    # A lone "downstream" annotation is reported as 'ig'; when it follows
    # another annotation (";downstream") it is simply dropped.
    $line =~ s/^downstream/ig/;
    $line =~ s/;downstream//;
    $line =~ s/,/:/g;
    # Abbreviate the annotation keywords: UTR3/UTR5 -> 3u/5u, upstream -> pp,
    # intronic -> in, splicing -> ss, ncRNA -> nc, intergenic -> ig.
    $line =~ s/(UTR(3|5))|(upstream)|(intronic)|(splicing)|(ncRNA)|(intergenic)/$1?"${2}u":$3?'pp':$4?'in':$5?'ss':$6?'nc':'ig'/eg;
    chomp $line;
    my @fields = split /\t/, $line;
    $fields[1] = 'na' if $fields[0] eq 'ig';
    # With a single annotation, keep only the first ';'-separated gene entry.
    $fields[1] =~ s/([^;]+);(?:\S+)$/$1/ if $fields[0] !~ /;/;
    my $cos_flag = ($fields[1] eq 'na') ? 'na'
                 : (exists $ogs{$fields[1]}) ? 'TRUE'
                 : 'FALSE';
    print {$nc_out} join("\t", @fields[2..$#fields, 0..1], $cos_flag), "\n";
}
close $vf_in;
# Buffered write errors (e.g. a full disk) only surface when closing.
close $nc_out or die "Error while writing $nc_file: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
107
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Write ${outdir}/${fname}.cds from ${outdir}/${fname}.exonic_variant_function.
#
# Output columns: the fields that follow the consequence and the transcript
# list on the input line, then annot, ogs, cos, mid, pid, c.x and p.x.
# Inside the last four columns ';' separates genes, '|' separates distinct
# c.x descriptors of one gene and ':' separates mRNA identifiers that share
# a c.x descriptor.
$legend{annot}='fd:frameshift deletion; fi:frameshift insertion; nd:nonframeshift deletion; ni:nonframeshift insertion; bs:block substitution; ss:synonymous SNV; ns:nonsynonymous SNV; sg:stopgain; sl:stoploss; na:unknown';
push @header, ('mid', 'pid', 'c.x', 'p.x');

my $evf_file = "${outdir}/${fname}.exonic_variant_function";
my $cds_file = "${outdir}/${fname}.cds";
open my $evf_in, '<', $evf_file or die "Cannot open $evf_file: $!\n";
open my $cds_out, '>', $cds_file or die "Cannot create $cds_file: $!\n";
# One "#name = description" line per column, then the column-name row.
print {$cds_out} "#", join(' = ', $_, $legend{$_}), "\n" foreach @header;
print {$cds_out} "#", join("\t", @header), "\n";

while (my $line = <$evf_in>) {
    next if $line =~ /unknown/;
    # Drop the first whitespace-delimited token of the line.
    $line =~ s/^\S+\s+//;
    chomp $line;
    my @fields = split /\t/, $line;

    # Collapse the consequence text into its two-letter code (see the
    # 'annot' legend above); anything unrecognised becomes 'na'.
    # NOTE(review): a consequence spelled "nonframeshift block substitution"
    # would fall through to the "^(n|s)" branch and be coded 'ns' -- confirm
    # against the ANNOVAR release in use.
    $fields[0] =~ s/(nonf\w+\s{1}(d|i|s)\w+)|(\w+\s{1}(d|i)\w+)|(stop(\w){1}.+)|(^(n|s).+)|(.+)/$1?(($2 eq 's')?'b':'n').$2:$3?"f$4":$5?"s$6":$7?"${8}s":'na'/eg;

    # Each comma-separated entry is gene:mRNA:<skipped>:c.x:p.x; missing
    # parts default to 'na'.
    my (%mids_of, %pno_of);
    foreach my $entry (split /,/, $fields[1]) {
        my ($gene, $mrna, undef, $c_desc, $p_desc) = split /:/, $entry;
        $_ ||= 'na' foreach ($gene, $mrna, $c_desc, $p_desc);
        $mids_of{$gene}{$c_desc}{$mrna}++;
        # Keyed by gene as well as by c.x, so that two genes sharing a c.x
        # descriptor cannot overwrite each other's p.x.
        $pno_of{$gene}{$c_desc} = $p_desc;
    }

    # Build the per-gene groups. The mid, c.x and p.x groups are derived
    # from the same key list so their elements stay aligned.
    my (@genes, @mid_groups, @cno_groups, @pno_groups);
    my $in_cosmic = 0;
    foreach my $gene (keys %mids_of) {
        my @c_descs = keys %{ $mids_of{$gene} };
        push @genes, $gene;
        push @cno_groups, join('|', @c_descs);
        push @pno_groups, join('|', map { $pno_of{$gene}{$_} } @c_descs);
        push @mid_groups, join('|', map { join(':', keys %{ $mids_of{$gene}{$_} }) } @c_descs);
        $in_cosmic = 1 if exists $ogs{$gene};
    }

    my $mid_col = join(';', @mid_groups);
    # The pid column mirrors the mid column: each mRNA identifier is replaced
    # by its %mid entry, or by 'na' when it has none.
    (my $pid_col = $mid_col) =~ s/([^;\|:]+)/$mid{$1} || 'na'/eg;

    print {$cds_out} join("\t",
        @fields[2..$#fields],
        $fields[0],
        join(';', @genes),
        ($in_cosmic ? 'TRUE' : 'FALSE'),
        $mid_col,
        $pid_col,
        join(';', @cno_groups),
        join(';', @pno_groups),
    ), "\n";
}
close $evf_in;
# Buffered write errors (e.g. a full disk) only surface when closing.
close $cds_out or die "Error while writing $cds_file: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Publish the results: remove whatever currently sits at $noncoding and
# $coding together with the intermediate *variant_function and *invalid*
# files, then make $noncoding and $coding symbolic links to the .nc and
# .cds files. Done with builtins so that paths are never re-parsed by a
# shell and each step reports its own error.
require File::Glob;
# Removal is best-effort: a file that is already absent is not an error.
unlink(
    $noncoding, $coding,
    File::Glob::bsd_glob("${outdir}/${fname}*variant_function"),
    File::Glob::bsd_glob("${outdir}/${fname}*invalid*"),
);
symlink("${outdir}/${fname}.nc", $noncoding)
    or die "Cannot link $noncoding to ${outdir}/${fname}.nc: $!\n";
symlink("${outdir}/${fname}.cds", $coding)
    or die "Cannot link $coding to ${outdir}/${fname}.cds: $!\n";