Mercurial > repos > rdaveau > gfap
changeset 0:f753b30013e6 draft
Uploaded
author | rdaveau |
---|---|
date | Fri, 29 Jun 2012 10:20:55 -0400 |
parents | |
children | 028f435b6cfb |
files | gfapts/README gfapts/gfap_r1.0_allvar_genomic_annotater.pl gfapts/gfap_r1.0_allvar_genomic_annotater.xml gfapts/gfap_r1.0_cdsvar_functional_annotater.pl gfapts/gfap_r1.0_cdsvar_functional_annotater.xml gfapts/gfap_r1.0_known_var_finder.pl gfapts/gfap_r1.0_known_var_finder.xml gfapts/gfap_r1.0_samvcf_data_parser.pl gfapts/gfap_r1.0_samvcf_data_parser.xml gfapts/inc/R/samvcf_data_parser.R gfapts/inc/annovar/annotate_variation.pl gfapts/inc/annovar/convert2annovar.pl gfapts/inc/perlmod/ngsutil.pm |
diffstat | 13 files changed, 5589 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gfapts/README Fri Jun 29 10:20:55 2012 -0400 @@ -0,0 +1,12 @@ +## What is gfap ? +The Genomic and Functional Annotation Pipeline (gfap) consists of a set of Perl and R tools that +aim to annotate human genetic SNVs/InDels identified by 2nd-generation sequencing technologies. + +## How does it work ? +gfap starts with a samtools-formatted VCF file, i.e. the DP4 tag from the INFO field is mandatory. +gfap performs its annotation in a 4-step linear procedure, i.e.: +1> SAMVCF.data.parser 2> Known.variants.finder 3> Map.to.genomic.features 4> Map.to.dbNSFP.features + +## Pre-requisites +In gfap_r1.0_samvcf_data_parser.pl (line 13), the $rbin variable assumes that your R binary is located at /usr/bin/R. +If R is installed elsewhere on your system, you will have to edit $rbin accordingly. \ No newline at end of file
#!/usr/bin/perl
#
# gfap_r1.0_allvar_genomic_annotater.pl
#
# Runs inc/annovar/annotate_variation.pl on <outdir>/<basename of varfile> and
# reformats the two files it is expected to leave next to that input:
#   <fname>.variant_function        -> <fname>.nc   (rows not mentioning "exonic")
#   <fname>.exonic_variant_function -> <fname>.cds  (rows not mentioning "unknown")
# The ANNOVAR intermediates are then removed and the -noncoding / -coding paths
# are replaced by symlinks to the .nc / .cds files.
#
# All ten command-line options are required.

use strict;
use warnings FATAL => qw[ numeric uninitialized ];
use File::Basename;
use Getopt::Long;

# Return a reference to the list of 0-based offsets at which $sep occurs in
# $str (an empty list when it never occurs).
sub sepind{
    my($str, $sep)=@_;
    my @pos;
    my $pos=0;
    while(1){
        $pos=index($str, $sep, $pos);
        last if($pos<0);
        push @pos, $pos++;    # record the hit, then resume one character further
    }
    return \@pos;
}

# Delete every entry of $dir whose name starts with $prefix and whose remainder
# matches $tail_re. Replaces the former shell globs so that file names are
# never interpreted by a shell.
sub remove_matching{
    my($dir, $prefix, $tail_re)=@_;
    opendir(my $dh, $dir) or die "cannot read directory ${dir}: $!\n";
    foreach my $entry (readdir $dh){
        next if index($entry, $prefix)!=0;
        next if substr($entry, length($prefix))!~$tail_re;
        unlink "${dir}/${entry}";
    }
    closedir $dh;
    return;
}

my($varfile, $buildver, $refseq_dir, $cosmic_dir, $refseq_release, $cosmic_release, $annovar_release, $outdir, $noncoding, $coding, $cos, $ogs, $mid, $pid, $cno, $pno);
my(@buffer, @header, @ogs, @mid, @cno, @pno, @sep, %buffer, %mid, %ogs, %opts);

GetOptions(\%opts, "varfile=s", "buildver=s", "refseq_dir=s", "refseq_release=s", "cosmic_dir=s", "cosmic_release=s", "annovar_release=s", "outdir=s", "noncoding=s", "coding=s");
# Every option is interpolated into a path or a legend string below, so a
# missing one is reported here instead of surfacing as an "uninitialized" abort.
foreach my $name (qw[ varfile buildver refseq_dir refseq_release cosmic_dir cosmic_release annovar_release outdir noncoding coding ]){
    die "missing required option -${name}\n" if !defined $opts{$name};
}
$varfile = $opts{varfile};
$buildver = $opts{buildver};
$refseq_dir = $opts{refseq_dir};
$refseq_release = $opts{refseq_release};
$cosmic_dir = $opts{cosmic_dir};
$cosmic_release = $opts{cosmic_release};
$annovar_release = $opts{annovar_release};
$outdir = $opts{outdir};
$noncoding = $opts{noncoding};
$coding = $opts{coding};

# Column name -> description, printed as "#name = description" legend lines.
my %legend=(
    'unk' => 'undefined column',
    'chr' => 'chromosome identifier',
    'start' => "${buildver} 1-based start position",
    'end' => "${buildver} 1-based end position",
    'ref' => 'reference allele',
    'alt' => 'alternate allele',
    'annot' => 'ig:intergenic; pp:1kb-upstream; 5|3u:UTR; in:intronic; ss:splice; nc:ncRNA',
    'ogs' => 'official gene symbol(s)',
    'cos' => "gene listed in cosmic ${cosmic_release} release",
    'mid' => "RefSeq mRNA identifier(s) from human.protein.gpff ${refseq_release} release",
    'pid' => "RefSeq protein identifier(s) from human.protein.gpff ${refseq_release} release",
    'c.x' => 'ATG-based variant descriptor in mRNA',
    'p.x' => 'ATG-based variant descriptor in protein'
);

my $annovar_src_dir = 'inc/annovar';
my $annovar_db_dir = "db/annovar/${annovar_release}";

# The input is looked up under $outdir by the basename of the (possibly
# symlinked) -varfile path.
my $fname = readlink($varfile) || $varfile;
$fname = basename($fname);

# Run ANNOVAR without a shell and with STDERR silenced, then check its exit
# status. (The previous backtick form only died when the command printed
# something on STDOUT and never looked at the exit status.)
{
    open(my $saved_stderr, '>&', \*STDERR) or die "cannot duplicate STDERR: $!\n";
    open(STDERR, '>', '/dev/null') or die "cannot silence STDERR: $!\n";
    my $status=system("${annovar_src_dir}/annotate_variation.pl", '-buildver', $buildver, "${outdir}/${fname}", $annovar_db_dir);
    open(STDERR, '>&', $saved_stderr) or die "cannot restore STDERR: $!\n";
    close $saved_stderr;
    die "annotate_variation.pl failed (system returned ${status})\n" if $status!=0;
}

# %mid: first whitespace-delimited column -> second column of the mid2pid
# table; used below to translate the identifiers of the 'mid' column into the
# 'pid' column.
my $mid2pid_path="${refseq_dir}/mid2pid_${refseq_release}.txt";
open(my $mid_fh, '<', $mid2pid_path) or die "cannot read ${mid2pid_path}: $!\n";
while(<$mid_fh>){
    next if /^#/;
    next if !/^(\S+)\s+(\S+)/;    # skip malformed lines instead of reusing stale captures
    $mid{$1}=$2;
}
close $mid_fh;

# %ogs: set of lines (gene symbols) listed in the COSMIC gene file.
my $cosmic_path="${cosmic_dir}/${buildver}_cosmic_ogs_${cosmic_release}.txt";
open(my $cos_fh, '<', $cosmic_path) or die "cannot read ${cosmic_path}: $!\n";
while(my $symbol=<$cos_fh>){
    chomp $symbol;    # a final line without newline is now kept as well
    $ogs{$symbol}++;
}
close $cos_fh;

# Recover the column names from the leading "#name = description" lines of the
# input. Scanning stops at the first line that is not of that form.
my $input_path="${outdir}/${fname}";
open(my $var_fh, '<', $input_path) or die "cannot read ${input_path}: $!\n";
my $stop_line;
while(my $line=<$var_fh>){
    if($line!~/^#/ or $line!~/=/){
        $stop_line=$line;
        last;
    }
    chomp $line;
    next if $line!~/^#(\S+)\s{1}=\s{1}(.+)/;
    push @header, $1;
    $legend{$1}=$2;
}
if(!scalar(@header)){
    # No legend: assume the five leading columns and label the rest 'unk'.
    # The column count is taken from the line after the one that stopped the
    # scan, falling back to that line itself when the file has no further line.
    @header=('chr', 'start', 'end', 'ref', 'alt');
    my $probe=readline($var_fh);
    $probe=$stop_line if !defined $probe;
    if(defined $probe){
        my @cols=split /\t/, $probe;
        my $extra=$#cols-4;
        push @header, ('unk')x$extra if $extra>0;
    }
}
close $var_fh;

# ---- non-coding output (.nc) ----
push @header, ('annot', 'ogs', 'cos');
my $nc_path="${outdir}/${fname}.nc";
open(my $nc_fh, '>', $nc_path) or die "cannot write ${nc_path}: $!\n";
print $nc_fh "#", join(' = ', $_, $legend{$_}), "\n" foreach @header;
print $nc_fh "#", join("\t", @header), "\n";
my $vf_path="${outdir}/${fname}.variant_function";
open(my $vf_fh, '<', $vf_path) or die "cannot read ${vf_path}: $!\n";
while(<$vf_fh>){
    next if /exonic/;
    # 'downstream' alone becomes intergenic; as a secondary label it is dropped.
    s/^downstream/ig/;
    s/;downstream//;
    s/,/:/g;
    # Map the region names to the two-letter codes described in $legend{annot}.
    s/(UTR(3|5))|(upstream)|(intronic)|(splicing)|(ncRNA)|(intergenic)/$1?"${2}u":$3?'pp':$4?'in':$5?'ss':$6?'nc':'ig'/eg;
    chomp;
    @buffer=split /\t/, $_;
    $buffer[1]='na' if $buffer[0] eq 'ig';
    # With a single annotation code, keep only the first ';'-separated gene entry.
    $buffer[1]=~s/([^;]+);(?:\S+)$/$1/ if $buffer[0]!~/;/;
    # Output order: original columns, then annot, ogs, cos.
    print $nc_fh join("\t", @buffer[2..$#buffer, 0..1], ($buffer[1] eq 'na')?'na':(exists $ogs{$buffer[1]})?'TRUE':'FALSE'), "\n";
}
close $vf_fh;
close $nc_fh or die "cannot finish writing ${nc_path}: $!\n";

# ---- coding output (.cds) ----
$legend{annot}='fd:frameshift deletion; fi:frameshift insertion; nd:nonframeshift deletion; ni:nonframeshift insertion; bs:block substitution; ss:synonymous SNV; ns:nonsynonymous SNV; sg:stopgain; sl:stoploss; na:unknown';
push @header, ('mid', 'pid', 'c.x', 'p.x');

my $evf_path="${outdir}/${fname}.exonic_variant_function";
open(my $evf_fh, '<', $evf_path) or die "cannot read ${evf_path}: $!\n";
my $cds_path="${outdir}/${fname}.cds";
open(my $cds_fh, '>', $cds_path) or die "cannot write ${cds_path}: $!\n";
print $cds_fh "#", join(' = ', $_, $legend{$_}), "\n" foreach @header;
print $cds_fh "#", join("\t", @header), "\n";
while(<$evf_fh>){
    next if /unknown/;
    s/^\S+\s+//;    # drop the first whitespace-delimited column
    chomp;
    %buffer=();
    @{$_}=() foreach (\@ogs, \@mid, \@cno, \@pno, \@sep);
    @buffer=split /\t/, $_;
    # Map the consequence text to the two-letter codes of $legend{annot}.
    $buffer[0]=~s/(nonf\w+\s{1}(d|i|s)\w+)|(\w+\s{1}(d|i)\w+)|(stop(\w){1}.+)|(^(n|s).+)|(.+)/$1?(($2 eq 's')?'b':'n').$2:$3?"f$4":$5?"s$6":$7?"${8}s":'na'/eg;
    # $buffer[1] is a comma-separated list of ':'-separated records. After the
    # third field is removed, the remaining fields are read as
    # gene : transcript : c. descriptor : p. descriptor ('na' when empty).
    foreach (split /,/, $buffer[1]){
        @_=split /:/, $_;
        splice(@_, 2, 1);
        $_=shift(@_) || 'na' foreach ($ogs, $mid, $cno, $pno);
        $buffer{ogs}->{$ogs}->{$cno}->{$mid}++;
        $buffer{ono}->{$cno}=$pno;
    }
    # Per gene: '|'-joined c. descriptors, the matching p. descriptors in the
    # same order, and per descriptor the ':'-joined transcripts carrying it.
    $cos=0;
    foreach $ogs (@ogs=keys %{$buffer{ogs}}){
        push @cno, join('|', (@_=keys %{$buffer{ogs}->{$ogs}}));
        unshift @pno, $buffer{ono}->{$_} foreach reverse @_;
        $pno=join('|', @pno[0..$#_]);
        splice(@pno, 0, $#_+1);
        push @pno, $pno;
        foreach $cno (@_){
            push @mid, join(':', keys %{$buffer{ogs}->{$ogs}->{$cno}});
        }
        $cos++ if exists $ogs{$ogs};
    }
    $mid=join('|', @mid);
    $cno=join(';', @cno);
    if($#ogs!=0){
        # Several genes: $mid was joined with '|' only, so turn into ';' those
        # '|' whose rank matches the rank of a ';' among the separators of $cno.
        (my $sep=$cno)=~s/[^;\|:]+//g;
        @_=@{sepind($mid, '|')}[@{sepind($sep, ';')}];
        substr($mid, $_, 1)=';' foreach @_;
    }
    # Same layout as $mid with every identifier translated through %mid.
    ($pid=$mid)=~s/([^;\|:]+)/$mid{$1} || 'na'/eg;
    # Move the annot code behind the original columns, append the new columns,
    # then drop the raw detail column that is now first.
    push @buffer, shift @buffer, join(';', @ogs), ($cos!=0)?'TRUE':'FALSE', $mid, $pid, $cno, join(';', @pno);
    shift @buffer;
    print $cds_fh join("\t", @buffer), "\n";
}
close $evf_fh;
close $cds_fh or die "cannot finish writing ${cds_path}: $!\n";

# Replace the two output paths by symlinks and remove ANNOVAR's intermediate
# files (<fname>*variant_function and <fname>*invalid*), without going through
# a shell.
unlink $noncoding, $coding;
remove_matching($outdir, $fname, qr/(?:variant_function\z|invalid)/);
symlink($nc_path, $noncoding) or die "cannot link ${noncoding} to ${nc_path}: $!\n";
symlink($cds_path, $coding) or die "cannot link ${coding} to ${cds_path}: $!\n";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gfapts/gfap_r1.0_allvar_genomic_annotater.xml Fri Jun 29 10:20:55 2012 -0400 @@ -0,0 +1,123 @@ +<tool id="gfap_r1.0_allvar_genomic_annotater" name="Map to genomic features"> + <description>Annotate variants with ANNOVAR</description> + <command interpreter="perl">gfap_r1.0_allvar_genomic_annotater.pl -varfile=$varfile -buildver=$buildver -refseq_dir=db/refseq -refseq_release=$refseq_release -cosmic_dir=db/cosmic -cosmic_release=$cosmic_release -annovar_release=$annovar_release -outdir=$__new_file_path__/gfap -noncoding=$noncoding -coding=$coding</command> + <inputs> + <param name="varfile" format="txt" type="data" label="Input VAR or DBI file" /> + <param name="buildver" type="select" label="Human reference genome assembly"> + <option value="hg19">GRCh37 ie. hg19</option> + </param> + <param name="refseq_release" type="select" label="human.protein.gpff release"> + <option value="r16012012">Jan 16, 2012</option> + </param> + <param name="cosmic_release" type="select" label="COSMIC db release"> + <option value="v56">v56</option> + </param> + <param name="annovar_release" type="select" label="ANNOVAR db release"> + <option value="jan2012">Jan 2012</option> + </param> + </inputs> + <outputs> + <data format="txt" name="noncoding" label="${varfile.name}.nc" /> + <data format="txt" name="coding" label="${varfile.name}.cds" /> + </outputs> + <help> +.. class:: infomark + +**What it does** + +Annotate a VAR- or DBI-file with **genomic features** and discriminate between **coding** and **non-coding** regions. + +- This VAR- or DBI-file has to be generated by either the **SAMVCF_data_parser** or the **Known_variants_finder** gfap utility. +- The core annotation procedure is mainly performed by the third-party program **annotate_variation.pl** from the ANNOVAR software tools. + +.. 
class:: infomark + +**Third-party resources** + +- RefSeq: http://www.ncbi.nlm.nih.gov/RefSeq +- COSMIC: http://www.sanger.ac.uk/genetics/CGP/cosmic +- ANNOVAR: http://www.openbioinformatics.org/annovar + +---- + +**Input .dbi file**:: + + #chr start end ref alt NRF NRR NAF NAR DP AD AF QC P.str P.ref P.alt VCF.FILTER DPT.FILTER VAR.FILTER AF_ALL AF_AFR AF_AMR AF_ASN AF_EUR AF_COS cid rs dbsnp + chr1 14907 14907 A G 4 0 0 3 7 3 0.4290 9 3.33e-01 4.17e-02 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 14930 14930 A G 4 2 0 5 11 5 0.4550 37 1.83e-01 2.29e-01 2.08e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 68896 68896 G A 0 0 3 0 3 3 1.0000 18 8.33e-02 3.33e-01 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 69270 69270 A G 0 0 31 0 31 31 1.0000 179 3.10e-10 3.33e-01 3.10e-10 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 69511 69511 A G 0 0 13 12 25 25 1.0000 222 3.33e-01 3.33e-01 3.33e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 69897 69897 T C 1 0 0 3 4 3 0.7500 14 2.08e-01 3.33e-01 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 129285 129285 G A 0 4 0 4 8 4 0.5000 56 2.60e-03 4.17e-02 4.17e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 567697 567697 G A 0 0 0 2 2 2 1.0000 30 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 569803 569803 G A 0 0 4 0 4 4 1.0000 50 4.17e-02 3.33e-01 4.17e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 808631 808631 G A 0 0 7 1 8 8 1.0000 142 2.34e-02 3.33e-01 2.34e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 808922 808922 G A 0 0 15 26 41 41 1.0000 222 3.91e-02 3.33e-01 3.91e-02 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na 
na na + chr1 808928 808928 C T 0 0 14 31 45 45 1.0000 222 5.36e-03 3.33e-01 5.36e-03 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 816725 816725 A G 0 0 2 0 2 2 1.0000 22 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 821030 821030 G T 0 0 2 0 2 2 1.0000 36 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 821143 821143 G T 0 0 0 2 2 2 1.0000 8 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + + +---- + +**Output .nc file [non-coding]**:: + + #From [chr] to [dbsnp] = DBI file header + #annot = ig:intergenic; pp:1kb-upstream; 5|3u:UTR; in:intronic; ss:splice; nc:ncRNA + #ogs = official gene symbol(s) + #cos = gene listed in cosmic v56 release + #chr start end ref alt NRF NRR NAF NAR DP AD AF QC P.str P.ref P.alt VCF.FILTER DPT.FILTER VAR.FILTER AF_ALL AF_AFR AF_AMR AF_ASN AF_EUR AF_COS cid rs dbsnp annot ogs cos + chr1 14907 14907 A G 4 0 0 3 7 3 0.4290 9 3.33e-01 4.17e-02 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na nc WASH7P FALSE + chr1 14930 14930 A G 4 2 0 5 11 5 0.4550 37 1.83e-01 2.29e-01 2.08e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na nc WASH7P FALSE + chr1 68896 68896 G A 0 0 3 0 3 3 1.0000 18 8.33e-02 3.33e-01 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na pp OR4F5 TRUE + chr1 129285 129285 G A 0 4 0 4 8 4 0.5000 56 2.60e-03 4.17e-02 4.17e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ig na na + chr1 567697 567697 G A 0 0 0 2 2 2 1.0000 30 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ig na na + chr1 569803 569803 G A 0 0 4 0 4 4 1.0000 50 4.17e-02 3.33e-01 4.17e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ig na na + chr1 808631 808631 G A 0 0 7 1 8 
8 1.0000 142 2.34e-02 3.33e-01 2.34e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na nc FAM41C FALSE + chr1 808922 808922 G A 0 0 15 26 41 41 1.0000 222 3.91e-02 3.33e-01 3.91e-02 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na nc FAM41C FALSE + chr1 808928 808928 C T 0 0 14 31 45 45 1.0000 222 5.36e-03 3.33e-01 5.36e-03 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na nc FAM41C FALSE + chr1 816725 816725 A G 0 0 2 0 2 2 1.0000 22 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ig na na + chr1 821030 821030 G T 0 0 2 0 2 2 1.0000 36 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ig na na + chr1 821143 821143 G T 0 0 0 2 2 2 1.0000 8 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ig na na + chr1 846489 846489 T C 0 0 1 1 2 2 1.0000 16 3.33e-01 3.33e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ig na na + chr1 866319 866319 G A 0 0 2 1 3 3 1.0000 31 3.33e-01 3.33e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na in SAMD11 TRUE + chr1 870903 870903 T C 0 0 3 0 3 3 1.0000 65 8.33e-02 3.33e-01 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na in SAMD11 TRUE + +---- + +**Output .cds file [coding]**:: + + #From [chr] to [dbsnp] = DBI file header + #annot = fd:frameshift deletion; fi:frameshift insertion; nd:nonframeshift deletion; ni:nonframeshift insertion; bs:block substitution; ss:synonymous SNV; ns:nonsynonymous SNV; sg:stopgain; sl:stoploss; na:unknown + #ogs = official gene symbol(s) + #cos = gene listed in cosmic v56 release + #mid = RefSeq mRNA identifier(s) from human.protein.gpff r16012012 release + #pid = RefSeq protein identifier(s) from human.protein.gpff r16012012 release + #c.x = ATG-based variant descriptor in mRNA + #p.x = 
ATG-based variant descriptor in protein + #chr start end ref alt NRF NRR NAF NAR DP AD AF QC P.str P.ref P.alt VCF.FILTER DPT.FILTER VAR.FILTER AF_ALL AF_AFR AF_AMR AF_ASN AF_EUR AF_COS cid rs dbsnp annot ogs cos mid pid c.x p.x + chr1 69270 69270 A G 0 0 31 0 31 31 1.0000 179 3.10e-10 3.33e-01 3.10e-10 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss OR4F5 TRUE NM_001005484 NP_001005484 c.A180G p.S60S + chr1 69511 69511 A G 0 0 13 12 25 25 1.0000 222 3.33e-01 3.33e-01 3.33e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns OR4F5 TRUE NM_001005484 NP_001005484 c.A421G p.T141A + chr1 69897 69897 T C 1 0 0 3 4 3 0.7500 14 2.08e-01 3.33e-01 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss OR4F5 TRUE NM_001005484 NP_001005484 c.T807C p.S269S + chr1 881627 881627 G A 0 0 3 8 11 11 1.0000 88 7.55e-02 3.33e-01 7.55e-02 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss NOC2L TRUE NM_015658 NP_056473 c.C1843T p.L615L + chr1 887801 887801 A G 0 0 6 0 6 6 1.0000 56 1.04e-02 3.33e-01 1.04e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss NOC2L TRUE NM_015658 NP_056473 c.T1182C p.T394T + chr1 888639 888639 T C 0 0 4 9 13 13 1.0000 142 8.89e-02 3.33e-01 8.89e-02 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss NOC2L TRUE NM_015658 NP_056473 c.A918G p.E306E + chr1 888659 888659 T C 0 0 3 9 12 12 1.0000 146 4.87e-02 3.33e-01 4.87e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns NOC2L TRUE NM_015658 NP_056473 c.A898G p.I300V + chr1 897325 897325 G C 0 0 9 11 20 20 1.0000 188 2.75e-01 3.33e-01 2.75e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss KLHL17 TRUE NM_198317 NP_938073 c.G609C p.A203A + chr1 909238 909238 G C 0 0 3 5 8 8 1.0000 130 2.42e-01 3.33e-01 2.42e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns PLEKHN1 FALSE 
NM_032129|NM_001160184 NP_115505|NP_001153656 c.G1460C|c.G1355C p.R487P|p.R452P + chr1 909242 909242 A G 2 4 1 2 9 3 0.3330 15 1.69e-01 2.29e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss PLEKHN1 FALSE NM_032129|NM_001160184 NP_115505|NP_001153656 c.A1464G|c.A1359G p.G488G|p.G453G + chr1 935222 935222 C A 0 0 0 2 2 2 1.0000 10 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns HES4 FALSE NM_001142467 NP_001135939 c.G132T p.R44S + chr1 949654 949654 A G 0 0 11 9 20 20 1.0000 222 2.75e-01 3.33e-01 2.75e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss ISG15 TRUE NM_005101 NP_005092 c.A294G p.V98V + chr1 981931 981931 A G 0 0 1 1 2 2 1.0000 36 3.33e-01 3.33e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss AGRN TRUE NM_198576 NP_940978 c.A3066G p.S1022S + chr1 982994 982994 T C 0 0 13 16 29 29 1.0000 146 2.37e-01 3.33e-01 2.37e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss AGRN TRUE NM_198576 NP_940978 c.T3558C p.F1186F + chr1 1021346 1021346 A G 0 2 2 3 7 5 0.7140 66 1.51e-01 1.67e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss C1orf159 FALSE NM_017891 NP_060361 c.T357C p.I119I + +---- + +.. class:: infomark + +**Feedback**: romain.daveau@curie.fr + </help> +</tool> \ No newline at end of file
#!/usr/bin/perl
#
# gfap_r1.0_cdsvar_functional_annotater.pl
#
# Keeps the sg/sl/ns rows of a CDS file, looks their chromosome/position up in
# <dbdir>/<buildver>_dbnsfp_<release>.txt and appends six columns
# (AAS, FIS, OCC, FIS.max, OCC.max, PRED). The result is written to
# <outdir>/<fname>.dbnsfp and -outfile is replaced by a symlink to it.
#
# All six command-line options are required.

use strict;
use warnings FATAL => qw[ numeric uninitialized ];
use List::Util qw[ sum min max ];
use List::MoreUtils qw[ first_index ];
use File::Basename;
use Getopt::Long;

my($varfile, $buildver, $dbdir, $release, $outdir, $outfile, $max, $i, $k);
my(@buffer, @legend, @header, @k, @score, @tools, @Temp, %buffer, %AAS, %dbscore, %dbtools, %opts);

GetOptions(\%opts, "varfile=s", "buildver=s", "dbdir=s", "release=s", "outdir=s", "outfile=s");
# Every option ends up in a path or a legend string, so report a missing one
# here instead of aborting later on an "uninitialized" warning.
foreach my $name (qw[ varfile buildver dbdir release outdir outfile ]){
    die "missing required option -${name}\n" if !defined $opts{$name};
}
$varfile = $opts{varfile};
$buildver = $opts{buildver};
$dbdir = $opts{dbdir};
$release = $opts{release};
$outdir = $opts{outdir};
$outfile = $opts{outfile};

my $fname = readlink($varfile) || $varfile;
my $dbfile="${dbdir}/${buildver}_dbnsfp_${release}.txt";
$fname = basename($fname);

my $tmp_path="${outdir}/${fname}.Temp";
my $out_path="${outdir}/${fname}.dbnsfp";

# Pass 1: collect the "#name = description" legend lines and copy the retained
# data rows to a temporary file, remembering their chr_pos keys in input order.
open(my $var_fh, '<', $varfile) or die "cannot read ${varfile}: $!\n";
open(my $tmp_out, '>', $tmp_path) or die "cannot write ${tmp_path}: $!\n";
while(<$var_fh>){
    if(/^#(.+=.+)/){
        push @legend, $1;
        next;
    }
    # Keep rows carrying the whole word sg, sl or ns. (The previous pattern
    # /\b(?:s(?:g|l))|ns\b/ grouped the alternation wrongly and also accepted
    # any word starting with sg/sl or ending with ns.)
    next if $_!~/\b(?:s(?:g|l)|ns)\b/;
    next if /\s+-\s+/;    # skip rows having a lone '-' column
    next if !/^(?:chr)*(\S+)\s+(\S+)/;
    @{$buffer{($k=join('_', $1, $2))}->{dbnsfp}}=();
    push @k, $k;
    print $tmp_out $_;
}
close $var_fh;
close $tmp_out or die "cannot finish writing ${tmp_path}: $!\n";

# Restrict the 'annot' legend to the sg/sl/ns entries and describe the six
# new columns. The legend is left as-is when it has no 'annot' line (the
# previous code would then have overwritten the last legend line).
$i=first_index{ /^annot/ } @legend;
if($i>=0){
    @_=$legend[$i]=~/((?:s(?:g|l)|ns):[^;|\s]+)/g;
    $legend[$i]=join(' = ', 'annot', join('; ', @_));
}
push @legend, (
    'AAS = Amino Acid Substitution(s)',
    "FIS = Functional Impact Score(s) from dbnsfp ${release} release",
    "OCC = number of tools from which FIS was/were calculated",
    "FIS.max = highest score among FIS",
    "OCC.max = number of tools from which FIS.max was calculated",
    "PRED = qualitative ternary classifier ie. [L]ow; [M]edium; [H]igh"
);
# Column names are the first word of each legend line.
foreach (@legend){
    next if !/^(\S+)/;
    push @header, $1;
}

# Pass 2: for every database line whose first two columns match a retained
# variant, store the columns after the fourth one, joined with ':'.
open(my $db_fh, '<', $dbfile) or die "cannot read ${dbfile}: $!\n";
while(<$db_fh>){
    next if /^#/;
    next if !/^(\S+)\s+(\S+)(?:\s+\S+){2}\s+(.+)/;
    next if !exists $buffer{($k=join('_', $1, $2))};
    push @{$buffer{$k}->{dbnsfp}}, join(':', split /\t/, $3);
}
close $db_fh;

# Pass 3: re-read the retained rows (same order as @k) and append the columns.
open(my $tmp_in, '<', $tmp_path) or die "cannot read ${tmp_path}: $!\n";
open(my $out_fh, '>', $out_path) or die "cannot write ${out_path}: $!\n";
print $out_fh "#", $_, "\n" foreach @legend;
print $out_fh "#", join("\t", @header), "\n";
foreach $k (@k){
    $i=0;
    $_=readline($tmp_in);
    chomp;
    @buffer=split /\s+/, $_;
    %{$_}=() foreach (\%AAS, \%dbscore, \%dbtools);
    # %AAS: distinct <ref letter><alt letter> pairs taken from the p.<X><pos><Y>
    # descriptors of the last column.
    foreach (split(/[;\|]/, $buffer[-1])){
        $AAS{$1.$2}++ if /^p\.(\w{1})\d+(\w{1})$/;
    }
    # The three new columns (AAS, scores, tools) are first put in front of the
    # row and rotated to its end afterwards.
    if($#{$buffer{$k}->{dbnsfp}}<0){
        # No database record at this position.
        unshift @buffer, (%AAS)?(join(':', keys %AAS)):('na'), (join(':', ('na') x max(scalar(keys %AAS), 1))) x 2;
    }elsif(%AAS){
        foreach (@{$buffer{$k}->{dbnsfp}}){
            # Record layout as consumed here: first field = key prefix, second
            # field = ';'-separated key suffixes, last field = ';'-separated
            # tool counts, second-to-last = ';'-separated scores.
            # NOTE(review): presumably ref residue / alt residues / score /
            # number of tools -- confirm against the dbNSFP flat file.
            @Temp=split /:/, $_;
            my $prefix=shift @Temp;
            @{$_}=split(/;/, pop @Temp) foreach (\@tools, \@score);
            foreach (split /;/, shift @Temp){
                $dbscore{$prefix.$_}=shift @score;
                $dbtools{$prefix.$_}=shift @tools;
            }
        }
        foreach (keys %AAS){
            push @score, $dbscore{$_} || 'na';
            push @tools, $dbtools{$_} || 'na';
        }
        unshift @buffer, join(':', keys %AAS), join(':', @score), join(':', @tools);
    }else{
        # Database records exist but no p. descriptor could be parsed. Nothing
        # was added in this case before, so the rotation below moved three
        # genuine columns to the end of the row; emit 'na' placeholders instead.
        unshift @buffer, ('na')x3;
    }
    push @buffer, shift @buffer for 1..3;
    # Re-read tools (last column) and scores (second to last), dropping 'na'.
    @{$_}=grep{ !/na/ } split(/:/, $buffer[--$i]) foreach (\@tools, \@score);
    $max=max(@score) || 'na';
    # FIS.max / OCC.max / PRED. The index is now that of the highest score;
    # first_index{ $max } always returned 0, i.e. the first score.
    push @buffer, (($max ne 'na')?($score[($i=first_index{ $_==$max } @score)], $tools[$i], ($max<.3)?'L':($max<.7)?'M':'H'):(('na')x3));
    print $out_fh join("\t", @buffer), "\n";
}
close $tmp_in;
close $out_fh or die "cannot finish writing ${out_path}: $!\n";

# Remove <fname>*Temp from $outdir and replace -outfile by a symlink to the
# result, without going through a shell.
opendir(my $dh, $outdir) or die "cannot read directory ${outdir}: $!\n";
foreach my $entry (readdir $dh){
    next if index($entry, $fname)!=0;
    next if substr($entry, length($fname))!~/Temp\z/;
    unlink "${outdir}/${entry}";
}
closedir $dh;
unlink $outfile;
symlink($out_path, $outfile) or die "cannot link ${outfile} to ${out_path}: $!\n";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gfapts/gfap_r1.0_cdsvar_functional_annotater.xml Fri Jun 29 10:20:55 2012 -0400 @@ -0,0 +1,89 @@ +<tool id="gfap_r1.0_cdsvar_functional_annotater" name="Map to dbNSFP features"> + <description>Annotate ms-/ns-SNVs with dbNSFP scores</description> + <command interpreter="perl">gfap_r1.0_cdsvar_functional_annotater.pl -varfile=$varfile -buildver=$buildver -dbdir=db/dbnsfp -release=$release -outdir=$__new_file_path__/gfap -outfile=$outfile</command> + <inputs> + <param name="varfile" format="txt" type="data" label="Input CDS file" /> + <param name="buildver" type="select" label="Human reference genome assembly"> + <option value="hg19">GRCh37 ie. hg19</option> + </param> + <param name="release" type="select" label="dbNSFP data release"> + <option value="light1.3">v1.3</option> + <option value="light1.2">v1.2</option> + <option value="light1.1">v1.1</option> + <option value="light">v1.0</option> + </param> + </inputs> + <outputs> + <data format="txt" name="outfile" label="${varfile.name}.dbnsfp" /> + </outputs> + <help> +.. class:: infomark + +**What it does** + +Annotate **mis-/non-sense** SNVs with **functional impact scores** of the **dbNSFP** database. + +Input CDS-file has to be generated by the **Map_to_genomic_features** gfap utility. + +.. 
class:: infomark + +**Third-party resources** + +dbNSFP: https://sites.google.com/site/jpopgen/dbNSFP + +---- + +**Input .cds file**:: + + #chr start end ref alt NRF NRR NAF NAR DP AD AF QC P.str P.ref P.alt VCF.FILTER DPT.FILTER VAR.FILTER AF_ALL AF_AFR AF_AMR AF_ASN AF_EUR AF_COS cid rs dbsnp annot ogs cos mid pid c.x p.x + chr1 69270 69270 A G 0 0 31 0 31 31 1.0000 179 3.10e-10 3.33e-01 3.10e-10 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss OR4F5 TRUE NM_001005484 NP_001005484 c.A180G p.S60S + chr1 69511 69511 A G 0 0 13 12 25 25 1.0000 222 3.33e-01 3.33e-01 3.33e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns OR4F5 TRUE NM_001005484 NP_001005484 c.A421G p.T141A + chr1 69897 69897 T C 1 0 0 3 4 3 0.7500 14 2.08e-01 3.33e-01 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss OR4F5 TRUE NM_001005484 NP_001005484 c.T807C p.S269S + chr1 881627 881627 G A 0 0 3 8 11 11 1.0000 88 7.55e-02 3.33e-01 7.55e-02 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss NOC2L TRUE NM_015658 NP_056473 c.C1843T p.L615L + chr1 887801 887801 A G 0 0 6 0 6 6 1.0000 56 1.04e-02 3.33e-01 1.04e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss NOC2L TRUE NM_015658 NP_056473 c.T1182C p.T394T + chr1 888639 888639 T C 0 0 4 9 13 13 1.0000 142 8.89e-02 3.33e-01 8.89e-02 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss NOC2L TRUE NM_015658 NP_056473 c.A918G p.E306E + chr1 888659 888659 T C 0 0 3 9 12 12 1.0000 146 4.87e-02 3.33e-01 4.87e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns NOC2L TRUE NM_015658 NP_056473 c.A898G p.I300V + chr1 897325 897325 G C 0 0 9 11 20 20 1.0000 188 2.75e-01 3.33e-01 2.75e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss KLHL17 TRUE NM_198317 NP_938073 c.G609C p.A203A + chr1 909238 909238 G C 0 0 3 5 8 8 1.0000 130 2.42e-01 3.33e-01 
2.42e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns PLEKHN1 FALSE NM_032129|NM_001160184 NP_115505|NP_001153656 c.G1460C|c.G1355C p.R487P|p.R452P + chr1 909242 909242 A G 2 4 1 2 9 3 0.3330 15 1.69e-01 2.29e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss PLEKHN1 FALSE NM_032129|NM_001160184 NP_115505|NP_001153656 c.A1464G|c.A1359G p.G488G|p.G453G + chr1 935222 935222 C A 0 0 0 2 2 2 1.0000 10 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns HES4 FALSE NM_001142467 NP_001135939 c.G132T p.R44S + chr1 949654 949654 A G 0 0 11 9 20 20 1.0000 222 2.75e-01 3.33e-01 2.75e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss ISG15 TRUE NM_005101 NP_005092 c.A294G p.V98V + chr1 981931 981931 A G 0 0 1 1 2 2 1.0000 36 3.33e-01 3.33e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss AGRN TRUE NM_198576 NP_940978 c.A3066G p.S1022S + chr1 982994 982994 T C 0 0 13 16 29 29 1.0000 146 2.37e-01 3.33e-01 2.37e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss AGRN TRUE NM_198576 NP_940978 c.T3558C p.F1186F + chr1 1021346 1021346 A G 0 2 2 3 7 5 0.7140 66 1.51e-01 1.67e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ss C1orf159 FALSE NM_017891 NP_060361 c.T357C p.I119I + +---- + +**Output .dbnsfp file**:: + + #From [chr] to [p.x] = CDS file header + #AAS = Amino Acid Substitution(s) + #FIS = Functional Impact Score(s) from dbnsfp light1.3 release + #OCC = number of tools from which FIS was/were calculated + #FIS.max = highest score among FIS + #OCC.max = number of tools from which FIS.max was calculated + #PRED = qualitative ternary classifier ie. 
[L]ow; [M]edium; [H]igh + #chr start end ref alt NRF NRR NAF NAR DP AD AF QC P.str P.ref P.alt VCF.FILTER DPT.FILTER VAR.FILTER AF_ALL AF_AFR AF_AMR AF_ASN AF_EUR AF_COS cid rs dbsnp annot ogs cos mid pid c.x p.x AAS FIS OCC FIS.max OCC.max PRED + chr1 69511 69511 A G 0 0 13 12 25 25 1.0000 222 3.33e-01 3.33e-01 3.33e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns OR4F5 TRUE NM_001005484 NP_001005484 c.A421G p.T141A TA 0.38 5 0.38 5 M + chr1 888659 888659 T C 0 0 3 9 12 12 1.0000 146 4.87e-02 3.33e-01 4.87e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns NOC2L TRUE NM_015658 NP_056473 c.A898G p.I300V IV 0.46 4 0.46 4 M + chr1 909238 909238 G C 0 0 3 5 8 8 1.0000 130 2.42e-01 3.33e-01 2.42e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns PLEKHN1 FALSE NM_032129|NM_001160184 NP_115505|NP_001153656 c.G1460C|c.G1355C p.R487P|p.R452P RP 0.48 5 0.48 5 M + chr1 935222 935222 C A 0 0 0 2 2 2 1.0000 10 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns HES4 FALSE NM_001142467 NP_001135939 c.G132T p.R44S RS na na na na na + chr1 1269554 1269554 T C 0 0 5 1 6 6 1.0000 81 7.29e-02 3.33e-01 7.29e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns TAS1R3 TRUE NM_152228 NP_689414 c.T2269C p.C757R CR 0.35 5 0.35 5 M + chr1 1551927 1551927 T C 0 0 1 2 3 3 1.0000 40 3.33e-01 3.33e-01 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns MIB2 TRUE NM_080875:NM_001170688:NM_001170686:NM_001170687 NP_543151:NP_001164159:NP_001164157:NP_001164158 c.T214C p.F72L FL 0.06 4 0.06 4 L + chr1 1599812 1599812 C T 5 0 13 13 31 26 0.8390 225 1.58e-01 2.08e-02 3.33e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns SLC35E2B FALSE NM_001110781 NP_001104251 c.G934A p.V312I VI 0.63 5 0.63 5 M + chr1 1650787 1650787 T C 104 22 69 10 205 79 0.3850 225 4.44e-25 2.02e-14 1.85e-12 
NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns CDK11A;CDK11B TRUE NM_024011;NM_033486|NM_033489 NP_076916;NP_277021|NP_277024 c.A335G;c.A335G|c.A233G p.H112R;p.H112R|p.H78R HR 0.32 3 0.32 3 M + chr1 1650797 1650797 A G 110 28 76 9 223 85 0.3810 157 1.46e-25 3.64e-13 8.02e-15 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns CDK11A;CDK11B TRUE NM_024011:NM_033529;NM_033489|NM_033486 NP_076916:NP_277071;NP_277024|NP_277021 c.T325C;c.T223C|c.T325C p.C109R;p.C75R|p.C109R CR 0.33 3 0.33 3 M + chr1 1650832 1650832 A G 114 34 88 14 250 102 0.4080 138 3.92e-24 8.97e-12 9.25e-15 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns CDK11A;CDK11B TRUE NM_024011:NM_033529;NM_033489:NM_033488|NM_033486:NM_033492:NM_033493 NP_076916:NP_277071;NP_277024:NP_277023|NP_277021:NP_277027:NP_277028 c.T290C;c.T188C|c.T290C p.V97A;p.V63A|p.V97A VA 0.49 4 0.49 4 M + chr1 1650845 1650845 G A 93 25 100 36 254 136 0.5350 178 1.26e-17 7.09e-11 1.24e-08 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns CDK11A;CDK11B TRUE NM_024011:NM_033529;NM_033486:NM_033492:NM_033493|NM_033489:NM_033488 NP_076916:NP_277071;NP_277021:NP_277027:NP_277028|NP_277024:NP_277023 c.C277T;c.C277T|c.C175T p.R93W;p.R93W|p.R59W RW 0.97 4 0.97 4 H + chr1 1666251 1666251 G A 0 10 8 3 21 11 0.5240 136 1.28e-01 6.51e-04 7.55e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns SLC35E2 TRUE NM_001199787:NM_182838 NP_001186716:NP_878258 c.C610T p.R204W RW 0.49 4 0.49 4 M + chr1 1686040 1686040 G T 10 2 17 3 32 20 0.6250 105 3.77e-05 1.29e-02 8.59e-04 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns NADK TRUE NM_023018:NM_001198993|NM_001198994|NM_001198995 NP_075394:NP_001185922|NP_001185923|NP_001185924 c.C786A|c.C1221A|c.C690A p.N262K|p.N407K|p.N230K NK 0.58 5 0.58 5 M + chr1 1849529 1849529 A G 3 9 6 6 24 12 0.5000 206 1.02e-01 4.87e-02 3.33e-01 NONE PASS SKIP 0.00000 
0.00000 0.00000 0.00000 0.00000 0.00000 na na na ns TMEM52 FALSE NM_178545 NP_848640 c.T422C p.M141T MT 0.50 5 0.50 5 M + chr1 1887019 1887019 A G 28 25 26 20 99 46 0.4650 150 1.41e-01 2.61e-01 1.54e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na sl KIAA1751 TRUE NM_001080484 NP_001073953 c.T2287C p.X763Q XQ na na na na na + +---- + +.. class:: infomark + +**Feedback**: romain.daveau@curie.fr + </help> +</tool> \ No newline at end of file
#!/usr/bin/perl
#
# gfap_r1.0_known_var_finder.pl
#
# Reads a VAR file, hands the variant table to ngsutil::varscan once per
# database (1000g, dbsnp, cosmic_var) and writes <outdir>/<fname>.dbi with the
# database columns appended. -outfile is then replaced by a symlink to it.

use strict;
use lib 'inc/perlmod';
use ngsutil qw[ :DEFAULT &varscan ];
use warnings FATAL => qw[ numeric uninitialized ];
use File::Basename;
use Getopt::Long;

my($varfile, $buildver, $outdir, $dir_1000g, $dir_dbsnp, $dir_cosmic, $release_1000g, $release_dbsnp, $release_cosmic, $outfile, $k, @buffer, @varlist, %opts, %varlist);

GetOptions(\%opts, "varfile=s", "buildver=s", "outdir=s", "dir_1000g=s", "dir_dbsnp=s", "dir_cosmic=s", "release_1000g=s", "release_dbsnp=s", "release_cosmic=s", "outfile=s");
$varfile = $opts{varfile};
$buildver = $opts{buildver};
$outdir = $opts{outdir};
$dir_1000g = $opts{dir_1000g};
$dir_dbsnp = $opts{dir_dbsnp};
$dir_cosmic = $opts{dir_cosmic};
$release_1000g = $opts{release_1000g};
$release_dbsnp = $opts{release_dbsnp};
$release_cosmic = $opts{release_cosmic};
$outfile = $opts{outfile};

my $fname = readlink($varfile) || $varfile;
$fname = basename($fname);

# Per-database settings: 'value' is the ':'-joined default written when a
# variant has no entry, 'header' the ':'-joined names of the added columns.
my %k=(
    '1000g' => {
        'dir' => $dir_1000g, 'release' => $release_1000g, 'value' => join(':', ('0.00000')x5), 'header' => join(':', 'AF_ALL', 'AF_AFR', 'AF_AMR', 'AF_ASN', 'AF_EUR')
    }, 'dbsnp' => {
        'dir' => $dir_dbsnp, 'release' => $release_dbsnp, 'value' => join(':', ('na')x2), 'header' => join(':', 'rs', 'dbsnp')
    }, 'cosmic_var' => {
        'dir' => $dir_cosmic, 'release' => $release_cosmic, 'value' => join(':', '0.00000', 'na'), 'header' => join(':', 'AF_COS', 'cid')
    }
);

# Column name -> description, printed as "#name = description" legend lines.
my %legend=(
    'chr' => 'chromosome identifier',
    'start' => "${buildver} 1-based start position",
    'end' => "${buildver} 1-based end position",
    'ref' => 'reference allele',
    'alt' => 'alternate allele',
    'QC' => 'Phred-scaled call quality',
    'NRF' => '#reads consistent w/ the reference allele on the F-strand',
    'NRR' => '#reads consistent w/ the reference allele on the R-strand',
    'NAF' => '#reads consistent w/ the alternate allele on the F-strand',
    'NAR' => '#reads consistent w/ the alternate allele on the R-strand',
    'DP' => 'total #reads in call ie. NRF+NRR+NAF+NAR',
    'AD' => 'total #reads consistent w/ the alternate allele ie. NAF+NAR',
    'AF' => 'alternate allele ratio ie. AD/DP',
    'VCF.FILTER' => 'FILTER field from the input vcf file',
    'DPT.FILTER' => 'check for heterogeneous depth in substituted blocks',
    'VAR.FILTER' => 'GFAP default FILTER to discriminate between TP and FP variants',
    'P.str' => 'NRF+NAF vs. NRR+NAR binomial test P-value ie. total strand bias',
    'P.ref' => 'NRF vs. NRR binomial test P-value ie. reference allele strand bias',
    'P.alt' => 'NAF vs. NAR binomial test P-value ie. alternate allele strand bias',
    'AF_ALL' => "global AF in ${release_1000g} 1000g data",
    'AF_AFR' => "AF in AFR ${release_1000g} 1000g data",
    'AF_AMR' => "AF in AMR ${release_1000g} 1000g data",
    'AF_ASN' => "AF in ASN ${release_1000g} 1000g data",
    'AF_EUR' => "AF in EUR ${release_1000g} 1000g data",
    'AF_COS' => "AF in ${release_cosmic} cosmic data",
    'rs' => "dbsnp rs identifier(s) from ${release_dbsnp} release",
    'dbsnp' => "dbsnp build version(s) from ${release_dbsnp} release",
    'cid' => "cosmic mutation identifier from ${release_cosmic} release"
);
# Names of the 19 columns built from the VAR file, in the order they are
# assembled below (before the @idx reordering).
my @header=('chr', 'start', 'end', 'ref', 'alt', 'DPT.FILTER', 'QC', 'NRF', 'NRR', 'NAF', 'NAR', 'VCF.FILTER', 'P.str', 'P.ref', 'P.alt', 'DP', 'AD', 'AF', 'VAR.FILTER');
my @k=qw[ 1000g dbsnp cosmic_var ];

# Load the variants: key = "chr:start:end" (leading "chr" stripped); ref and
# alt are stored separately, the remaining columns ':'-joined under 'cov' with
# the sixth column turned into SKIP (when 'unk') or PASS.
open(my $var_fh, '<', $varfile) or die "cannot read ${varfile}: $!\n";
while(<$var_fh>){
    chomp;
    @buffer=split /\s+/, $_;
    $buffer[0]=~s/^chr(.+)$/$1/;
    push @varlist, ($k=join(':', @buffer[0..2]));
    shift(@buffer) for 0..2;
    $varlist{$k}->{$_}=shift(@buffer) foreach qw[ ref alt ];
    $varlist{$k}->{cov}=join(':', (($buffer[0] eq 'unk')?'SKIP':'PASS'), @buffer[1..$#buffer]);
}
close $var_fh;

# varscan is imported from ngsutil (not visible here); judging from the output
# loop below it presumably stores a ':'-joined value under
# $varlist{<variant>}->{<database>} for every known variant.
# NOTE(review): %k defines no 'file' key, so the second argument is undef in
# this script; confirm against ngsutil::varscan how the database file is
# located (the 'dir' and 'release' entries are otherwise unused here).
foreach $k (@k){
    push @header, split(/:/, $k{$k}->{header});
    varscan($k, $k{$k}->{file}, \%varlist);
}

# Output column order, expressed as indices into the assembled row/header.
my @idx=(0..4,7..10,15..17,6,12..14,11,5,18..23,26..27,24..25);
my $dbi_path="${outdir}/${fname}.dbi";
open(my $out_fh, '>', $dbi_path) or die "cannot write ${dbi_path}: $!\n";
print $out_fh '#', join(' = ', $_, $legend{$_}), "\n" foreach @header[@idx];
print $out_fh '#', join("\t", @header[@idx]), "\n";
foreach $k (@varlist){
    @buffer=(split(/:/, 'chr'.$k), $varlist{$k}->{ref}, $varlist{$k}->{alt});
    # 'cov' is always set; each database falls back to its default 'value'.
    push @buffer, split(/:/, ($varlist{$k}->{$_} || $k{$_}->{value})) foreach ('cov', @k);
    print $out_fh join("\t", @buffer[@idx]), "\n";
}
close $out_fh or die "cannot finish writing ${dbi_path}: $!\n";

# Replace -outfile by a symlink to the result without going through a shell.
unlink $outfile;
symlink($dbi_path, $outfile) or die "cannot link ${outfile} to ${dbi_path}: $!\n";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gfapts/gfap_r1.0_known_var_finder.xml Fri Jun 29 10:20:55 2012 -0400 @@ -0,0 +1,98 @@ +<tool id="gfap_r1.0_known_var_finder" name="Known variants finder"> + <description>Search the GFAP database for known variants</description> + <command interpreter="perl">gfap_r1.0_known_var_finder.pl -varfile=$varfile -buildver=$buildver -outdir=$__new_file_path__/gfap -dir_1000g=db/1000g -dir_dbsnp=db/dbsnp -dir_cosmic=db/cosmic -release_1000g=$release_1000g -release_dbsnp=$release_dbsnp -release_cosmic=$release_cosmic -outfile=$outfile</command> + <inputs> + <param name="varfile" format="txt" type="data" label="Input VAR file" /> + <param name="buildver" type="select" label="Human reference genome assembly"> + <option value="hg19">GRCh37 ie. hg19</option> + </param> + <param name="release_1000g" type="select" label="1000 Genomes data release"> + <option value="phase1_20101123">phase 1 r20101123 </option> + </param> + <param name="release_dbsnp" type="select" label="dbSNP data release"> + <option value="v135">v135</option> + </param> + <param name="release_cosmic" type="select" label="COSMIC data release"> + <option value="v56">v56</option> + </param> + </inputs> + <outputs> + <data format="txt" name="outfile" label="${varfile.name}.dbi" /> + </outputs> + <help> +.. class:: infomark + +**What it does** + +Annotate a VAR-file with **1000G**, **dbSNP** and **COSMIC** data. + +- This VAR-file has to be generated by the **SAMVCF_data_parser** gfap utility. +- The variant databases mentioned above are included in the gfap archive as **built-in pre-processed flat files**. + +.. 
class:: infomark + +**Third-party resources** + +- 1000G: http://www.1000genomes.org +- dbSNP: http://www.ncbi.nlm.nih.gov/projects/SNP +- COSMIC: http://www.sanger.ac.uk/genetics/CGP/cosmic + +---- + +**Input .var file**:: + + #chr start end ref alt ann QC NRF NRR NAF NAR VCF.FILTER P.str P.ref P.alt DP AD AF VAR.FILTER + chr1 14907 14907 A G het 9 4 0 0 3 NONE 3.33e-01 4.17e-02 8.33e-02 7 3 0.4290 SKIP + chr1 14930 14930 A G het 37 4 2 0 5 NONE 1.83e-01 2.29e-01 2.08e-02 11 5 0.4550 SKIP + chr1 68896 68896 G A hom 18 0 0 3 0 NONE 8.33e-02 3.33e-01 8.33e-02 3 3 1.0000 SKIP + chr1 69270 69270 A G hom 179 0 0 31 0 NONE 3.10e-10 3.33e-01 3.10e-10 31 31 1.0000 SKIP + chr1 69511 69511 A G hom 222 0 0 13 12 NONE 3.33e-01 3.33e-01 3.33e-01 25 25 1.0000 PASS + chr1 69897 69897 T C het 14 1 0 0 3 NONE 2.08e-01 3.33e-01 8.33e-02 4 3 0.7500 SKIP + chr1 129285 129285 G A het 56 0 4 0 4 NONE 2.60e-03 4.17e-02 4.17e-02 8 4 0.5000 SKIP + chr1 567697 567697 G A hom 30 0 0 0 2 NONE 1.67e-01 3.33e-01 1.67e-01 2 2 1.0000 SKIP + chr1 569803 569803 G A hom 50 0 0 4 0 NONE 4.17e-02 3.33e-01 4.17e-02 4 4 1.0000 SKIP + chr1 808631 808631 G A hom 142 0 0 7 1 NONE 2.34e-02 3.33e-01 2.34e-02 8 8 1.0000 SKIP + chr1 808922 808922 G A hom 222 0 0 15 26 NONE 3.91e-02 3.33e-01 3.91e-02 41 41 1.0000 PASS + chr1 808928 808928 C T hom 222 0 0 14 31 NONE 5.36e-03 3.33e-01 5.36e-03 45 45 1.0000 PASS + chr1 816725 816725 A G hom 22 0 0 2 0 NONE 1.67e-01 3.33e-01 1.67e-01 2 2 1.0000 SKIP + chr1 821030 821030 G T hom 36 0 0 2 0 NONE 1.67e-01 3.33e-01 1.67e-01 2 2 1.0000 SKIP + chr1 821143 821143 G T hom 8 0 0 0 2 NONE 1.67e-01 3.33e-01 1.67e-01 2 2 1.0000 SKIP + +---- + +**Output .dbi file**:: + + #From [chr] to [VAR.FILTER] = VAR file header; DPT.FILTER = check for heterogeneous depth in substituted blocks + #AF_ALL = global AF in phase1_20101123 1000g data + #AF_AFR = AF in AFR phase1_20101123 1000g data + #AF_AMR = AF in AMR phase1_20101123 1000g data + #AF_ASN = AF in ASN phase1_20101123 1000g data 
+ #AF_EUR = AF in EUR phase1_20101123 1000g data + #AF_COS = AF in v56 cosmic data + #cid = cosmic mutation identifier from v56 release + #rs = dbsnp rs identifier(s) from v135 release + #dbsnp = dbsnp build version(s) from v135 release + #chr start end ref alt NRF NRR NAF NAR DP AD AF QC P.str P.ref P.alt VCF.FILTER DPT.FILTER VAR.FILTER AF_ALL AF_AFR AF_AMR AF_ASN AF_EUR AF_COS cid rs dbsnp + chr1 14907 14907 A G 4 0 0 3 7 3 0.4290 9 3.33e-01 4.17e-02 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 14930 14930 A G 4 2 0 5 11 5 0.4550 37 1.83e-01 2.29e-01 2.08e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 68896 68896 G A 0 0 3 0 3 3 1.0000 18 8.33e-02 3.33e-01 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 69270 69270 A G 0 0 31 0 31 31 1.0000 179 3.10e-10 3.33e-01 3.10e-10 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 69511 69511 A G 0 0 13 12 25 25 1.0000 222 3.33e-01 3.33e-01 3.33e-01 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 69897 69897 T C 1 0 0 3 4 3 0.7500 14 2.08e-01 3.33e-01 8.33e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 129285 129285 G A 0 4 0 4 8 4 0.5000 56 2.60e-03 4.17e-02 4.17e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 567697 567697 G A 0 0 0 2 2 2 1.0000 30 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 569803 569803 G A 0 0 4 0 4 4 1.0000 50 4.17e-02 3.33e-01 4.17e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 808631 808631 G A 0 0 7 1 8 8 1.0000 142 2.34e-02 3.33e-01 2.34e-02 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 808922 808922 G A 0 0 15 26 41 41 1.0000 222 3.91e-02 3.33e-01 3.91e-02 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na 
na na + chr1 808928 808928 C T 0 0 14 31 45 45 1.0000 222 5.36e-03 3.33e-01 5.36e-03 NONE PASS PASS 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 816725 816725 A G 0 0 2 0 2 2 1.0000 22 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 821030 821030 G T 0 0 2 0 2 2 1.0000 36 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + chr1 821143 821143 G T 0 0 0 2 2 2 1.0000 8 1.67e-01 3.33e-01 1.67e-01 NONE PASS SKIP 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 na na na + +---- + +.. class:: infomark + +**Feedback**: romain.daveau@curie.fr + </help> +</tool> \ No newline at end of file
#!/usr/bin/perl

use strict;
use lib 'inc/perlmod';
use ngsutil qw[ :DEFAULT &explode_varcall ];
use warnings FATAL => qw[ numeric uninitialized ];
use List::Util qw[ sum min max ];
use File::Basename;
use Getopt::Long;

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# PATH TO YOUR R BINARY (keep this assignment on line 13, the README refers to it)
my $rbin = '/usr/bin/R';
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# gfapts/gfap_r1.0_samvcf_data_parser.pl
#
# Turn a samtools/bcftools VCF file into a GFAP .var file:
#   1. convert the VCF with ANNOVAR's convert2annovar.pl (-includeinfo),
#   2. split the calls into one temporary file per chromosome,
#   3. sort each chromosome and merge runs of adjacent non-indel calls,
#   4. concatenate the chromosomes in karyotype order and let the R script
#      add the statistics columns,
#   5. remove the temporary files and symlink --outfile to <outdir>/<name>.var.

my $annovar_dir = 'inc/annovar';
my $rdep = 'inc/R';

my($varfile, $outdir, $outfile, %opts);

GetOptions(\%opts, "varfile=s", "outdir=s", "outfile=s");
$varfile = $opts{varfile};
$outdir = $opts{outdir};
$outfile = $opts{outfile};

# Temporary and output files are named after the real file when --varfile is a symlink.
my $fname = readlink($varfile) || $varfile;
$fname = basename($fname);
my $prefix = "${outdir}/${fname}";

# Chromosomes that are kept, in the order they appear in the final output.
my @chromosomes = map { "chr$_" } (1..22, 'X', 'Y', 'M');
my(%fh, %chr, @tempfiles);

# Parse one line of a sorted per-chromosome file into the 12 fields used below:
# the first seven columns, then FILTER (index 7), then the four DP4 counts
# (indices 8..11).  Dies on a line that does not have the expected layout
# rather than silently reusing the captures of the previous line.
sub parse_call{
	my($line) = @_;
	my($head, $tail) = $line =~ /^((?:\S+\s+){7})(?:\S+\s+){8}(\S+\s+\S+)/
		or die "Unexpected line layout in converted call: $line";
	my @call = split /\s+/, $head.$tail;
	my $info = pop @call;
	# Anchored on the tag boundary so DP4 is also found as first or last INFO tag.
	my($dp4) = $info =~ /(?:^|;)DP4=([^;]+)/
		or die "No DP4 tag in INFO field (samtools/bcftools VCF required): $line";
	my @dp4 = split /,/, $dp4;
	@dp4 == 4 or die "DP4 tag does not hold four values: $line";
	return (@call, @dp4);
}

# Render a parsed (possibly merged) call as one output line: '.' in FILTER
# becomes NONE, quality and read counts are rounded, FILTER is moved last.
sub format_call{
	my @call = @_;
	$call[7] = 'NONE' if $call[7] eq '.';
	$call[$_] = sprintf("%.0f", $call[$_]) for (6,8..11);
	return join("\t", @call[0..6,8..11,7])."\n";
}

# Step 1: VCF -> ANNOVAR input.  system() is used instead of backticks: with
# stdout redirected the backticks always returned an empty string, so a failing
# command was never detected.
my $converted = "${prefix}_Temp-00";
push @tempfiles, $converted;
system("${annovar_dir}/convert2annovar.pl -format vcf4 $varfile -includeinfo > $converted 2> /dev/null") == 0
	or die "convert2annovar.pl failed on $varfile (wait status $?)";

# Step 2: distribute the calls over one file per chromosome.
foreach my $chrom (@chromosomes){
	my $file = "${prefix}_${chrom}.Temp-00";
	open my $handle, '>', $file or die "Cannot write $file: $!";
	$fh{$chrom} = $handle;
	push @tempfiles, $file;
}
open my $in, '<', $converted or die "Cannot read $converted: $!";
while(my $line = <$in>){
	my($chrom, $ref, $alt) = $line =~ /^(\S+)\s+(?:\S+\s+){2}(\S+)\s+(\S+)/ or next;
	next if !exists $fh{$chrom};
	if(min(length($ref), length($alt)) != 1){
		# Neither allele is a single character: hand start/ref/alt to
		# ngsutil::explode_varcall() and print one line per pair it returns
		# (element 0 is read as positions, element 1 as alleles).
		# NOTE(review): as before, these lines do not mark the chromosome in
		# %chr and omit column 6 -- confirm against explode_varcall().
		chomp $line;
		my @fields = split /\s+/, $line;
		my @exploded = explode_varcall(@fields[1,3..4]);
		for(my $i=0; $i<$#{$exploded[0]}; $i+=2){
			print { $fh{$chrom} } join("\t", $fields[0], @{$exploded[0]}[$i..$i+1], @{$exploded[1]}[$i..$i+1], @fields[6..$#fields]), "\n";
		}
		next;
	}
	print { $fh{$chrom} } $line;
	$chr{$chrom}++;
}
close $in;

# Step 3: per chromosome, sort by start then end and merge adjacent calls.
foreach my $chrom (@chromosomes){
	close $fh{$chrom} or die "Cannot finish writing ${prefix}_${chrom}.Temp-00: $!";
	next if !exists $chr{$chrom};
	my $unsorted = "${prefix}_${chrom}.Temp-00";
	my $sorted = "${prefix}_${chrom}.Temp-01";
	my $merged = "${prefix}_${chrom}.Temp-02";
	push @tempfiles, $sorted, $merged;
	system('sort', '-k2,2n', '-k3,3n', '-o', $sorted, $unsorted) == 0
		or die "sort failed on $unsorted (wait status $?)";
	open my $calls, '<', $sorted or die "Cannot read $sorted: $!";
	open my $out, '>', $merged or die "Cannot write $merged: $!";
	my @previous;	# the call that is still open for merging
	while(my $line = <$calls>){
		my @current = parse_call($line);
		if(!@previous){
			@previous = @current;
			next;
		}
		# A call is merged into the pending one when it is on the same
		# chromosome, ends exactly one base after it, and neither call has
		# '-' in ref/alt (i.e. neither is an insertion or deletion).
		if(($previous[0] eq $current[0]) && ($current[2]==$previous[2]+1) && (join('', @previous[3..4]) !~ /-/) && (join('', @current[3..4]) !~ /-/)){
			$previous[2] = $current[2];
			$previous[$_] .= $current[$_] for 3..4;
			$previous[5] = 'unk' if $previous[5] ne $current[5];
			$previous[7] = 'SKIP' if $previous[7] ne $current[7];
			# Quality and read counts become the running mean of both calls.
			for (6,8..11){
				$previous[$_] += $current[$_];
				$previous[$_] /= 2;
			}
			next;
		}
		print {$out} format_call(@previous);
		@previous = @current;
	}
	# Always flush the pending call.  The old comparison against a stale
	# buffer could drop the last record or die on undefined values.
	print {$out} format_call(@previous) if @previous;
	close $calls;
	close $out or die "Cannot finish writing $merged: $!";
}

# Step 4: concatenate in karyotype order (done in Perl; "cat" without any file
# argument would wait on stdin) and run the R post-processing.
my $r_input = "${prefix}.Temp.2R";
push @tempfiles, $r_input;
open my $all, '>', $r_input or die "Cannot write $r_input: $!";
foreach my $chrom (@chromosomes){
	next if !exists $chr{$chrom};
	my $merged = "${prefix}_${chrom}.Temp-02";
	open my $part, '<', $merged or die "Cannot read $merged: $!";
	while(my $line = <$part>){
		print {$all} $line;
	}
	close $part;
}
close $all or die "Cannot finish writing $r_input: $!";
# R's stdout was swallowed by the backticks before; keep it out of our stdout.
system("${rbin} --vanilla --slave --args $r_input < ${rdep}/samvcf_data_parser.R > /dev/null") == 0
	or die "R post-processing failed on $r_input (wait status $?)";

# Step 5: clean up and publish the result as --outfile.
unlink(grep { -e } @tempfiles);
if((-e $outfile) || (-l $outfile)){
	unlink $outfile or die "Cannot remove $outfile: $!";
}
symlink "${prefix}.var", $outfile or die "Cannot link $outfile to ${prefix}.var: $!";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gfapts/gfap_r1.0_samvcf_data_parser.xml Fri Jun 29 10:20:55 2012 -0400 @@ -0,0 +1,91 @@ +<tool id="gfap_r1.0_samvcf_data_parser" name="SAMVCF data parser"> + <description>Convert VCF-formatted variant calls as ANNOVAR input</description> + <command interpreter="perl">gfap_r1.0_samvcf_data_parser.pl -varfile=$varfile -outdir=$__new_file_path__/gfap -outfile=$outfile</command> + <inputs> + <param name="varfile" format="vcf" type="data" label="Input VCF file" /> + </inputs> + <outputs> + <data format="txt" name="outfile" label="${varfile.name}.var" /> + </outputs> + <help> +.. class:: infomark + +**What it does** + +- Convert a samtools-formatted VCF-file as **ANNOVAR** input. +- Merge calls whenever possible and compute statistics on calls. + +.. class:: warningmark + +**As the DP4-tag in the INFO field is required, only samtools/bcftools VCF-files are currently supported**. + +.. class:: infomark + +**Third-party resources** + +- ANNOVAR: http://www.openbioinformatics.org/annovar +- samtools/bcftools: http://samtools.sourceforge.net + +---- + +**Input .vcf file**:: + + #CHROM POS ID REF ALT QUAL FILTER INFO + chr1 14522 . G A,C 7.8 . DP=2;AF1=0.9999;CI95=0.5,1;DP4=0,0,2,0;MQ=30;FQ=-30 GT:PL:GQ 1/1:37,3,0,38,1,35:41 + chr1 14653 . C T 12.3 . DP=4;AF1=0.5001;CI95=0.5,0.5;DP4=2,0,1,1;MQ=38;FQ=6.58;PV4=1,0.45,1,0.43 GT:PL:GQ 0/1:42,0,33:35 + chr1 69968 . A G 13 . DP=2;AF1=1;CI95=0.5,1;DP4=0,0,2,0;MQ=41;FQ=-33 GT:PL:GQ 1/1:44,6,0:49 + chr1 129285 . G A 37.8 . DP=2;AF1=1;CI95=0.5,1;DP4=0,0,0,2;MQ=60;FQ=-33 GT:PL:GQ 1/1:69,6,0:49 + chr1 808631 . G A 125 . DP=7;AF1=1;CI95=1,1;DP4=0,0,7,0;MQ=60;FQ=-48 GT:PL:GQ 1/1:158,21,0:84 + chr1 808922 . G A 222 . DP=47;AF1=1;CI95=1,1;DP4=1,0,23,16;MQ=51;FQ=-131;PV4=1,0.0026,1,0.47 GT:PL:GQ 1/1:255,104,0:99 + chr1 808928 . C T 219 . DP=47;AF1=1;CI95=1,1;DP4=1,0,23,16;MQ=52;FQ=-131;PV4=1,2.4e-05,1,0.22 GT:PL:GQ 1/1:252,104,0:99 + chr1 824115 . A C 7.8 . 
DP=3;AF1=0.5001;CI95=0.5,0.5;DP4=0,1,0,2;MQ=60;FQ=4.79;PV4=1,0.037,1,0.33 GT:PL:GQ 0/1:37,0,31:33 + chr1 824161 . C T 4.77 . DP=3;AF1=0.5001;CI95=0.5,0.5;DP4=0,1,1,1;MQ=53;FQ=4.06;PV4=1,0.11,0.33,1 GT:PL:GQ 0/1:33,0,31:32 + chr1 824215 . T C 8.44 . DP=2;AF1=1;CI95=0.5,1;DP4=0,0,1,1;MQ=49;FQ=-33 GT:PL:GQ 1/1:39,6,0:49 + chr1 852063 . G A 30.8 . DP=2;AF1=1;CI95=0.5,1;DP4=0,0,1,1;MQ=60;FQ=-33 GT:PL:GQ 1/1:62,6,0:49 + chr1 861630 . G A 58 . DP=4;AF1=1;CI95=0.5,1;DP4=0,0,0,3;MQ=60;FQ=-36 GT:PL:GQ 1/1:90,9,0:63 + chr1 861808 . A G 48.8 . DP=2;AF1=1;CI95=0.5,1;DP4=0,0,1,1;MQ=60;FQ=-33 GT:PL:GQ 1/1:80,6,0:49 + chr1 866319 . G A 33.5 . DP=6;AF1=1;CI95=0.5,1;DP4=0,0,4,0;MQ=60;FQ=-39 GT:PL:GQ 1/1:66,12,0:72 + chr1 870903 . T C 78.5 . DP=5;AF1=1;CI95=0.5,1;DP4=0,0,4,0;MQ=60;FQ=-39 GT:PL:GQ 1/1:111,12,0:72 + +---- + +**Output .var file**:: + + #From [chr] to [ann] = ANNOVAR required fields + #NRF = #reads consistent w/ the reference allele on the F-strand + #NRR = #reads consistent w/ the reference allele on the R-strand + #NAF = #reads consistent w/ the alternate allele on the F-strand + #NAR = #reads consistent w/ the alternate allele on the R-strand + #DP = total #reads in call ie. NRF+NRR+NAF+NAR + #AD = total #reads consistent w/ the alternate allele ie. NAF+NAR + #AF = alternate allele ratio ie. AD/DP + #QC = Phred-scaled call quality + #P.str = NRF+NAF vs. NRR+NAR binomial test P-value ie. total strand bias + #P.ref = NRF vs. NRR binomial test P-value ie. reference allele strand bias + #P.alt = NAF vs. NAR binomial test P-value ie. 
alternate allele strand bias + #VCF.FILTER = FILTER field from the input vcf file + #VAR.FILTER = GFAP default FILTER to discriminate between TP and FP variants + #chr start end ref alt ann QC NRF NRR NAF NAR VCF.FILTER P.str P.ref P.alt DP AD AF VAR.FILTER + chr1 14907 14907 A G het 9 4 0 0 3 NONE 3.33e-01 4.17e-02 8.33e-02 7 3 0.4290 SKIP + chr1 14930 14930 A G het 37 4 2 0 5 NONE 1.83e-01 2.29e-01 2.08e-02 11 5 0.4550 SKIP + chr1 68896 68896 G A hom 18 0 0 3 0 NONE 8.33e-02 3.33e-01 8.33e-02 3 3 1.0000 SKIP + chr1 69270 69270 A G hom 179 0 0 31 0 NONE 3.10e-10 3.33e-01 3.10e-10 31 31 1.0000 SKIP + chr1 69511 69511 A G hom 222 0 0 13 12 NONE 3.33e-01 3.33e-01 3.33e-01 25 25 1.0000 PASS + chr1 69897 69897 T C het 14 1 0 0 3 NONE 2.08e-01 3.33e-01 8.33e-02 4 3 0.7500 SKIP + chr1 129285 129285 G A het 56 0 4 0 4 NONE 2.60e-03 4.17e-02 4.17e-02 8 4 0.5000 SKIP + chr1 567697 567697 G A hom 30 0 0 0 2 NONE 1.67e-01 3.33e-01 1.67e-01 2 2 1.0000 SKIP + chr1 569803 569803 G A hom 50 0 0 4 0 NONE 4.17e-02 3.33e-01 4.17e-02 4 4 1.0000 SKIP + chr1 808631 808631 G A hom 142 0 0 7 1 NONE 2.34e-02 3.33e-01 2.34e-02 8 8 1.0000 SKIP + chr1 808922 808922 G A hom 222 0 0 15 26 NONE 3.91e-02 3.33e-01 3.91e-02 41 41 1.0000 PASS + chr1 808928 808928 C T hom 222 0 0 14 31 NONE 5.36e-03 3.33e-01 5.36e-03 45 45 1.0000 PASS + chr1 816725 816725 A G hom 22 0 0 2 0 NONE 1.67e-01 3.33e-01 1.67e-01 2 2 1.0000 SKIP + chr1 821030 821030 G T hom 36 0 0 2 0 NONE 1.67e-01 3.33e-01 1.67e-01 2 2 1.0000 SKIP + chr1 821143 821143 G T hom 8 0 0 0 2 NONE 1.67e-01 3.33e-01 1.67e-01 2 2 1.0000 SKIP + +---- + +.. class:: infomark + +**Feedback**: romain.daveau@curie.fr + </help> +</tool> \ No newline at end of file
# gfapts/inc/R/samvcf_data_parser.R
#
# Reads the tab-separated call table given as first argument after --args
# (12 columns, no header), appends three strand-bias P-values, the depth
# columns DP/AD/AF and the GFAP PASS/SKIP flag, and writes the result next to
# the input with the 'Temp.2R' suffix replaced by 'var'.

rm(list=ls())
options(warn=-1)

# Only the arguments after --args; this does not depend on how many options
# precede it on the R command line.
args=commandArgs(trailingOnly=TRUE)
infile=args[1]
outfile=sub("Temp\\.2R$", "var", infile)

x=read.table(infile, sep="\t", header=FALSE, row.names=NULL, colClasses=c('factor', rep('integer', 2), rep('factor', 3), rep('integer', 5), 'factor'),
	col.names=c('chr', 'start', 'end', 'ref', 'alt', 'zyg', 'QC', 'NRF', 'NRR', 'NAF', 'NAR', 'VCF.FILTER'))

# binom.test P-values for row i: forward vs. reverse reads overall, for the
# reference allele only, and for the alternate allele only.
strand.pvalues=function(i) with(x[i, ], c(
	binom.test(c(sum(NRF, NAF), sum(NRR, NAR)))$p.value,
	binom.test(c(NRF, NRR))$p.value,
	binom.test(c(NAF, NAR))$p.value))

# sapply yields one column per call; the values are rounded through a common
# 3-digit scientific format and reshaped to one row per call.
pvals=sapply(1:nrow(x), strand.pvalues)
pvals=matrix(as.numeric(format(pvals, digits=3, scientific=TRUE)),
	ncol=3, byrow=TRUE, dimnames=list(NULL, c('p.strand', 'p.ref', 'p.alt')))
x=cbind(x, pvals)

# Depth columns and the numeric 0/1 filter flag for row i.  A call passes when
# its zygosity is known, QC>=20, DP>=10, AD>=5 and each strand test is either
# non-significant (P>.05) or backed by at least 10 reads on the weaker strand.
depth.and.filter=function(i) with(x[i, ], {
	AD=sum(NAF, NAR)
	DP=sum(NRF, NRR, AD)
	AF=signif(AD/DP, digits=3)
	VAR.FILTER=c(zyg!='unk' & QC>=20 & DP>=10 & AD>=5 & (p.strand>.05 | min(sum(NRF, NAF), sum(NRR, NAR))>=10) & (p.ref>.05 | min(NRF, NRR)>=10) & (p.alt>.05 | min(NAF, NAR)>=10))
	cbind(DP,AD,AF, VAR.FILTER)
})
x=cbind(x, do.call('rbind', lapply(1:nrow(x), depth.and.filter)))

# Swap the numeric flag for its SKIP/PASS label, keeping it as last column.
var.filter=factor(x$VAR.FILTER, levels=c(0, 1), labels=c('SKIP', 'PASS'))
x=cbind(subset(x, select=-VAR.FILTER), VAR.FILTER=var.filter)

write.table(as.matrix(x), outfile, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
q(runLast=FALSE)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gfapts/inc/annovar/annotate_variation.pl Fri Jun 29 10:20:55 2012 -0400 @@ -0,0 +1,2827 @@ +#!/usr/bin/perl +use warnings; +use strict; +use Pod::Usage; +use Getopt::Long; +use File::Spec; +use Cwd; + +our $VERSION = '$Revision: 466 $'; +our $LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $'; + +our ($verbose, $help, $man); +our ($queryfile, $dbloc); +our ($outfile, $separate, $batchsize, $dbtype, $neargene, $genomebinsize, $geneanno, $regionanno, $filter, $downdb, $buildver, $score_threshold, $normscore_threshold, $minqueryfrac, $expandbin, $splicing_threshold, + $maf_threshold, $chromosome, $zerostart, $rawscore, $memfree, $memtotal, $sift_threshold, $gff3dbfile, $genericdbfile, $vcfdbfile, $time, $wget, $precedence, + $webfrom, $colsWanted, $comment, $scorecolumn, $transfun, $exonsort, $avcolumn, $bedfile); +our (%valichr, $dbtype1); +our (@precedence, @colsWanted, @avcolumn); +sub printerr; #declare a subroutine + +our %codon1 = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G"); +our %codon3 = (TTT=>"Phe", TTC=>"Phe", TCT=>"Ser", TCC=>"Ser", TAT=>"Tyr", TAC=>"Tyr", TGT=>"Cys", TGC=>"Cys", TTA=>"Leu", TCA=>"Ser", TAA=>"*", TGA=>"*", TTG=>"Leu", TCG=>"Ser", TAG=>"*", TGG=>"Trp", CTT=>"Leu", CTC=>"Leu", CCT=>"Pro", CCC=>"Pro", CAT=>"His", CAC=>"His", CGT=>"Arg", CGC=>"Arg", CTA=>"Leu", CTG=>"Leu", 
CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", ATT=>"Ile", ATC=>"Ile", ACT=>"Thr", ACC=>"Thr", AAT=>"Asn", AAC=>"Asn", AGT=>"Ser", AGC=>"Ser", ATA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", ATG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GTT=>"Val", GTC=>"Val", GCT=>"Ala", GCC=>"Ala", GAT=>"Asp", GAC=>"Asp", GGT=>"Gly", GGC=>"Gly", GTA=>"Val", GTG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly"); +our %codonfull = (TTT=>"Phenylalanine", TTC=>"Phenylalanine", TCT=>"Serine", TCC=>"Serine", TAT=>"Tyrosine", TAC=>"Tyrosine", TGT=>"Cysteine", TGC=>"Cysteine", TTA=>"Leucine", TCA=>"Serine", TAA=>"Stop", TGA=>"Stop", TTG=>"Leucine", TCG=>"Serine", TAG=>"Stop", TGG=>"Tryptophan", CTT=>"Leucine", CTC=>"Leucine", CCT=>"Proline", CCC=>"Proline", CAT=>"Histidine", CAC=>"Histidine", CGT=>"Arginine", CGC=>"Arginine", CTA=>"Leucine", CTG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", ATT=>"Isoleucine", ATC=>"Isoleucine", ACT=>"Threonine", ACC=>"Threonine", AAT=>"Asparagine", AAC=>"Asparagine", AGT=>"Serine", AGC=>"Serine", ATA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", ATG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GTT=>"Valine", GTC=>"Valine", GCT=>"Alanine", GCC=>"Alanine", GAT=>"Aspartic acid", GAC=>"Aspartic acid", GGT=>"Glycine", GGC=>"Glycine", GTA=>"Valine", GTG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine"); +our %codonr1 = (UUU=>"F", UUC=>"F", UCU=>"S", UCC=>"S", UAU=>"Y", UAC=>"Y", UGU=>"C", UGC=>"C", UUA=>"L", UCA=>"S", UAA=>"*", UGA=>"*", UUG=>"L", UCG=>"S", UAG=>"*", UGG=>"W", CUU=>"L", CUC=>"L", CCU=>"P", CCC=>"P", CAU=>"H", CAC=>"H", CGU=>"R", CGC=>"R", CUA=>"L", CUG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", AUU=>"I", AUC=>"I", ACU=>"T", ACC=>"T", AAU=>"N", AAC=>"N", AGU=>"S", AGC=>"S", AUA=>"I", ACA=>"T", 
AAA=>"K", AGA=>"R", AUG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GUU=>"V", GUC=>"V", GCU=>"A", GCC=>"A", GAU=>"D", GAC=>"D", GGU=>"G", GGC=>"G", GUA=>"V", GUG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G"); +our %codonr3 = (UUU=>"Phe", UUC=>"Phe", UCU=>"Ser", UCC=>"Ser", UAU=>"Tyr", UAC=>"Tyr", UGU=>"Cys", UGC=>"Cys", UUA=>"Leu", UCA=>"Ser", UAA=>"*", UGA=>"*", UUG=>"Leu", UCG=>"Ser", UAG=>"*", UGG=>"Trp", CUU=>"Leu", CUC=>"Leu", CCU=>"Pro", CCC=>"Pro", CAU=>"His", CAC=>"His", CGU=>"Arg", CGC=>"Arg", CUA=>"Leu", CUG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", AUU=>"Ile", AUC=>"Ile", ACU=>"Thr", ACC=>"Thr", AAU=>"Asn", AAC=>"Asn", AGU=>"Ser", AGC=>"Ser", AUA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", AUG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GUU=>"Val", GUC=>"Val", GCU=>"Ala", GCC=>"Ala", GAU=>"Asp", GAC=>"Asp", GGU=>"Gly", GGC=>"Gly", GUA=>"Val", GUG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly"); +our %codonrfull = (UUU=>"Phenylalanine", UUC=>"Phenylalanine", UCU=>"Serine", UCC=>"Serine", UAU=>"Tyrosine", UAC=>"Tyrosine", UGU=>"Cysteine", UGC=>"Cysteine", UUA=>"Leucine", UCA=>"Serine", UAA=>"Stop", UGA=>"Stop", UUG=>"Leucine", UCG=>"Serine", UAG=>"Stop", UGG=>"Tryptophan", CUU=>"Leucine", CUC=>"Leucine", CCU=>"Proline", CCC=>"Proline", CAU=>"Histidine", CAC=>"Histidine", CGU=>"Arginine", CGC=>"Arginine", CUA=>"Leucine", CUG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", AUU=>"Isoleucine", AUC=>"Isoleucine", ACU=>"Threonine", ACC=>"Threonine", AAU=>"Asparagine", AAC=>"Asparagine", AGU=>"Serine", AGC=>"Serine", AUA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", AUG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GUU=>"Valine", GUC=>"Valine", GCU=>"Alanine", GCC=>"Alanine", GAU=>"Aspartic acid", GAC=>"Aspartic acid", GGU=>"Glycine", GGC=>"Glycine", GUA=>"Valine", GUG=>"Valine", 
GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine"); +our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-'); + +processArguments (); #process program arguments, set up default values, check for errors, check for existence of db files +if ($geneanno) { + annotateQueryByGene (); #generate gene-based annoations (classify variants into intergenic, introgenic, non-synonymous, synonymous, UTR, frameshift, etc) +} elsif ($regionanno) { + annotateQueryByRegion (); #generate region-based annotations (most conserved elements, transcription factor binding sites, etc) +} elsif ($filter) { + filterQuery (); #generate filter-based annotations (identify variants not reported in variation databases) +} elsif ($downdb) { + downloadDB (); #download annotation databases from Internet +} + +sub processArguments { + my @command_line = @ARGV; #command line argument + GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'outfile=s'=>\$outfile, 'separate'=>\$separate, + 'batchsize=s'=>\$batchsize, 'dbtype=s'=>\$dbtype, 'neargene=i'=>\$neargene, 'genomebinsize=s'=>\$genomebinsize, + 'geneanno'=>\$geneanno, 'regionanno'=>\$regionanno, , 'filter'=>\$filter, 'downdb'=>\$downdb, 'buildver=s'=>\$buildver, 'score_threshold=f'=>\$score_threshold, + 'normscore_threshold=i'=>\$normscore_threshold, 'minqueryfrac=f'=>\$minqueryfrac, 'expandbin=i'=>\$expandbin, 'splicing_threshold=i'=>\$splicing_threshold, + 'maf_threshold=f'=>\$maf_threshold, 'chromosome=s'=>\$chromosome, 'zerostart'=>\$zerostart, 'rawscore'=>\$rawscore, 'memfree=i'=>\$memfree, + 'memtotal=i'=>\$memtotal, 'sift_threshold=f'=>\$sift_threshold, 'gff3dbfile=s'=>\$gff3dbfile, 'genericdbfile=s'=>\$genericdbfile, 'vcfdbfile=s'=>\$vcfdbfile, + 'time'=>\$time, 'wget!'=>\$wget, 'precedence=s'=>\$precedence, 'webfrom=s'=>\$webfrom, 
'colsWanted=s'=>\$colsWanted, 'comment'=>\$comment, + 'scorecolumn=i'=>\$scorecolumn, 'transcript_function'=>\$transfun, 'exonsort'=>\$exonsort, 'avcolumn=s'=>\$avcolumn, 'bedfile=s'=>\$bedfile) or pod2usage (); + + $help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT); + $man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT); + @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT); + @ARGV == 2 or pod2usage ("Syntax error"); + + ($queryfile, $dbloc) = @ARGV; + + $dbloc =~ s/[\\\/]$//; #delete the trailing / or \ sign as part of the directory name + if (defined $batchsize) { + $batchsize =~ s/k$/000/; + $batchsize =~ s/m$/000000/; + $batchsize =~ m/^\d+$/ or pod2usage ("Error: the --batchsize argument must be a positive integer (suffix of k or m is okay)"); + } else { + $batchsize = 5_000_000; + } + if (defined $genomebinsize) { + $genomebinsize =~ s/k$/000/; + $genomebinsize =~ s/m$/000000/; + $genomebinsize =~ m/^\d+$/ or pod2usage ("Error: the --genomebinsize argument must be a positive integer (suffix of k or m is okay)"); + $genomebinsize > 1000 or pod2suage ("Error: the --genomebinsize argument must be larger than 1000"); + } else { + if ($geneanno) { + $genomebinsize = 100_000; #gene usually span large genomic regions + } else { + $genomebinsize = 10_000; #MCE, TFBS, miRNA, etc are small genomic regions + } + } + + $verbose ||= 0; #when it is not specified, it is zero + $neargene ||= 1_000; #for upstream/downstream annotation of variants, specify the distance threshold between variants and genes + $expandbin ||= int(2_000_000/$genomebinsize); #for gene-based annotations, when intergenic variants are found, expand to specified number of nearby bins to find closest genes + $outfile ||= $queryfile; #specify the prefix of output file names + + #set up log file + if ($downdb) { + if (not -d $dbloc) { + mkdir ($dbloc) or die "Error: the directory $dbloc does not exist and cannot be created\n"; + } + my $errfile = 
File::Spec->catfile ($dbloc, "annovar_downdb.log"); + open (LOG, ">$errfile") or die "Error: cannot write LOG information to log file $errfile: $!\n"; + } else { + open (LOG, ">$outfile.log") or die "Error: cannot write LOG information to log file $outfile.log: $!\n"; + } + print LOG "ANNOVAR Version:\n\t", q/$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $/, "\n"; + print LOG "ANNOVAR Information:\n\tFor questions, comments, documentation, bug reports and program update, please visit http://www.openbioinformatics.org/annovar/\n"; + print LOG "ANNOVAR Command:\n\t$0 @command_line\n"; + print LOG "ANNOVAR Started:\n\t", scalar (localtime), "\n"; + + my $num = 0; + $geneanno and $num++; + $downdb and $num++; + $filter and $num++; + $regionanno and $num++; + $num <= 1 or pod2usage ("Error in argument: please specify only one of --geneanno, -regionanno, --downdb, --filter"); + if (not $num) { + $geneanno++; + printerr "NOTICE: The --geneanno operation is set to ON by default\n"; + } + + my %dbtype1 = ('gene'=>'refGene', 'refgene'=>'refGene', 'knowngene'=>'knownGene', 'ensgene'=>'ensGene', 'band'=>'cytoBand', 'cytoband'=>'cytoBand', 'tfbs'=>'tfbsConsSites', 'mirna'=>'wgRna', + 'mirnatarget'=>'targetScanS', 'segdup'=>'genomicSuperDups', 'omimgene'=>'omimGene', 'gwascatalog'=>'gwasCatalog', + '1000g_ceu'=>'CEU.sites.2009_04', '1000g_yri'=>'YRI.sites.2009_04', '1000g_jptchb'=>'JPTCHB.sites.2009_04', + '1000g2010_ceu'=>'CEU.sites.2010_03', '1000g2010_yri'=>'YRI.sites.2010_03', '1000g2010_jptchb'=>'JPTCHB.sites.2010_03', + '1000g2010jul_ceu'=>'CEU.sites.2010_07', '1000g2010jul_yri'=>'YRI.sites.2010_07', '1000g2010jul_jptchb'=>'JPTCHB.sites.2010_07', + '1000g2010nov_all'=>'ALL.sites.2010_11', + ); + + if ($geneanno) { + $dbtype ||= 'refGene'; + $dbtype1 = $dbtype1{$dbtype} || $dbtype; + #$dbtype1 =~ m/^(refGene|knownGene|ensGene)$/ or pod2usage ("Error: the gene-based annotation procedure currently only support -dbtype of refGene, knownGene and ensGene"); 
#commented 2011feb18 + } elsif ($regionanno) { + defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --regionanno operation)"); + $dbtype1 = $dbtype1{$dbtype} || $dbtype; + if ($dbtype =~ m/^mce(\d+)way/) { #added 2010Feb16 + $dbtype1 = "phastConsElements$1way"; + } + if ($dbtype1 eq 'gff3') { + defined $gff3dbfile or pod2usage ("Error in argument: please specify --gff3dbfile for the --dbtype of 'gff3'"); + } + } elsif ($filter) { + defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --filter operation)"); + $dbtype =~ m/^avsift|generic|1000g_(ceu|yri|jptchb)|1000g2010_(ceu|yri|jptchb)|1000g20\d\d[a-z]{3}_[a-z]+|snp\d+|vcf|(ljb_\w+)$/ or pod2usage ("Error in argument: the specified --dbtype $dbtype is not valid for --filter operation (valid ones are '1000g_ceu', '1000g2010_yri', 'snp129', 'avsift', 'vcf', 'generic', etc)"); + $dbtype1 = $dbtype1{$dbtype} || $dbtype; + if ($dbtype1 eq 'generic') { + defined $genericdbfile or pod2usage ("Error in argument: please specify --genericdbfile for the --dbtype of 'generic'"); + } + if ($dbtype eq 'vcf') { + defined $vcfdbfile or pod2usage ("Error in argument: please specify --vcfdbfile for the --dbtype of 'vcf'"); + } + } elsif ($downdb) { + defined $dbtype and pod2usage ("Error in argument: please do not specify --dbtype for the --downdb operation"); + $dbtype1 = $dbtype1{$queryfile} || $queryfile; + } + + if (not $buildver) { + $buildver = 'hg18'; + printerr "NOTICE: The --buildver is set as 'hg18' by default\n"; + } + + if ($score_threshold) { + $score_threshold > 0 or pod2usage ("Error in argument: the --score_threshold must be a positive number (you specified $score_threshold)"); + $geneanno || $downdb and pod2usage ("Error in argument: the --score_threshold is not useful for --geneanno or --downdb operations"); + } + if ($normscore_threshold) { + $normscore_threshold <= 1000 or pod2usage ("Error in argument: the --normscore_threshold must 
be between 0 and 1000 (you specified $normscore_threshold)"); + $regionanno or pod2usage ("Error in argument: the --score_threshold is supported only for the --regionanno operation"); + } + + + if (defined $sift_threshold) { + $filter or pod2usage ("Error in argument: the --sift_threshold is supported only for the --filter operation"); + $dbtype1 eq 'avsift' or pod2usage ("Error in argument: the --sift_threshold argument can be used only if '--dbtype avsift' is used"); + $sift_threshold >= 0 and $sift_threshold <= 1 or pod2usage ("Error in argument: the --sift_threshold must be between 0 and 1 inclusive"); + } else { + $sift_threshold = 0.05; + } + + #operation-specific argument + if (defined $splicing_threshold) { + $geneanno or pod2usage ("Error in argument: the --splicing_threshold is supported only for the --geneanno operation"); + } else { + $splicing_threshold = 2; #for splicing annotation, specify the distance threshold between variants and exon/intron boundaries + } + if (defined $maf_threshold) { + $filter or pod2usage ("Error in argument: the --maf_threshold is supported only for the --filter operation"); + } else { + $maf_threshold = 0; #for filter-based annotations on 1000 Genomes Project data, specify the MAF threshold to be used in filtering + } + if (defined $minqueryfrac) { + $regionanno or pod2usage ("Error in argument: the --minqueryfrac is supported only for the --regionanno operation"); + } else { + $minqueryfrac = 0; #minimum query overlap to declare a "match" with database records + } + if (defined $gff3dbfile) { + $dbtype eq 'gff3' or pod2usage ("Error in argument: the --gff3dbfile argument can be used only if '--dbtype gff3' is used"); + $geneanno or $regionanno or pod2usage ("Error in argument: the --gff3dbfile argument is supported only for the --geneanno or --regionanno operation"); + } + if (defined $bedfile) { + $dbtype eq 'bed' or pod2usage ("Error in argument: the --bedfile argument can be used only if '--dbtype bed' is used"); + 
$regionanno or pod2usage ("Error in argument: the --bedfile argument is supported only for the --regionanno operation"); + } + if (defined $genericdbfile) { + $filter or pod2usage ("Error in argument: the --genericdbfile argument is supported only for the --filter operation"); + } + if (defined $wget) { + $downdb or pod2usage ("Error in argument: the --wget argument is supported only for the --downdb operation"); + } else { + $wget = 1; #by default, use wget for downloading files from Internet + } + if (defined $precedence) { + $geneanno or pod2usage ("Error in argument: the --precedence argument is supported only for the --geneanno operation"); + @precedence = split (/,/, $precedence); + @precedence >= 2 or pod2usage ("Error in argument: the --precedence argument should be comma delimited"); + for my $i (0 .. @precedence-1) { + $precedence[$i] =~ m/^(exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing|ncrna)$/ or pod2usage ("Error in argument: the --precedence argument contains invalid keywords (valid ones are exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing)"); + } + } + + if (defined $colsWanted) { + $regionanno or pod2usage ("Error in argument: the --colWanted argument is supported only for the --geneanno operation"); + if (lc $colsWanted eq 'all') { + @colsWanted = ('all'); + } elsif (lc $colsWanted eq 'none') { + @colsWanted = ('none'); + } else { + @colsWanted = split (/,/, $colsWanted); + for my $i (0 .. 
@colsWanted-1) { + $colsWanted[$i]=~m/^\d+$/ or pod2usage ("Error in argument: the --colsWanted argument ($colsWanted) must be a list of comma delimited numbers or be 'all' or be 'none'"); + } + } + } + + if (defined $scorecolumn) { + $regionanno or pod2usage ("Error in argument: the --scorecolumn argument is supported only for the --regionanno operation"); + } + + if ($exonsort) { + $geneanno or pod2usage ("Error in argument: the --exonsort argument is supported only for the --geneanno operation"); + } + + if (defined $avcolumn) { + $avcolumn =~ m/^\d+,\d+,\d+,\d+,\d+$/ or pod2usage ("Error in argument: the --avcolumn argument must be five integer numbers separated by comma"); + @avcolumn = split (/,/, $avcolumn); + @avcolumn = map {$_-1} @avcolumn; + } else { + @avcolumn = (0..4); #by default, the first five columns are the required AVINPUT information + } + + if (defined $webfrom) { + if ($webfrom ne 'ucsc' and $webfrom ne 'annovar') { + $webfrom =~ m#^(http://|ftp://)# or pod2usage ("Error: the --webfrom argument needs to be 'ucsc', 'annovar', or a URL"); + } + } + + $maf_threshold >= 0 and $maf_threshold <= 0.5 or pod2usage ("Error in argument: the --maf_threshold must be between 0 and 0.5 (you specified $maf_threshold)"); + $minqueryfrac >= 0 and $minqueryfrac <= 1 or pod2usage ("Error in argument: the --minqueryfrac must be between 0 and 1 (you specified $minqueryfrac)"); + $memfree and $memfree >= 100_000 || pod2usage ("Error in argument: the --memfree argument must be at least 100000 (in the order of kilobytes)"); + $memtotal and $memtotal >= 100_000 || pod2usage ("Error in argument: the --memtotal argument must be at least 100000 (in the order of kilobytes)"); + + if ($chromosome) { + my @chr = split (/,/, $chromosome); + for my $i (0 .. @chr-1) { + if ($chr[$i] =~ m/^(\d+)-(\d+)$/) { + for my $j ($1 .. 
$2) {
						$valichr{$j}++;
					}
				} else {
					$valichr{$chr[$i]}++;
				}
			}
		printerr "NOTICE: These chromosomes in database will be examined: ", join (",", sort keys %valichr), "\n";
	}
}


# Gene-based annotation driver.
#
# Opens the query file ($queryfile) for reading and three output files named
# after $outfile (.variant_function, .exonic_variant_function, .invalid_input),
# loads the gene annotation with readUCSCGeneAnnotation ($dbloc), and then hands
# the query file to newprocessNextQueryBatchByGene in batches of up to
# $batchsize lines until the input is exhausted.
#
# Takes no arguments and has no explicit return value. It relies on variables
# declared outside this sub ($queryfile, $outfile, $dbloc, $batchsize, $time)
# and on the bareword handles OUT, EXONIC and INVALID; those handles are kept as
# barewords because other subs in this file print to them directly.
#
# Dies if any file cannot be opened, or if an output file cannot be closed
# cleanly. The .invalid_input file is deleted when no invalid line was counted.
sub annotateQueryByGene {
	my ($queryfh);							#query file handle
	my ($totalquerycount, $totalinvalidcount, $batchcount) = qw/0 0 1/;

	#three-argument open: the file name can no longer smuggle in a mode character or a pipe
	#(note: this also means a shell pipe expression is no longer accepted as --queryfile)
	open ($queryfh, '<', $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n";

	open (OUT, '>', "$outfile.variant_function") or die "Error: cannot write to output file $outfile.variant_function: $!\n";
	open (EXONIC, '>', "$outfile.exonic_variant_function") or die "Error: cannot write to output file $outfile.exonic_variant_function: $!\n";
	open (INVALID, '>', "$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n";

	my ($genedb, $geneidmap, $cdslen, $mrnalen) = readUCSCGeneAnnotation ($dbloc);

	$time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n";
	while (1) {
		my ($linecount, $invalidcount) = newprocessNextQueryBatchByGene ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen);
		$totalquerycount += $linecount;
		$totalinvalidcount += $invalidcount;

		#stop on end of input, not on a short batch: the batch routine reads '#' lines without
		#counting them when $comment is set, so $linecount can be below $batchsize while
		#unread variants still remain in the query file
		eof ($queryfh) and last;

		$batchcount++;
		printerr "NOTICE: Begin processing batch $batchcount (each batch contains $batchsize variants)\n";
	}

	#buffered write errors (for example a full disk) only surface at close, so check the output handles
	close (INVALID) or die "Error: cannot finish writing to output file $outfile.invalid_input: $!\n";
	close (EXONIC) or die "Error: cannot finish writing to output file $outfile.exonic_variant_function: $!\n";
	close (OUT) or die "Error: cannot finish writing to output file $outfile.variant_function: $!\n";
	close ($queryfh);
	$time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n";

	$totalinvalidcount or unlink ("$outfile.invalid_input");	#delete the file as it is empty
	printerr "NOTICE: Finished gene-based annotation on $totalquerycount genetic variants in $queryfile";
	$totalinvalidcount and printerr " (including $totalinvalidcount with invalid format written to $outfile.invalid_input)";
	printerr "\n";
	printerr "NOTICE: Output files were written to $outfile.variant_function, $outfile.exonic_variant_function\n";
}

sub
newprocessNextQueryBatchByGene { + my ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen) = @_; + my (%refseqvar); + + my ($chr, $start, $end, $ref, $obs); + my ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2); + my ($invalid); + my ($linecount, $invalidcount) = qw/0 0/; + + for my $i (1 .. $batchsize) { #process up to batchsize variants + my $nextline = <$queryfh>; #read the next line in variant file + defined $nextline or last; + $nextline =~ s/[\r\n]+$//; + + if ($nextline =~ m/^#/ and $comment) { #comment line start with #, do not include this is $linecount + print OUT "#comment\t$nextline\n"; + next; + } + + $linecount++; #linecount does not include the comment line + $invalid = 0; + + my @nextline = split (/\s+/, $nextline); + ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn]; + if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) { + $invalid++; + } else { + ($ref, $obs) = (uc $ref, uc $obs); + $zerostart and $start++; + $chr =~ s/^chr//; + if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) { + $invalid++; + } elsif ($ref eq '-' and $obs eq '-' #both are empty allele + or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code + or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code + or $start =~ m/[^\d]/ #start is not a number + or $end =~ m/[^\d]/ #end is not a number + or $start > $end #start is more than end + or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref + or $ref eq '-' and $start != $end #length mismatch for insertion + ) { + $invalid++; + } + } + + + + if ($invalid) { + print INVALID $nextline, "\n"; #invalid record found + $invalidcount++; + next; + } + + my (%intronic, %utr5, %utr3, %exonic, %upstream, %downstream, %ncrna, %intergenic, %splicing); + my $foundgenic; #variant found in genic region (between start and end position of a gene in genome) + my ($distl, $distr, $genel, $gener); #for intergenic variant, the 
distance and gene name to the left and right side of gene + my $bin1 = int ($start/$genomebinsize)-1; #start bin + $bin1 < 0 and $bin1=0; + my $bin2 = int ($end/$genomebinsize)+1; #end bin (usually same as start bin, unless the query is really big that spans multiple megabases) + + while (not exists $genedb->{$chr, $bin1} and $bin1 > int ($start/$genomebinsize)-$expandbin) { #examine at least 5 bins (by default 5Mb) to the left to make sure that a gene is found in the bin + $bin1 > 0 or last; + $bin1--; + } + + while (not exists $genedb->{$chr, $bin2} and $bin2 < int ($end/$genomebinsize)+$expandbin) { #examine at least 5 bins (by default 5Mb) to the right to make sure that a gene is found in the bin + $bin2++; + } + + my (%seen); + for my $nextbin ($bin1 .. $bin2) { + exists $genedb->{$chr, $nextbin} or next; #this genome bin has no annotated gene (a complete intergenic region) + for my $nextgene (@{$genedb->{$chr, $nextbin}}) { #when $genedb->{$chr, $nextbin} is undefined, this automatically create an array!!! 
+ ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2) = @$nextgene; + defined $name2 or printerr "WARNING: name2 field is not provided for transcript $name (start=$txstart end=$txend)\n" and $name2=''; + $seen{$name, $txstart} and next; #name and txstart uniquely identify a transcript and chromosome position (sometimes same transcript may map to two nearby positions, such as nearby segmental duplications) + $seen{$name, $txstart}++; #a transcript may be in two adjacent bins, so if one is already scanned, there is no need to work on it again + + if ($transfun) { #variant_function output contains transcript name, rather than gene name + $name2 = $name; + } + + if (not $foundgenic) { #this variant has not hit a genic region yet + if ($start > $txend) { + defined $distl or $distl = $start-$txend and $genel=$name2; + $distl > $start-$txend and $distl = $start-$txend and $genel=$name2; #identify left closest gene + } + + if ($end < $txstart) { + defined $distr or $distr = $txstart-$end and $gener=$name2; + $distr > $txstart-$end and $distr = $txstart-$end and $gener=$name2; #identify right closest gene + } + } + + if ($end < $txstart) { + #query --- + #gene <-*----*-> + $foundgenic and last; #if found a genic annotation already, end the search of the bins + if ($end > $txstart - $neargene) { + if ($dbstrand eq '+') { + $upstream{$name2}++; + } else { + $downstream{$name2}++; + } + } else { + last; #if transcript is too far away from end, end the search of the bins + } + } elsif ($start > $txend) { + #query --- + #gene <-*----*-> + if (not $foundgenic and $start < $txend + $neargene) { + if ($dbstrand eq '+') { + $downstream{$name2}++; + } else { + $upstream{$name2}++; + } + } + } elsif ($cdsstart == $cdsend+1) { #non-coding RNA (could be microRNA, or could be due to lack of CDS annotation for mRNA such as NR_026730 or BC039000). 
Previously we already did cdsstart++ so here the cdsstart is more than cdsend + if ($start >= $txstart and $start <= $txend or $end >= $txstart and $end <= $txend or $start <= $txstart and $end >= $txend) { + $ncrna{$name2}++; + $foundgenic++; + } + } else { #query overlaps with coding region of gene + my ($lenintron) = (0); #cumulative intron length at a given exon + my ($rcdsstart, $rvarstart, $rvarend); #start of coding and variant in reference mRNA sequence + my @exonstart = @$exonstart; + my @exonend = @$exonend; + my $foundexonic; + if ($dbstrand eq '+') { #forward strand, search from left to right (first exon to last exon) + for my $k (0 .. @exonstart-1) { + $k and $lenintron += ($exonstart[$k]-$exonend[$k-1]-1); #calculate cumulative intron length + if ($cdsstart >= $exonstart[$k]) { #calculate CDS start accurately by considering intron length + $rcdsstart = $cdsstart-$txstart-$lenintron+1; + } + + #splicing calculation + if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) { + $splicing{$name2}++; #when query start site is close to exon start or exon end + } + if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) { + $splicing{$name2}++; #when query end site is close to exon start or exon end + } + if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) { + $splicing{$name2}++; #when query encompass the exon/intron boundary + } + + if ($start < $exonstart[$k]) { + if ($end >= $exonstart[$k]) { #exonic + $rvarstart = $exonstart[$k]-$txstart-$lenintron+1; + + for my $m ($k .. 
@exonstart-1) { + $m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1); + if ($end < $exonstart[$m]) { + #query -------- + #gene <--**---******---****----> + $rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1); + last; + } elsif ($end <= $exonend[$m]) { + #query ----------- + #gene <--**---******---****----> + $rvarend = $end-$txstart-$lenintron+1; + last; + } + } + if (not defined $rvarend) { + $rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion + } + + #here the trick begins to differentiate UTR versus coding exonic + if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns + #query ---- + #gene <--*---*-> + $utr5{$name2}++; #positive strand for UTR5 + } elsif ($start > $cdsend) { + #query ---- + #gene <--*---*-> + $utr3{$name2}++; #positive strand for UTR3 + } else { + $exonic{$name2}++; + $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline]; #refseq CDS start, refseq variant start + } + $foundgenic++; + last; + } elsif ($k and $start > $exonend[$k-1]) { #intronic + $intronic{$name2}++; + $foundgenic++; + last; + } + } elsif ($start <= $exonend[$k]) { #exonic + $rvarstart = $start-$txstart-$lenintron+1; + + for my $m ($k .. 
@exonstart-1) { + $m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1); + if ($end < $exonstart[$m]) { + #query ------ + #gene <--**---******---****----> + $rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1); + last; + } elsif ($end <= $exonend[$m]) { + #query ----------- + #gene <--**---******---****----> + $rvarend = $end-$txstart-$lenintron+1; + last; + } + } + if (not defined $rvarend) { + $rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion + } + + #here is the trick begins to differentiate UTR versus coding exonic + if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns + #query ---- + #gene <--*---*-> + $utr5{$name2}++; #positive strand for UTR5 + } elsif ($start > $cdsend) { + #query ---- + #gene <--*---*-> + $utr3{$name2}++; #positive strand for UTR3 + } else { + $exonic{$name2}++; + $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline]; #queryindex, refseq CDS start, refseq variant start + } + $foundgenic++; + last; + } + } + } elsif ($dbstrand eq '-') { #process negative strand (in the future, this should be fused to the paragraph above for positive strands; for now, I keep them separate for easier debugging) + for (my $k = @exonstart-1; $k>=0; $k--) { + $k < @exonstart-1 and $lenintron += ($exonstart[$k+1]-$exonend[$k]-1); + if ($cdsend <= $exonend[$k]) { #calculate CDS start accurately by considering intron length + $rcdsstart = $txend-$cdsend-$lenintron+1; + } + + #splicing calculation + if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) { + $splicing{$name2}++; + } + if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= 
$exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) { + $splicing{$name2}++; + } + if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) { + $splicing{$name2}++; + } + + if ($end > $exonend[$k]) { + if ($start <= $exonend[$k]) { + $rvarstart = $txend-$exonend[$k]-$lenintron+1; + + for (my $m = $k; $m >= 0; $m--) { + $m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1); + if ($start > $exonend[$m]) { + #query -------- + #gene <--**---******---****----> + #$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1); #commented out 2011feb18 + $rvarend = $txend-$exonstart[$m+1]+1-$lenintron + ($exonstart[$m+1]-$exonend[$m]-1); #fixed this 2011feb18 + last; #finsih the cycle!!!!!!!!!!!!!!!!!!! + } elsif ($start >= $exonstart[$m]) { #start within exons + #query ---- + #gene <--**---******---****----> + $rvarend = $txend-$start-$lenintron+1; + last; + } + } + if (not defined $rvarend) { #if rvarend is not found, then the whole tail of gene is covered + $rvarend = $txend-$txstart-$lenintron+1; + } + + #here is the trick begins to differentiate UTR versus coding exonic + if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns + #query ---- + #gene <--*---*-> + $utr3{$name2}++; #negative strand for UTR5 + } elsif ($start > $cdsend) { + #query ---- + #gene <--*---*-> + $utr5{$name2}++; #negative strand for UTR3 + } else { + $exonic{$name2}++; + $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline]; + } + $foundgenic++; + last; + } elsif ($k < @exonstart-1 and $end < $exonstart[$k+1]) { + $intronic{$name2}++; + $foundgenic++; + last; + } + } elsif ($end >= $exonstart[$k]) { + $rvarstart = $txend-$end-$lenintron+1; #all the rvarstart, rvarend are with respect to the cDNA sequence (so rvarstart corresponds to end of variants) + + for (my $m = $k; $m >= 0; $m--) 
{ + $m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1); + if ($start > $exonend[$m]) { + #query ---- + #gene <--**---******---****----> + #$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1); #commented out 2011feb18 due to bug (10 42244567 42244600 CACCTTTGCTTGATATGATAATATAGTGCCAAGG - hetero) + $rvarend = $txend-$exonstart[$m+1]+1 - $lenintron + ($exonstart[$m+1]-$exonend[$m]-1); #fixed this 2011feb18 + last; #finish the circle of counting exons!!!!! + } elsif ($start >= $exonstart[$m]) { #the start is right located within exon + #query ------- + #gene <--**---******---****----> + $rvarend = $txend-$start-$lenintron+1; + last; #finish the cycle + } + } + if (not defined $rvarend) { #if rvarend is not found, then the whole tail of gene is covered + $rvarend = $txend-$txstart-$lenintron+1; + } + + #here the trick begins to differentiate UTR versus coding exonic + if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns + #query ---- + #gene <--*---*-> + $utr3{$name2}++; #negative strand for UTR5 + } elsif ($start > $cdsend) { + #query ---- + #gene <--*---*-> + $utr5{$name2}++; #negative strand for UTR3 + } else { + $exonic{$name2}++; + $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline]; + } + $foundgenic++; + last; + } + } + } + } + } + } + $foundgenic or $intergenic{$name2}++; + $i =~ m/000000$/ and printerr "NOTICE: Finished analyzing $i query variants\n"; + + + my (@txname, %genename); + + if ($separate) { #separately print out each effect on one line + if (%exonic or %splicing or %intronic or %utr5 or %utr3 or %ncrna or %upstream or %downstream) { + %exonic and print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n"; + %splicing and $end-$start+1<=$splicing_threshold and print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n"; + %intronic and print OUT "intronic\t", 
join(",", sort keys %intronic), "\t", $nextline, "\n"; + %utr5 and print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n"; + %utr3 and print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n"; + %ncrna and print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n"; + %upstream and print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n"; + %downstream and print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n"; + } elsif (%intergenic) { + $genel ||= "NONE"; + $gener ||= "NONE"; + $distl ||= "NONE"; + $distr ||= "NONE"; + print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n"; + } else { + die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n"; + } + } else { + if (@precedence) { + my $foundmatch; + for my $i (0 .. @precedence-2) { + $precedence[$i] eq 'exonic' and %exonic and $foundmatch++; + $precedence[$i] eq 'splicing' and %splicing and $foundmatch++; + $precedence[$i] eq 'intronic' and %intronic and $foundmatch++; + $precedence[$i] eq 'utr5' and %utr5 and $foundmatch++; + $precedence[$i] eq 'utr3' and %utr3 and $foundmatch++; + $precedence[$i] eq 'ncrna' and %ncrna and $foundmatch++; + $precedence[$i] eq 'upstream' and %upstream and $foundmatch++; + $precedence[$i] eq 'downstream' and %downstream and $foundmatch++; + $precedence[$i] eq 'intergenic' and %intergenic and $foundmatch++; + if ($foundmatch) { + for my $j ($i+1 .. 
@precedence-1) { + $precedence[$j] eq 'exonic' and %exonic = (); + $precedence[$j] eq 'splicing' and %splicing = (); + $precedence[$j] eq 'intronic' and %intronic = (); + $precedence[$j] eq 'utr5' and %utr5 = (); + $precedence[$j] eq 'utr3' and %utr3 = (); + $precedence[$j] eq 'ncrna' and %ncrna = (); + $precedence[$j] eq 'upstream' and %upstream = (); + $precedence[$j] eq 'downstream' and %downstream = (); + $precedence[$j] eq 'intergenic' and %intergenic = (); + } + last; + } + } + } + + + if (%exonic) { + if (%splicing and $end-$start+1<=$splicing_threshold) { #a big deletion spanning splicing site is not really a "splicing" mutation + print OUT "exonic;splicing\t", join(",", sort keys %exonic), ";", join (",", sort keys %splicing), "\t", $nextline, "\n"; + } else { + print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n"; + } + } elsif (%splicing) { + print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n"; + } elsif (%ncrna) { + print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n"; + } elsif (%utr5 or %utr3) { + if (%utr5 and %utr3) { + print OUT "UTR5;UTR3\t", join(",", sort keys %utr5), ";", join(",", sort keys %utr3), "\t", $nextline, "\n"; #use ";" to separate UTR5 and UTR3 genes + } elsif (%utr5) { + print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n"; + } else { + print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n"; + } + } elsif (%intronic) { + print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n"; + } elsif (%upstream or %downstream) { + if (%upstream and %downstream) { + print OUT "upstream;downstream\t", join(",", sort keys %upstream), ";", join(",", sort keys %downstream), "\t", $nextline, "\n"; + } elsif (%upstream) { + print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n"; + } else { + print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n"; + } + } elsif (%intergenic) { + $genel 
||= "NONE"; + $gener ||= "NONE"; + $distl ||= "NONE"; + $distr ||= "NONE"; + print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n"; + } else { + die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n"; + } + } + } + %refseqvar and annotateExonicVariants (\%refseqvar, $geneidmap, $cdslen, $mrnalen); + + return ($linecount, $invalidcount); +} + +sub annotateExonicVariants { + my ($refseqvar, $geneidmap, $cdslen, $mrnalen) = @_; + my $refseqhash; + my $function = {}; + my %varinfo; #variants information (same as input line) + + $refseqhash = readSeqFromFASTADB ($refseqvar); + + for my $seqid (keys %$refseqvar) { + for my $i (0 .. @{$refseqvar->{$seqid}}-1) { + my ($refcdsstart, $refvarstart, $refvarend, $refstrand, $index, $exonpos, $nextline) = @{$refseqvar->{$seqid}->[$i]}; + my ($wtnt3, $wtnt3_after, @wtnt3, $varnt3, $wtaa, $wtaa_after, $varaa, $varpos); #wtaa_after is the aa after the wtaa + my ($chr, $start, $end, $ref, $obs); + + my @nextline = split (/\s+/, $nextline); + ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn]; + ($ref, $obs) = (uc $ref, uc $obs); + $zerostart and $start++; + $chr =~ s/^chr//; + + $varinfo{$index} = $nextline; + + if (not $refseqhash->{$seqid}) { #this refseq do not have FASTA sequence so cannot be interrogated + $function->{$index}{unknown} = "UNKNOWN"; + next; + } + + my $fs = (($refvarstart-$refcdsstart) % 3); + if ($refvarstart-$fs-1 > length($refseqhash->{$seqid})) { + printerr "WARNING: Potential database annotation error seqid=$seqid, refvarstart=$refvarstart, fs=$fs, seqlength=", length($refseqhash->{$seqid}), " refcdsstart=$refcdsstart, with inputline=$nextline\n"; + next; + } + + $wtnt3 = substr ($refseqhash->{$seqid}, $refvarstart-$fs-1, 3); + if (length ($refseqhash->{$seqid}) >= $refvarstart-$fs+3) { #going into UTR + $wtnt3_after = substr ($refseqhash->{$seqid}, $refvarstart-$fs+2, 3); + } else { + $wtnt3_after = ''; #last amino acid in the sequence without 
UTR (extremely rare situation) (example: 17 53588444 53588444 - T 414 hetero) + } + @wtnt3 = split (//, $wtnt3); + if (@wtnt3 != 3 and $refvarstart-$fs-1>=0) { #some times there are database annotation errors (example: chr17:3,141,674-3,141,683), so the last coding frame is not complete and as a result, the cDNA sequence is not complete + $function->{$index}{unknown} = "UNKNOWN"; + next; + } + + if ($refstrand eq '-') { #change the observed nucleotide to the reverse strand + $obs = revcom ($obs); + } + + if ($start == $end) { + if ($ref eq '-') { #insertion variant + #the insertion coordinate system in ANNOVAR always uses "position after the current site" + #in positive strand, this is okay + #in negative strand, the "after current site" becomes "before current site" during transcription + #therefore, appropriate handling is necessary to take this into account + #for example, for a trinucleotide GCC with frameshift of 1 and insertion of CCT + #in positive strand, it is G-CTT-CC + #but if the transcript is in negative strand, the genomic sequence should be GC-CCT-C, and transcript is G-AGG-GC + if ($refstrand eq '+') { + if ($fs == 1) { + $varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2]; + } elsif ($fs == 2) { + $varnt3 = $wtnt3[0] . $wtnt3[1] . $wtnt3[2] . $obs; + } else { + $varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2]; + } + } elsif ($refstrand eq '-') { + if ($fs == 1) { + $varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2]; + } elsif ($fs == 2) { + $varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2]; + } else { + $varnt3 = $obs . $wtnt3[0] . $wtnt3[1] . $wtnt3[2]; + } + } + ($wtaa, $wtaa_after, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($wtnt3_after), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1); + $wtaa_after and $wtaa_after eq '*' and $wtaa_after = 'X'; #wtaa_after could be undefined, if the current aa is the stop codon (X) (example: 17 53588444 53588444 - T) + + my $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . 
($refvarstart-$refcdsstart+2) . "ins$obs"; #cDNA level annotation + if (length ($obs) % 3 == 0) { + if ($wtaa eq '*') { #mutation on stop codon + if ($varaa =~ m/\*/) { + $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before + $function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; #stop codon is stil present + } else { + $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; #stop codon is lost + } + } else { + if ($varaa =~ m/\*/) { + $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before + $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,"; + } else { + $function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,"; + } + } + } else { + if ($wtaa eq '*') { #mutation on stop codon + if ($varaa =~ m/\*/) { #in reality, this cannot be differentiated from non-frameshift insertion, but we'll still call it frameshift + $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before + $function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; + } else { + $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; + } + } else { + if ($varaa =~ m/\*/) { + $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before + $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "_$wtaa_after" . ($varpos+1) . "delins$varaa,"; + } else { + $function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . 
"fs,"; + } + } + } + } elsif ($obs eq '-') { #single nucleotide deletion + my $deletent; + if ($fs == 1) { + $deletent = $wtnt3[1]; + $varnt3 = $wtnt3[0].$wtnt3[2].$wtnt3_after; + } elsif ($fs == 2) { + $deletent = $wtnt3[2]; + $varnt3 = $wtnt3[0].$wtnt3[1].$wtnt3_after; + } else { + $deletent = $wtnt3[0]; + $varnt3 = $wtnt3[1].$wtnt3[2].$wtnt3_after; + } + ($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1); + my $canno = "c." . ($refvarstart-$refcdsstart+1) . "del$deletent"; + if ($wtaa eq '*') { #mutation on stop codon + if ($varaa =~ m/\*/) { #stop codon is still stop codon + $function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "X,"; #changed fsdel to nfsdel on 2011feb19 + } else { #stop codon is lost + $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "$varaa,"; + } + } else { + if ($varaa =~ m/\*/) { #new stop codon created + $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "X,"; + } else { + $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,"; + } + } + } elsif (length ($obs) > 1) { #block substitution (since start==end, this changed from 1nt to several nt) + if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) { + $function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,"; + } else { + $function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,"; + } + } else { #single nucleotide substitution variant + my $canno; + if ($fs == 1) { + $varnt3 = $wtnt3[0] . $obs . $wtnt3[2]; + $canno = "c.$wtnt3[1]" . ($refvarstart-$refcdsstart+1) . $obs; + } elsif ($fs == 2) { + $varnt3 = $wtnt3[0] . $wtnt3[1]. 
$obs; + $canno = "c.$wtnt3[2]" . ($refvarstart-$refcdsstart+1) . $obs; + } else { + $varnt3 = $obs . $wtnt3[1] . $wtnt3[2]; + $canno = "c.$wtnt3[0]" . ($refvarstart-$refcdsstart+1) . $obs; + } + ($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1); + + if ($wtaa eq $varaa) { + $wtaa eq '*' and ($wtaa, $varaa) = qw/X X/; #change * to X in the output + $function->{$index}{ssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,"; + } elsif ($varaa eq '*') { + $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa${varpos}X,"; + } elsif ($wtaa eq '*') { + $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos$varaa,"; + } else { + $function->{$index}{nssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,"; + } + } + } elsif ($obs eq '-') { #deletion variant involving several nucleotides + ($wtaa, $varpos) = (translateDNA ($wtnt3), int(($refvarstart-$refcdsstart)/3)+1); #wildtype amino acid, position of amino acid + my ($varposend, $canno); #the position of the last amino acid in the deletion + if ($refvarstart<=$refcdsstart) { #since the first amino acid is deleted, the whole gene is considered deleted + $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:wholegene,"; #it is exonic variant, so the varend has to hit the first exon + } elsif ($refvarend >= $cdslen->{$seqid}+$refcdsstart) { #3' portion of the gene is deleted + $varposend = int ($cdslen->{$seqid}/3); #cdslen should be multiples of 3, but just in case of database mis-annotation + $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($cdslen->{$seqid}+$refcdsstart-1) . "del"; + $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,"; + } elsif (($refvarend-$refvarstart+1) % 3 == 0) { + $varposend = int (($refvarend-$refcdsstart)/3) + 1; + $canno = "c." . 
($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del"; + $function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,"; + } else { + $varposend = int (($refvarend-$refcdsstart)/3) + 1; + $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del"; + $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,"; + } + } else { #block substitution event + if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) { + $function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,"; + } else { + $function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,"; + } + } + } + } + + for my $index (sort {$a<=>$b} keys %$function) { + if ($separate) { #print out each type of exonic mutations separately (one effect in one line), rather than printing out only the most important function + if ($function->{$index}{fsins}) { + print EXONIC "line$index\t", "frameshift insertion\t$function->{$index}{fsins}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{fsdel}) { + print EXONIC "line$index\t", "frameshift deletion\t$function->{$index}{fsdel}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{fssub}) { + print EXONIC "line$index\t", "frameshift substitution\t$function->{$index}{fssub}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{stopgain}) { + print EXONIC "line$index\t", "stopgain SNV\t$function->{$index}{stopgain}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{stoploss}) { + print EXONIC "line$index\t", "stoploss SNV\t$function->{$index}{stoploss}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{nfsins}) { + print EXONIC "line$index\t", "nonframeshift insertion\t$function->{$index}{nfsins}\t", $varinfo{$index}, 
"\n"; + } + if ($function->{$index}{nfsdel}) { + print EXONIC "line$index\t", "nonframeshift deletion\t$function->{$index}{nfsdel}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{nfssub}) { + print EXONIC "line$index\t", "nonframeshift substitution\t$function->{$index}{nfssub}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{nssnv}) { + print EXONIC "line$index\t", "nonsynonymous SNV\t$function->{$index}{nssnv}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{ssnv}) { + print EXONIC "line$index\t", "synonymous SNV\t$function->{$index}{ssnv}\t", $varinfo{$index}, "\n"; + } + if ($function->{$index}{unknown}) { + print EXONIC "line$index\t", "unknown\t$function->{$index}{unknown}\t", $varinfo{$index}, "\n"; + } + } else { #print out only the most important functional changes (for example, chr3:9931279-9931279 G->A can be both non-synonymous and synonymous mutations based on UCSC gene model) + print EXONIC "line$index\t"; + my $sortout; + if ($sortout = $function->{$index}{fsins}) { + $exonsort and $sortout = sortExonicAnnotation ($sortout); + print EXONIC "frameshift insertion\t$sortout\t"; + } elsif ($sortout = $function->{$index}{fsdel}) { + $exonsort and $sortout = sortExonicAnnotation ($sortout); + print EXONIC "frameshift deletion\t$sortout\t"; + } elsif ($sortout = $function->{$index}{fssub}) { + $exonsort and $sortout = sortExonicAnnotation ($sortout); + print EXONIC "frameshift substitution\t$sortout\t"; + } elsif ($sortout = $function->{$index}{stopgain}) { + $exonsort and $sortout = sortExonicAnnotation ($sortout); + print EXONIC "stopgain SNV\t$sortout\t"; + } elsif ($sortout = $function->{$index}{stoploss}) { + $exonsort and $sortout = sortExonicAnnotation ($sortout); + print EXONIC "stoploss SNV\t$sortout\t"; + } elsif ($sortout = $function->{$index}{nfsins}) { + $exonsort and $sortout = sortExonicAnnotation ($sortout); + print EXONIC "nonframeshift insertion\t$sortout\t"; + } elsif ($sortout = $function->{$index}{nfsdel}) { + 
# Order the comma-separated entries of an exonic annotation string.
#
# $anno looks like "GENE:NM_0001:exon3:c.A12G:p.K4R,GENE:NM_0002:exon2:...,"
# i.e. colon-separated fields whose third field is "exonN". Entries are sorted
# by exon number first and by the second field (transcript name) second. The
# return value is the same entries joined with ",", without a trailing comma.
#
# Entries whose third field carries no number (for example
# "GENE:NM_0001:wholegene") or that have fewer than three fields rank as exon 0.
# That is the value the numeric comparison gave them before, so the ordering is
# unchanged, but the keys are now computed once per entry and no longer trigger
# "isn't numeric" / "uninitialized" warnings inside the sort comparator.
sub sortExonicAnnotation {
	my ($anno) = @_;
	defined $anno or return '';

	my @keyed;
	for my $entry (split (/,/, $anno)) {
		my (undef, $transcript, $exon) = split (/:/, $entry);
		#strip the optional "exon" prefix and keep the leading digits; anything else counts as 0
		my $exonnum = (defined $exon and $exon =~ m/^(?:exon)?\s*(\d+)/) ? $1 : 0;
		defined $transcript or $transcript = '';
		push @keyed, [$exonnum, $transcript, $entry];
	}

	#first sort by exon number, then by transcript name
	my @sorted = sort {$a->[0] <=> $b->[0] or $a->[1] cmp $b->[1]} @keyed;
	return join (',', map {$_->[2]} @sorted);
}
($linecount, $batchlinecount, $invalid, $invalidcount) = (0, 0); + my ($chr, $start, $end, $ref, $obs, $info); + while (1) { + $_ = <QUERY>; + if (not defined $_) { + $filedone++; + } else { + s/[\r\n]+$//; + + if (m/^#/ and $comment) { #comment line start with #, do not include this is $linecount + print FIL "$_\n"; + print DROPPED "#comment\t#comment\t$_\n"; + next; + } + + $linecount++; + $batchlinecount++; + if ($batchlinecount == $batchsize) { + $batchdone++; + } + + if ($memfree or $memtotal) { #if these arguments are specified + if ($linecount =~ m/00000$/) { #about 40Mb memory per 10k lines for a typical input dataset + my ($availmem, $allmem) = currentAvailMemory(); + $verbose and printerr "NOTICE: Current available system memory is $availmem kb (this program uses $allmem bytes memory), after reading $linecount query\n"; + if ($availmem and $availmem <= $memfree+50_000) { #some subsequent steps may take ~50Mb memory, so here we try to allocate some more memory + $batchdone++; + } + if ($memtotal and $allmem >= $memtotal-50_000) { #when --memtotal is specified, ensure that program use less memory + $batchdone++; + } + } + } + + $invalid = 0; #reset invalid status + + my @nextline = split (/\s+/, $_); + ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn]; + if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) { + $invalid++; + } else { + ($ref, $obs) = (uc $ref, uc $obs); + $zerostart and $start++; + $chr =~ s/^chr//; + if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) { + $invalid++; + } elsif ($ref eq '-' and $obs eq '-' #both are empty allele + or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code + or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code + or $start =~ m/[^\d]/ #start is not a number + or $end =~ m/[^\d]/ #end is not a number + or $start > $end #start is more than end + or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref + or $ref eq '-' and $start != $end 
# Filter one batch of query variants against the configured filter database.
#
# $variant is a hash reference keyed by ($chr, $start, $obs) joined with $;.
# Each value is a newline-joined string whose first element is the reference
# allele and whose remaining elements are the original query lines.
#
# The database file is chosen from $dbtype1 and parsed record by record
# according to $dbtype. Every record is reduced to a ($chr, $start, $obs) key
# using this encoding for $obs:
#   "0" . sequence      insertion
#   length              deletion
#   length . sequence   block substitution
#   sequence            single-base change
# Query lines whose key matches a record are printed to DROPPED, prefixed with
# $dbtype and the record's score, and removed from %$variant. Whatever is left
# after the scan is printed to FIL. Nothing is returned.
#
# Changes from the previous version: the database handle is opened with
# three-argument open and closed once the scan is finished, and the final loop
# no longer computes $len/$obs/$end values that were never used.
sub filterNextBatch {
	my ($variant) = @_;
	my $dbfile;

	if ($dbtype1 eq 'generic') {
		$dbfile = File::Spec->catfile ($dbloc, $genericdbfile);
	} elsif ($dbtype1 eq 'vcf') {
		$dbfile = File::Spec->catfile ($dbloc, $vcfdbfile);
	} else {
		$dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt");
	}

	open (DB, '<', $dbfile) or die "Error: cannot read from input database file $dbfile: $!\n";
	printerr "NOTICE: Scanning filter database $dbfile...";

	my (@record, $chr, $start, $end, $ref, $obs, $score, $qual, $fil, $info);
	my ($rsid, $strand, $ucscallele, $twoallele, $class, $af, $attribute);
	my $count_invalid_dbline;
	while (<DB>) {
		my (@obs2, @score2);		#for 1000G2010 data set in VCF format, some tri-allelic SNPs are present; in the future, some quad-allelic SNPs may be also present in VCF files
		s/[\r\n]+$//;
		m/\S/ or next;			#skip empty lines in the database file (sometimes this occurs)
		m/^#/ and next;			#skip the comment line
		if ($dbtype eq 'avsift') {
			@record = split (/\t/, $_);
			@record == 8 or die "Error: invalid record found in DB file $dbfile (8 tab-delimited fields expected): <$_>\n";
			($chr, $start, $end, $ref, $obs, $score) = @record;
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			if ($score < $sift_threshold) {		#this is a deleterious mutation, skip it (equal sign should not be used, otherwise the score=0 will be skipped)
				next;
			}
		} elsif ($dbtype =~ m/^ljb_/) {
			@record = split (/\t/, $_);
			@record >= 5 or die "Error: invalid record found in DB file $dbfile (at least 5 tab-delimited fields expected): <$_>\n";
			($chr, $start, $end, $ref, $obs, $score) = @record;
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			if (defined $score and defined $score_threshold and $score < $score_threshold) {
				next;
			}
		} elsif ($dbtype =~ m/^snp\d+/) {
			@record = split (/\t/, $_, -1);		#-1 is required because some dbSNP records have many empty tab fields in the end
			@record == 18 or @record == 26 or die "Error: invalid record found in dbSNP database file $dbfile (18 or 26 fields expected but found ${\(scalar @record)}): <$_>\n" . join("\n",@record);
			$record[1] =~ s/^chr// or die "Error: invalid record found in DB file (2nd field should start with 'chr'): <$_>\n";
			($chr, $start, $end, $rsid, $strand, $ucscallele, $twoallele, $class) = @record[1,2,3,4,6,8,9,11];
			$start++;			#UCSC use zero-start system
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			unless ($class eq 'single' or $class eq 'deletion' or $class eq 'in-del' or $class eq 'insertion') {	#enum('unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion')
				next;
			}

			my @allele = split (/\//, $twoallele);

			#before Jan 2011, only di-allelic SNPs are handled in ANNOVAR
			#@allele == 2 or next;		#many entries have no allele information (for example, rs71010435)
			#in Jan 2011 version, I decided to handle tri-allelic and quad-allelic SNP as well

			@allele >= 2 or next;		#Jan 2011 modification
			if ($strand eq '-') {		#handle reverse strand annotation (the vast majority of records in dbSNP should be already in + strand)
				for my $i (0 .. @allele-1) {
					$allele[$i] = revcom ($allele[$i]);
				}
				#$ucscallele = revcom ($ucscallele);	#added Jan 24, 2011 (per Kevin Ha) removed Feb 10, 2011 (per Eric Stawiski)
				#note that some SNPs (e.g., rs28434453) may have multiple location in diferent chromosome or strand; I may want to handle this by a special flag in the future
				#585 chr1 13301 13302 rs28434453 0 - C C C/T genomic single etc...
				#1367 chr15 102517867 102517868 rs28434453 0 + G G C/T genomic single etc...
			}

			#in-del is usually annotated below, so they require special treatment
			#587 chr1 384538 384539 rs3971283 0 + T T -/ATT genomic in-del unknown 0 0 unknown exact 3
			if ($class eq 'in-del') {		#indel are usually annotated as -/xxx, where xxx is the alternative allele
				$obs = length ($ucscallele) . $allele[1];		#prefix a number before the alleles, indicating block substitution
				defined $allele[1] or die "no allele 1 <$_>";
			} elsif ($class eq 'insertion') {
				$start--;
				$obs = "0$allele[1]";
			} elsif ($class eq 'deletion') {
				$obs = length ($ucscallele);
			} else {
				#class 'single': every allele other than the reference one becomes an observed allele, each scored with the rsid
				for my $i (0 .. @allele-1) {
					if ($ucscallele eq $allele[$i]) {
						@obs2 = @allele;
						splice (@obs2, $i, 1);
						for my $j (0 .. @obs2-1) {
							push @score2, $rsid;
						}
					}
				}
				if (@obs2) {
					$obs = shift @obs2;
					$score = shift @score2;
				} else {
					$verbose and printerr ("Database error: wildtype base $ucscallele is not part of the allele description in <$_>\n");
					next;
				}
			}
			$score = $rsid;
		} elsif ($dbtype =~ m/^1000g_(\w+)/ or $dbtype =~ m/^1000g2010_(\w+)/ or $dbtype =~ m/^1000g2010\w\w\w_(\w+)/) {	#dbtype1 should NOT be used here
			@record = split (/\t/, $_);
			@record == 5 or @record == 6 or die "Error: invalid record found in 1000G database file $dbfile (5 or 6 fields expected): <$_>\n";
			($chr, $start, $ref, $obs, $af) = @record;		#there is no "END" in 1000G input file
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			if ($maf_threshold) {
				if ($af > 0.5) {		#the frequency is the non-reference allele frequency, which could exceed 0.5
					1-$af >= $maf_threshold or next;
				} else {
					$af >= $maf_threshold or next;
				}
			}
			$score = $af;
		} elsif ($dbtype eq 'generic') {
			($chr, $start, $end, $ref, $obs, $score) = split (/\t/, uc $_);		#make sure to use upper case, as query is always in upper case
			defined $obs or die "Error: the generic database file must contains at least five tab-delimited fields per line (but observed line: $_)\n";
			defined $score or $score = "NA";
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			defined $obs or die "Error: invalid record found in DB file $dbfile (at least 5 fields expected for 'generic' dbtype): <$_>\n";		#unreachable: $obs was already checked above
			if ($start == $end and $ref eq '-') {		#insertion
				$obs = "0$obs";
			}
			#NOTE(review): a record with $start == $end and a multi-base $obs gets no length prefix here; confirm this matches how the query-side keys are built
			if ($obs eq '-') {		#deletion
				$obs = $end-$start+1;
			} elsif ($start != $end) {		#block substitution
				$obs = ($end-$start+1) . $obs;
			}
			if (defined $score and defined $score_threshold and $score < $score_threshold) {
				next;
			}
		} elsif ($dbtype eq 'vcf') {		#vcf file is adopted by 1000 Genomes Project; it can describe both SNPs and indels, and it may contain both summary level statistics and individual level genotype calls
			($chr, $start, $rsid, $ref, $obs, $qual, $fil, $info) = split (/\t/, $_);
			if ($chromosome) {
				$valichr{$chr} or next;
			}

			my ($ac, $an);

			if ($info =~ m/AF=([^;]+)/) {
				$score = $1;
				if ($obs =~ m/(\w),(\w)/) {		#1000G November; this format is not really valid because it does not handle tri-allelic SNP
					($obs, @obs2) = ($1, $2);
					@score2 = ($score);
				}
			} elsif ($info =~ m/AC=(\S+?);AN=(\d+)/) {
				my ($alleles, $count) = ($1, $2);
				if ($alleles =~ m/^(\d+),(.+)/) {
					$score = sprintf ("%.3f", $1/$count);
					@score2 = split (/,/, $2);
					@score2 = map {sprintf("%.3f", $_/$count)} @score2;
					($obs, @obs2) = split (/,/, $obs);		#the obs is composed of two alleles
				} else {
					$af = sprintf ("%.3f", $alleles/$count);
					$score = $af;
					#this is an invalid record in 1000GJuly: 1 2266231 rs11589451 C T,A . PASS AA=c;AC=20;AN=120;DP=237
					if ($obs =~ m/(\w),/) {
						$count_invalid_dbline++;
						$verbose and printerr "WARNING: Invalid input line found in $dbfile (more than one alleles are observed, but only one is annotated with allelic counts): <$_>\n";
						next;
					}
				}
			} else {
				#NOTE(review): this stops the whole program via a bare exit (no explicit status) after printing only a warning
				printerr "WARNING: the VCF file does not contain allele frequency information. ANNOVAR cannot process this file\n";
				exit;
			}

			if (length ($ref) == 1 and length ($obs) == 1) {	#single base substitution
				1;		#the obs and obs2 is already handled
			} elsif ($obs =~ m/^\-((\w)(\w*))$/) {		#deletion (1000G March)
				$2 eq $ref or $ref eq 'N' or die "Error: mismatch of deleted allele and reference allele: <$_>\n";
				$obs = length ($1);
			} elsif ($obs =~ m/^\+(\w+)$/) {		#insertion (1000G March)
				$obs = "0$1";
			} elsif ($ref =~ m/^[ACGTN]+$/ and $obs =~ m/^[ACGTN]+$/) {
				if (length ($obs) == 1) {		#deletion (1000G July)
					substr ($ref, 0, 1) eq $obs or die "Error: mismatch of deleted allele and reference allele: ref=$ref obs=$obs in <$_>\n";
					$start++;
					$obs = length ($ref)-1;
				} elsif (length ($ref) == 1) {		#duplication (1000G July)
					substr ($obs, 0, 1) eq $ref or die "Error: mismatch of duplicated allele and reference allele: ref=$ref obs=$obs in <$_>\n";
					$start++;
					$obs = "0" . substr ($obs, 1);
				}
			} else {
				die "Error: invalid record found in VCF file: ref=$ref obs=$obs <$_>\n";
			}
		} else {
			die "invalid dbtype: $dbtype\n";
		}

		#report and remove the query variants matching the primary observed allele
		if ($variant->{$chr, $start, $obs}) {
			my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs});	#most likely, only one piece of information
			for my $i (0 .. @info-1) {
				print DROPPED join ("\t", $dbtype, $score), "\t", $info[$i], "\n";
			}
			delete $variant->{$chr, $start, $obs};
		}
		#then do the same for any additional alleles of a multi-allelic record
		if (@obs2) {
			for my $j (0 .. @obs2-1) {
				if ($variant->{$chr, $start, $obs2[$j]}) {
					my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs2[$j]});	#most likely, only one piece of information
					for my $i (0 .. @info-1) {
						print DROPPED join ("\t", $dbtype, $score2[$j]), "\t", $info[$i], "\n";
					}
					delete $variant->{$chr, $start, $obs2[$j]};
				}
			}
		}
	}
	close (DB);

	#whatever was not matched by the database passes the filter
	for my $key (keys %$variant) {
		my ($ref, @info) = split (/\n/, $variant->{$key});
		for my $i (0 .. @info-1) {
			print FIL $info[$i], "\n";
		}
	}
	printerr "Done\n";
	$count_invalid_dbline and printerr "WARNING: $count_invalid_dbline lines in dbfile $dbfile were ignored due to invalid formats\n";
}
$invalid++; + } else { + ($ref, $obs) = (uc $ref, uc $obs); + $zerostart and $start++; + $chr =~ s/^chr//; + if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) { + $invalid++; + } elsif ($ref eq '-' and $obs eq '-' #both are empty allele + or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code + or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code + or $start =~ m/[^\d]/ #start is not a number + or $end =~ m/[^\d]/ #end is not a number + or $start > $end #start is more than end + or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref + or $ref eq '-' and $start != $end #length mismatch for insertion + ) { + $invalid++; + } + } + + + if ($invalid) { + print INVALID $_, "\n"; #invalid record found + $invalidcount++; + next; + } + + my $bin1 = int ($start/$genomebinsize); #start bin + my $bin2 = int ($end/$genomebinsize); #end bin (usually same as start bin, unless the query is really big that spans multiple megabases) + my ($foundhit, $score, $name); + for my $bin ($bin1 .. 
$bin2) { + for my $nextgene (@{$regiondb->{$chr, $bin}}) { + my ($txstart, $txend, $txscore, $txname) = @$nextgene; + + if ($end < $txstart) { + #db: <-------------------------> + #query: <---> + last; #if genomic region is too far away from end, end the search of the bins + } elsif ($end <= $txend) { #query contained completely within db region + if ($start >= $txstart) { + #db: <--------------------------> + #query: <------------------> + } else { #query overlap but upstream of db region + #db: <-------------------------> + #query: <----------------------> + if ($minqueryfrac) { + if (($end-$txstart+1)/($end-$start+1) < $minqueryfrac) { + next; + } + } + } + $foundhit++; + $score ||= $txscore; $name ||= $txname; + if ($score < $txscore) { + $score = $txscore; + $name=$txname; + } + if ($score == $txscore and defined $name and $name ne $txname) { + $name .= ",$txname"; + } + if ($dbtype1 eq 'cytoBand') { #a new chromosome band is encountered + $name ne $txname and $name .= ",$txname"; + } + } elsif ($start <= $txend) { + if ($start >= $txstart) { #query overlap but downstream of db region + #db: <------------------------> + #query: <-----------------------> + if ($minqueryfrac) { + if (($txend-$start+1)/($end-$start+1) < $minqueryfrac) { + next; + } + } + } else { + #db region completely contained within query + #db: <-------------------------> + #query: <------------------------------> + if ($minqueryfrac) { + if (($txend-$txstart+1)/($end-$start+1) < $minqueryfrac) { + next; + } + } + } + $foundhit++; + $score ||= $txscore; $name ||= $txname; + if ($score < $txscore) { + $score = $txscore; + $name=$txname; + } + if ($score == $txscore and defined $name and $name ne $txname) { + $name .= ",$txname"; + } + if ($dbtype1 eq 'cytoBand') { #a new chromosome band is encountered + $name ne $txname and $name .= ",$txname"; + } + } else { + #query --- + #gene <-*----*-> + } + } + } + $linecount =~ m/000000$/ and printerr "NOTICE: Finished processing $linecount variants in 
# Load the GFF3 file named by $dbloc/$gff3dbfile into a binned region index.
#
# Returns two hash references:
#   \%regiondb  keyed by ($chr, $bin) joined with $;, where a region is filed
#               under every bin from int($start/$genomebinsize) to
#               int($end/$genomebinsize); each value is a list of
#               [$start, $end, $score, $name] sorted by $start
#   \%parent    maps a feature's ID to the concatenation of its Parent= ids
#               NOTE(review): the ids are appended without any separator, so
#               several parents run together; confirm that is intended
#
# Dies when the file is missing, when the first line is not a
# "##gff-version 3" header, when a record does not have 9 tab-separated
# fields, or when a record has no ID attribute. Records scoring below
# $score_threshold are skipped when that threshold is defined.
#
# Changes from the previous version:
#  - the "##FASTA" terminator is tested before the generic comment test; it
#    used to sit after "m/^#/ and next" and could never fire, so sequence
#    lines following it were parsed as records and aborted the run
#  - an empty file reports the invalid-header error instead of matching
#    against an undefined value
#  - blank lines are skipped rather than rejected as malformed records
#  - the file is opened with three-argument open and read into a lexical
#    line variable, so the caller's $_ is left alone
sub readGFF3RegionAnnotation {
	my ($regioncount, $dbcount) = (0, 0);
	my (%regiondb, %parent);

	my $dbfile = File::Spec->catfile ($dbloc, $gff3dbfile);
	-f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc -buildver $buildver' to download annotation database.\n";

	open (DB, '<', $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
	printerr ("NOTICE: Reading annotation database $dbfile ... ");

	my $header = <DB>;
	defined $header or $header = '';		#empty file: let the header check below report it
	$header =~ m/^##gff-version\s+3/ or die "Error: invalid header line found in the GFF3 database $dbfile (expect to see '##gff-version 3'): <$header>\n";

	while (my $line = <DB>) {
		$line =~ m/^##FASTA/ and last;		#reached the FASTA sequence section of GFF3 file (must be tested before the comment rule)
		$line =~ m/^#/ and next;		#skip comments line
		$line =~ m/\S/ or next;			#skip blank lines
		$dbcount++;
		$line =~ s/[\r\n]+$//;			#deleting the newline characters
		my @record = split (/\t/, $line);
		@record == 9 or die "Error: invalid records found in the GFF3 database $dbfile (9 fields expected): <$line>\n";
		my ($chr, $start, $end, $score, $attribute) = @record[0,3,4,5,8];
		$chr =~ s/^chr//;			#sometimes the chr prefix is present and should be removed (query usually does not contain this chr prefix)
		defined $score_threshold and $score < $score_threshold and next;	#if --score_threshold is set, the low scoring segment will be skipped

		my @feature = split (/;/, $attribute);
		my $name;
		for my $nextfeature (@feature) {
			$nextfeature =~ m/ID=(\S+)/ and $name = $1;	#the last ID= attribute wins
		}
		defined $name or die "Error: invalid record in GFF3 database $dbfile (ID field not found): <$line>\n";
		for my $nextfeature (@feature) {
			if ($nextfeature =~ m/Parent=(.+)/) {
				for my $nextparent (split (/,/, $1)) {
					$parent{$name} .= $nextparent;
				}
			}
		}

		my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			push @{$regiondb{$chr, $nextbin}}, [$start, $end, $score, $name];
		}
		$regioncount++;
		if ($verbose and $dbcount =~ m/000000$/) {
			my ($availmem, $allmem) = currentAvailMemory();
			printerr ("NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n");
		}
	}
	close (DB);

	for my $key (keys %regiondb) {		#pre-sort gene DB by txstart to facilitate future use
		@{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
	}
	printerr ("Done with $regioncount regions\n");
	return (\%regiondb, \%parent);
}
$bin2) { + push @{$regiondb{$chr, $nextbin}}, [$start, $end, 0, 'NA']; + } + $regioncount++; + if ($verbose and $dbcount =~ m/000000$/) { + my ($availmem, $allmem) = currentAvailMemory(); + printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n"; + } + } + close (DB); + + for my $key (keys %regiondb) { #pre-sort gene DB by txstart to faciliate future use + @{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}}; + } + printerr "Done with $regioncount regions\n"; + return (\%regiondb); +} + +sub readUCSCRegionAnnotation { + my ($dbfile); + my ($regioncount, $dbcount) = (0, 0); + my (@record, %regiondb); + my ($chr, $start, $end, $score, $normscore, $name); + my ($expectedLength, @positionCols, @scoreCols, @colsToOutput); + + if ($dbtype1 =~ m/^mce(\d+way)$/) { + $dbfile = File::Spec->catfile ($dbloc, "${buildver}_phastConsElements$1.txt"); + } else { + $dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt"); + } + -f $dbfile or die "Error: required database $dbfile does not exists. 
Please use 'annotate_variation.pl -downdb $dbtype $dbloc' to download annotation database.\n"; + + #################$$$ + ### The following SWITCH structure is modified Jan 2011 to faciliate future expansion + ### $expectedLength is the number of cols expected in each line + ### @postionCols => location of ($chr,$start,$end) columns + ### @scoreCols => location of ($score, $normscore) columns leave empty is set not present (then set to zero below) ; WARNING must be empty or of length 2 + ### @colsToOutPut => location of ($name) columns to put into $name concatinated with ":" below + + if ($dbtype1 =~ m/^phastConsElements\d+way/) { + $expectedLength=6; + @positionCols=(1,2,3); + @scoreCols=(4,5); #normalized score + @colsToOutput=(4); #lod=xxx is the Name output + } elsif ($dbtype1 eq 'evofold') { + $expectedLength=10; + @positionCols=(1,2,3); + @scoreCols=(5,5); + @colsToOutput=(4); + } elsif ($dbtype1 eq 'tfbsConsSites') { + $expectedLength=8; + @positionCols=(1,2,3); + @scoreCols=(7,5); + @colsToOutput=(4); + } elsif ($dbtype1 eq 'wgRna') { + $expectedLength=10; + @positionCols=(1,2,3); + @scoreCols=(5,5); + @colsToOutput=(4); + } elsif ($dbtype1 eq 'targetScanS') { + $expectedLength=7; + @positionCols=(1,2,3); + @scoreCols=(5,5); + @colsToOutput=(4); + } elsif ($dbtype1 eq 'genomicSuperDups') { + $expectedLength=30; + @positionCols=(1,2,3); + @scoreCols=(27,27); + @colsToOutput=(4); + } elsif ($dbtype1 eq 'omimGene') { + $expectedLength=5; + @positionCols=(1,2,3); + @scoreCols=(); + @colsToOutput=(4); + } elsif ($dbtype1 eq 'gwasCatalog') { + $expectedLength=23; + @positionCols=(1,2,3); + @scoreCols=(); + @colsToOutput=(10); + } elsif ($dbtype1 eq 'dgv') { + $expectedLength=16; + @positionCols=(1,2,3); + @scoreCols=(); + @colsToOutput=(4); + } elsif ($dbtype1 eq 'cytoBand') { #special handling required + $expectedLength=5; + @positionCols=(0,1,2); + @scoreCols=(); + @colsToOutput=(0,3); + } elsif ($dbtype1 =~ m/^chr\w+_chainSelf$/) { #example: chr1_selfChain + 
$expectedLength=13; + @positionCols=(2,4,5); + @scoreCols=(12,12); + @colsToOutput=(11); + } elsif ($dbtype1 =~ m/^chr\w+_chain\w+$/) { #example: chr1_chainPanTro2 + $expectedLength=12; + @positionCols=(2,4,5); + @scoreCols=(); + @colsToOutput=(11); + } elsif ($dbtype1 eq 'snp130' or $dbtype1 eq 'snp131') { + $expectedLength=18; + @positionCols=(1,2,3); + @scoreCols=(); + @colsToOutput=(4); + } else { + #other UCSC format if file is not defined above + $expectedLength=''; + @positionCols=(1,2,3); + @scoreCols=(); + @colsToOutput=(4); + } + + if ($scorecolumn) { + @scoreCols = ($scorecolumn, $scorecolumn); + } + + open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n"; + printerr "NOTICE: Reading annotation database $dbfile ... "; + + if ($expectedLength eq '') { # if DB is unknown "generic format" use first line to get $expectedLength : file rewound afterwards + my $line = <DB>; + @record = split (/\t/, $line); + $expectedLength=@record; + seek (DB, 0, 0); + }; + + ########$$ Check to see if user has defined columns to output (intergers or all allowed) + if (defined $colsWanted) { + if ($colsWanted[0] eq 'all') { + @colsToOutput= 0 .. ($expectedLength-1); + } elsif ($colsWanted[0] eq 'none') { + @colsToOutput = (); + } else{ + @colsToOutput = @colsWanted; + } + }; + + ########$$ check that the columns requested exist in the current DB + for my $i (0 .. 
@colsToOutput-1) { + if ($colsToOutput[$i] > $expectedLength) { + die "Error: The DB file $dbfile has only $expectedLength columns but output column $colsToOutput[$i] is requested by --colsWanted!\n"; + } + } + + while (<DB>) { + $dbcount++; + s/[\r\n]+$//; #deleting the newline characters + @record = split (/\t/, $_); + + @record == $expectedLength or die "Error: invalid record in dbfile $dbfile ($expectedLength fields expected): <$_>\n"; + ($chr, $start, $end) = @record[@positionCols]; + if (@colsToOutput) { #I think there should always be a Name in the output column + $name = join (':', @record[@colsToOutput]); + } + + if(@scoreCols){ + ($score, $normscore)=(@record[@scoreCols]) + } else{ + ($score, $normscore) = qw/0 0/; + } + + #########$$ Unusual exceptions for phastCons + if ($dbtype1 =~ m/^phastConsElements\d+way/) { + $score =~ s/^lod=// or die "Error: invalid lod score designation (no 'lod=' found) in dbfile $dbfile: <$_>\n"; + } ##lod= in the score for conservation tracks + + #########$$ Unusual exceptions for cytoBand + if ($dbtype1 eq 'cytoBand' and not defined $colsWanted) { #the name for chromosome band is concatenated as single word + $name =~ s/://; + } + + defined $score_threshold and $score < $score_threshold and next; #if --score_threshold is set, the low scoring segment will be skipped + defined $normscore_threshold and $normscore < $normscore_threshold and next; #if --normscore_threshold is set, the low scoring segment will be skipped + + $chr =~ s/^chr//; + $start++; #due to the zero-opening coordinate system in UCSC + + my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize)); + for my $nextbin ($bin1 .. 
$bin2) { + if ($rawscore) { #print out rawscore, rather than normalized score (default) + $normscore = $score; + } + if (defined $name) { + push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore, $name]; + } else { #name is not requested in the output + push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore]; + } + } + $regioncount++; + if ($verbose and $dbcount =~ m/000000$/) { + my ($availmem, $allmem) = currentAvailMemory(); + printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n"; + } + } + close (DB); + + for my $key (keys %regiondb) { #pre-sort gene DB by txstart to faciliate future use + @{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}}; + } + printerr "Done with $regioncount regions"; + if (defined $score_threshold or $normscore_threshold) { + printerr " (that passed --score_threhsold or --normscore_threshold from a total of $dbcount regions)\n"; + } else { + printerr "\n"; + } + return (\%regiondb); +} + + +sub translateDNA { + my ($seq) = @_; + my ($nt3, $protein); + $seq = uc $seq; + #length ($seq) % 3 == 0 or printerr "WARNING: length of DNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n"; + while ($seq =~ m/(...)/g) { + defined $codon1{$1} or printerr "WARNING: invalid triplets found in DNA sequence to be translated: <$1>\n"; + $protein .= $codon1{$1}; + } + return $protein; +} + +sub translateRNA { + my ($seq) = @_; + my ($nt3, $protein); + $seq = uc $seq; + #length ($seq) % 3 == 0 or printerr "WARNING: length of RNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n"; + while ($seq =~ m/(...)/g) { + defined $codonr1{$1} or printerr "WARNING: invalid triplets found in RNA sequence to be translated: <$1>\n"; + $protein .= $codonr1{$1}; + } + return $protein; +} + +sub revcom { + my ($seq) = @_; + $seq = reverse $seq; + $seq =~ tr/acgtACGT/tgcaTGCA/; + return ($seq); +} + +sub readSeqFromFASTADB { + my ($refseqvar) 
= @_; + my (%seqhash); + my $seqdbfile; + + #the four statements below should be condensed in the future (they are identical) + $seqdbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1" . "Mrna.fa"); + + my ($seqid, $curseq) = ('', ''); + + -f $seqdbfile or die "Error: FASTA sequence file $seqdbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc' download the database.\n"; + open (SEQ, $seqdbfile) or die "Error: cannot read from seqdbfile $seqdbfile: $!\n"; + printerr "NOTICE: Reading FASTA sequences from $seqdbfile ... "; + while (<SEQ>) { + if (m/^>(\S+)/) { + if ($refseqvar->{$seqid}) { + not defined $seqhash{$seqid} and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it (unless the sequence is already read from the file) + } + $seqid = $1; + $curseq = ''; + } else { + if ($refseqvar->{$seqid}) { + s/[\r\n]+$//; + $curseq .= uc $_; #only use upper case characters + } + } + } + if ($refseqvar->{$seqid}) { #finish the last sequence in the file + not defined $seqhash{$seqid} and $seqhash{$seqid} = $curseq; + } + close (SEQ); + printerr "Done with ", scalar keys %seqhash, " sequences\n"; + if (keys %seqhash < keys %$refseqvar) { + my (@seqnotfound, @seqnotfound_example); + for $seqid (keys %$refseqvar) { + exists $seqhash{$seqid} or push @seqnotfound, $seqid; + } + printerr "WARNING: A total of ${\(scalar @seqnotfound)} sequences cannot be found in $seqdbfile"; + @seqnotfound_example = splice (@seqnotfound, 0, 3); + printerr " (example: @seqnotfound_example)\n"; + } + return (\%seqhash); +} + +sub readKgXref { + my ($inputfile) = @_; + my (%gene_xref); + open (XREF, $inputfile) or die "Error: cannot read from kgxref file $inputfile: $!\n"; + while (<XREF>) { + m/^#/ and next; + s/[\r\n]+$//; + my @record = split (/\t/, $_); + @record == 8 or die "Error: invalid record found in knownGene cross-reference file (6 fields expected): <$_>\n"; + #some genes were given names that are prefixed with "Em:" which 
should be removed due to the presence of ":" in exonic variant annotation + #Em:AC006547.7 Em:AC005003.4 Em:U62317.15 Em:AC008101.5 Em:AC004997.11 Em:U51561.2 + $record[4] =~ s/^Em:/Em./; + if ($gene_xref{$record[0]}) { #BC003168 occur twice in kgxref file (OSBPL10, BC003168) + if ($gene_xref{$record[0]} =~ m/^(BC|AK)\d+$/) { + $gene_xref{$record[0]} = $record[4]; + } + } else { + $gene_xref{$record[0]} = $record[4]; + } + } + close (XREF); + return (\%gene_xref); +} + +sub readUCSCGeneAnnotation { #read RefGene annotation database from the UCSC Genome Browser, convert 0-based coordinates to 1-based coordinates + my ($dbloc) = @_; + my ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes); + my (%genedb, %geneidmap, %name2count, %cdslen, %mrnalen); + my ($genecount, $ncgenecount) = (0, 0); + + my $dbfile; + my $kgxref; + + if ($dbtype1 eq 'refGene') { + $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); + } elsif ($dbtype1 eq 'knownGene') { + $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); + my $kgxreffile = File::Spec->catfile($dbloc, $buildver . "_kgXref.txt"); + -f $kgxreffile or die "Error: the knownGene cross-reference file $kgxreffile does not exist. Please use 'annotate_variation.pl --downdb knownGene $dbloc' to download the database.\n"; + $kgxref = readKgXref ($kgxreffile); + } elsif ($dbtype1 eq 'ensGene') { + $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); + } else { + $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); #added 2011feb18 + #die "FATAL ERROR: the dbype $dbtype1 is not supported in the readUCSCGeneAnnotation() subroutine.\n"; #commented 2011feb18 + } + -f $dbfile or die "Error: The gene annotation database $dbfile does not exist. 
Please use 'annotate_variation.pl --downdb $dbtype $dbloc -build $buildver' to download the database.\n"; + + open (GENEDB, $dbfile) or die "Error: cannot read from gene annotaion database $dbfile: $!\n"; + printerr "NOTICE: Reading gene annotation from $dbfile ... "; + while (<GENEDB>) { + s/[\r\n]+$//; #deleting the newline characters + my @record = split (/\t/, $_); + + if ($dbtype1 eq 'refGene') { + @record == 16 or die "Error: invalid record in $dbfile (expecting 16 tab-delimited fields in refGene file): <$_>\n"; + ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15]; #human hg18, mouse + } elsif ($dbtype1 eq 'knownGene') { + @record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in knownGene file): <$_>\n"; #mm8=11, hg18=hg19=12 + ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend) = @record[0..9]; + $name2 = $kgxref->{$name} || $name; + } elsif ($dbtype1 eq 'ensGene') { + @record == 16 or die "Error: invalid record in $dbfile (expecting 16 fields in ensGene file): <$_>\n"; + ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15]; + } else { + @record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in $dbtype1 gene definition file): <$_>\n"; + ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15]; + defined $name2 or $name2=$name; + #die "FATAL ERROR: the --dbtype $dbtype is not supported in readUCSCGeneAnnotation() subroutine.\n"; #commented 2011feb18 + } + + #handle situations where the same transcript is mapped to several chromosomes or regions (for example, NM_019105 is mapped to chr6, chr6_cox_hap1, chr6_qbl_hap2; NM_002538 is 
mapped to chr5 positive and negative strand and also in chr5_h2_hap1) + if ($chr =~ m/hap\d+$/) { + next; #this is a temporary solution on 2011feb19, to ignore alternative haplotype chromosomes + } + + $chr =~ s/^chr// or die "Error: invalid record found in $dbfile (chrom field not found): <$_>\n"; #UCSC always prefix "chr" to the chromosome identifier, so this is a good check to make sure that the file is the correct file + $dbstrand eq '+' or $dbstrand eq '-' or die "Error: invalid dbstrand information found in $dbfile (dbstrand has to be + or -): <$_>\n"; #dbstrand is important to know and cannot be optional + my @exonstart = split (/,/, $exonstart); #remove trailing comma + my @exonend = split (/,/, $exonend); #remove trailing comma + $exoncount == @exonstart or die "Error: invalid record found in $dbfile (exoncount discordance): <$exoncount vs ${\(scalar @exonstart)}>\n"; + @exonstart == @exonend or die "Error: invalid record found in $dbfile (exonstart and exonend count discordance): <${\(scalar @exonstart)} vs ${\(scalar @exonend)}>\n"; + $txstart++; $cdsstart++; map {$_++} @exonstart; #convert 0-based coordinate to 1-based coordinate + + #LOGIC here: + #first calcluate mRNA length, and if the transcript maps to multiple locations with discordant mRNA length, only consider the leftmost chromosome and leftmost coordinate (because the FASTA file is sorted in this manner) + + my $cdslength = 0; + my $mrnalength = 0; + for my $i (0 .. 
@exonstart-1) { #this calculation is valid regardless of strand + $mrnalength += $exonend[$i]-$exonstart[$i]+1; + if ($cdsstart >= $exonstart[$i] and $cdsstart <= $exonend[$i]) { + if ($cdsend <= $exonend[$i]) { + $cdslength = $cdsend-$cdsstart+1; + last; + } else { + $cdslength += $exonend[$i]-$cdsstart+1; + next; + } + } + if ($cdslength and $cdsend < $exonstart[$i]) { + die "FATAL ERROR: impossible scenario for $name in $dbfile (cdsend is less than exon start)"; + } elsif ($cdslength and $cdsend <= $exonend[$i]) { + $cdslength += $cdsend-$exonstart[$i]+1; + last; + } elsif ($cdslength and $cdsend > $exonend[$i]) { + $cdslength += $exonend[$i]-$exonstart[$i]+1; + } + + } + + if ($cdsstart != $cdsend+1) { #coding gene + if (defined $mrnalen{$name} and $mrnalen{$name} != $mrnalength) { + $verbose and printerr "WARNING: $name occurs more than once in $dbfile with different mRNA length. The first occurences with identical mRNA length will be uesd in analysis.\n"; + next; + } + + + if (defined $cdslen{$name} and $cdslen{$name} != $cdslength) { + $verbose and printerr "WARNING: $name occurs more than once in $dbfile with different CDS length. The first occurences with identical mRNA length will be uesd in analysis.\n"; + next; + } + } + + $cdslen{$name} = $cdslength; + $mrnalen{$name} = $mrnalength; + + my ($bin1, $bin2) = (int(($txstart - $neargene)/$genomebinsize), int(($txend + $neargene)/$genomebinsize)); + for my $nextbin ($bin1 .. 
$bin2) { + push @{$genedb{$chr, $nextbin}}, [$name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, [@exonstart], [@exonend], $name2]; + } + $geneidmap{$name} = $name2; + $genecount++; + $name2count{$name2}++; + $cdsstart == $cdsend+1 and $ncgenecount++; #non-coding gene has the same start and end site + } + close (GENEDB); + for my $key (keys %genedb) { #pre-sort gene DB by txstart to faciliate future use + @{$genedb{$key}} = sort {$a->[2] <=> $b->[2]} @{$genedb{$key}}; + } + printerr "Done with $genecount transcripts (including $ncgenecount without coding sequence annotation) for ", scalar (keys %name2count), " unique genes\n"; + return (\%genedb, \%geneidmap, \%cdslen, \%mrnalen); +} + +sub downloadDB { + my ($cwd, $msg, $sc); + + $cwd = Cwd::cwd(); + + -w $dbloc or die "Error: the directory $dbloc is not writable by the current user\n"; + chdir ($dbloc) or die "Error: the directory $dbloc cannot be accessed\n"; + + my (@urlin, @filein, @fileout, %fail); #the fail hash contains index of files that fail to be downloaded + my $count_success; + if ($dbtype1 eq 'refGene') { + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refGene.txt.gz"; + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refLink.txt.gz"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_refGeneMrna.fa.gz"; + } elsif ($dbtype1 eq 'knownGene') { + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/knownGene.txt.gz"; + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/kgXref.txt.gz"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_knownGeneMrna.fa.gz"; + } elsif ($dbtype1 eq 'ensGene') { + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/ensGene.txt.gz"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_ensGeneMrna.fa.gz"; + } elsif ($dbtype1 eq 'seq') { + push @urlin, 
"ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.zip"; #example: hg18, hg19 + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.tar.gz"; #example: panTro2 + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/$buildver.fa.gz"; #example: bosTau4 + } elsif ($dbtype1 =~ m/^mce(\d+way)$/) { #it could be 17 way, 28 way, 30 way, 44 way, etc, depending on genome and on build + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/phastConsElements$1.txt.gz"; + } elsif ($dbtype1 eq 'avsift') { + $buildver eq 'hg18' or $buildver eq 'hg19' or die "Error: currently the --dbtype of avsift only support --buildver of 'hg18' or 'hg19'\n"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_avsift.txt.gz"; + } elsif ($dbtype1 eq '1000g') { #dbtype1 is same as queryfile, when --downdb is used + $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g' only support --buildver of 'hg18'\n"; + push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/CEU.sites.2009_04.gz"; + push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/YRI.sites.2009_04.gz"; + push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/JPTCHB.sites.2009_04.gz"; + } elsif ($dbtype1 eq '1000g2010') { #dbtype1 is same as queryfile, when --downdb is used + $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010' only support --buildver of 'hg18'\n"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_03.txt.gz"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_03.txt.gz"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_03.txt.gz"; + } elsif ($dbtype1 eq '1000g2010jul') { #dbtype1 is same as queryfile, when --downdb is used + $buildver eq 'hg18' or die "Error: currently the 
--dbtype of '1000g2010jul' only support --buildver of 'hg18'\n"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_07.txt.gz"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_07.txt.gz"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_07.txt.gz"; + } elsif ($dbtype1 eq '1000g2010nov') { + $buildver eq 'hg19' or die "Error: currently the --dbtype of '1000g2010nov' only support --buildver of 'hg19'\n"; + push @urlin, "http://www.openbioinformatics.org/annovar/download/hg19_ALL.sites.2010_11.txt.gz"; + } elsif ($dbtype1 eq 'null') { + 1; + } else { + if ($webfrom) { + if ($webfrom eq 'annovar') { + push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_$dbtype1.txt.gz"; + } elsif ($webfrom eq 'ucsc') { + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz"; + } else { + push @urlin, "$webfrom/$dbtype1.txt.gz"; + } + } else { + push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz"; #default goes to UCSC + } + } + + @filein = @urlin; + map {s/.+\///} @filein; + @fileout = @filein; + map {s/\.gz$//; s/\.zip$//} @fileout; + + if ($wget) { + $msg = qx/wget --help 2>&1/ || ''; #collect the output of the system command + } else { + $msg = ''; #when --nowget is specified, do not use wget to retrieve files from Internet + } + if ($msg =~ m/Usage/) { + checkProgramUpdate ("wget"); + for my $i (0 .. @urlin-1) { + printerr "NOTICE: Downloading annotation database $urlin[$i] ... 
"; + if ($verbose) { + $sc = "wget -t 1 -T 10 -O $filein[$i] $urlin[$i]"; + } else { + $sc = "wget -t 1 -T 10 -q -O $filein[$i] $urlin[$i]"; + } + if (system ($sc)) { #time-out is 10 seconds, with 1 retry attempt + printerr "Failed\n"; + $verbose and print "WARNING: unable to execute system command: <$sc>\n"; + unlink ($filein[$i]); #delete the temporary files generated by wget + $fail{$i}++; + } else { + printerr "OK\n"; + $count_success++; + } + } + } else { + eval { + require Net::FTP; + require LWP::UserAgent; + }; + if ($@) { + printerr "WARNING: cannot retrieve remote files automatically (by 'wget' command or by standard Net::FTP/LWP::UserAgent Perl module).\n"; + printerr "Please manually download the following file, uncompress the files to $dbloc directory, then add a ${buildver}_ prefix to the file names.\n"; + printerr join ("\n", @urlin), "\n"; + exit (100); + } + + checkProgramUpdate ("lwp"); + my ($http, $ftp); + for my $i (0 .. @urlin-1) { + printerr "NOTICE: Downloading annotation database $urlin[$i] ... 
"; + if ($urlin[$i] =~ m/^http/) { + $http = LWP::UserAgent->new (timeout=>10, show_progress=>$verbose); + $http->env_proxy; + + my $response = $http->get ($urlin[$i], ':content_file'=>$filein[$i]); + if ($response->is_success) { + printerr "Done\n"; + $count_success++; + } else { + printerr "Failed\n"; + $verbose and printerr "WARNING: cannot retrieve remote files ($urlin[$i]) via LWP::UserAgent Perl module: ", $response->status_line, "\n"; + $fail{$i}++; + } + } elsif ($urlin[$i] =~ m#^ftp://([^\\\/]+)#) { #for hgdownload.cse.ucsc.edu, ftp-trace.ncbi.nih.gov, ftp.ensembl.org, etc + my $urlroot = $1; + if ($ftp = Net::FTP->new($urlroot, Timeout=>10, Debug=>$verbose)) { + $ftp->login("anonymous", 'anonymous@'); + $ftp->binary(); + my $url = $urlin[$i]; + $url =~ s#ftp://[\w\.\-]+/##; #remove the URL root + if (not $ftp->get($url)) { + printerr "Failed\n"; + $verbose and printerr "WARNING: cannot retrieve remote file ($url) in FTP server $urlroot\n"; + $fail{$i}++; + } else { + printerr "Done\n"; + $count_success++; + } + } else { + printerr "Failed\n"; + $verbose and printerr "WARNING: cannot retrieve remote file ($urlin[$i]) via Net::FTP Perl module\n"; + $fail{$i}++; + } + + } else { + die "Error: The URL $urlin[$i] uses an unsupported protocol. Download cannot continue\n"; + } + } + } + + $count_success and printerr "NOTICE: Uncompressing downloaded files\n"; + for my $i (0 .. 
@filein-1) { + $fail{$i} and next; + if ($filein[$i] =~ m/\.zip$/) { + $msg = qx/unzip --help 2>&1/ || ''; #collect the output of the system command + if ($msg =~ m/Usage/i) { + if ($verbose) { + system ("unzip -o $filein[$i]"); + } else { + system ("unzip -o -q $filein[$i]"); + } + } else { + printerr "ERROR: unzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n"; + exit (101); + } + } elsif ($filein[$i] =~ m/\.tar\.gz$/) { #panTro2 FASTA sequence is stored as tar.gz rather than zip + $msg = qx/tar --help 2>&1/ || ''; #collect the output of the system command + if ($msg =~ m/Usage/i) { + system ("tar -x -z -f $filein[$i]"); + } else { + printerr "ERROR: tar/gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n"; + exit (102); + } + } elsif ($filein[$i] =~ m/\.gz$/) { + $msg = qx/gunzip --help 2>&1/ || ''; #collect the output of the system command + if ($msg =~ m/Usage/i) { + system ("gunzip -f $filein[$i]"); + } else { + printerr "ERROR: gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n"; + exit (103); + } + } + } + + for my $i (0 .. 
@fileout-1) { + $fail{$i} and next; #skip the file that failed to be downloaded + my $fileout = $fileout[$i]; + $dbtype1 eq 'seq' and next; #the zip file contains dozens of FASTA files so cannot rename them automatically + if (not $fileout =~ m/^${buildver}_/) { #if the buildver is not the prefix of the files + rename ($fileout, "${buildver}_$fileout") or die "Error: cannot rename $fileout to ${buildver}_$fileout\n"; + $fileout = "${buildver}_$fileout"; + } + if (not $fileout =~ m/\.txt$/ and not $fileout =~ m/\.fa$/) { + rename ($fileout, "$fileout.txt"); + } + } + + $count_success and printerr "NOTICE: Finished downloading annotation files for $buildver build version, with files saved at the '$dbloc' directory\n"; + $cwd and chdir ($cwd); + if (%fail) { + my @failindex = keys %fail; + if ($dbtype1 eq 'seq' and @failindex == 1) { #not really a fail, because for seq, ANNOVAR attempts on tar.gz and zip file + 1; + } else { + printerr "WARNING: Some files cannot be downloaded, including ", join (', ', @urlin[@failindex]), "\n"; + } + + for my $index (@failindex) { + if ($urlin[$index] =~ m#^http://www\.openbioinformatics\.org.+Mrna.fa.gz$#) { + printerr "---------------------------ADDITIONAL PROCEDURE---------------------------\n"; + printerr "--------------------------------------------------------------------------\n"; + printerr "NOTICE: the FASTA file $urlin[$index] is not available to download but can be generated by the ANNOVAR software. 
"; + printerr "PLEASE RUN THE FOLLOWING TWO COMMANDS CONSECUTIVELY TO GENERATE THE FASTA FILES:\n\n"; + printerr "\tannotate_variation.pl --buildver $buildver --downdb seq $dbloc/${buildver}_seq\n"; + printerr "\tretrieve_seq_from_fasta.pl $dbloc/${buildver}_$dbtype1.txt -seqdir $dbloc/${buildver}_seq -format $dbtype1 -outfile $dbloc/${buildver}_${dbtype1}Mrna.fa\n"; + printerr "--------------------------------------------------------------------------\n"; + printerr "--------------------------------------------------------------------------\n"; + } + } + } +} + +sub currentAvailMemory { + my ($availmem, $allmem) = (0, 0); + if ($^O eq "MSWin32") { #no easy solution to get available memory from Windows. + ($availmem, $allmem) = (0, 0); + } elsif ($^O eq 'linux' or $^O eq 'aix' or $^O eq 'solaris') { + if (open (TOP, "top -b -n 1 2>&1 |")) { + my $index; + while (<TOP>) { + if (m/^Mem:.+\s(\d+)k free/) { + $availmem = $1; + } + s/^\s+//; + my @field = split (/\s+/, $_); + @field >= 10 or next; #make sure that the PID lines are reached + if ($field[0] eq 'PID') { + for my $i (0 .. 
@field-1) { + $field[$i] eq 'RES' and $index = $i; + } + } + if ($field[0] eq $$) { + defined $index or die "Error: invalid output from top command: the line with PID and RES is not found\n"; + $allmem = $field[$index]; + if ($allmem =~ m/^([\d\.]+)(\w)$/) { + if ($2 eq 'g') { + $allmem = $1 * 1_000_000; + } elsif ($2 eq 'm') { + $allmem = $1 * 1_000; + } elsif ($2 eq 'k') { + $allmem = $1; + } else { + printerr "WARNING: unrecognizable output from top command: <$_>\n"; + } + } + last; + } + } + } + } else { + ($availmem, $allmem) = (0, 0); + } + return ($availmem, $allmem); +} + +sub printerr { + print STDERR @_; + print LOG @_; +} + +sub checkProgramUpdate { + my ($method) = @_; + my $sc; + my ($curdate, $webdate, $webdate1) = $LAST_CHANGED_DATE; + my (@webcontent); + $method eq 'wget' or $method eq 'lwp' or die "Error: update checking method can be only 'wget' or 'lwp'"; + printerr "NOTICE: Web-based checking to see whether ANNOVAR new version is available ... "; + $LAST_CHANGED_DATE =~ m/LastChangedDate: (\d+)\-(\d+)-(\d+)/ or printerr "Failed\n" and return; + $curdate = $1.$2.$3; + if ($method eq 'wget') { + $sc = "wget -t 1 -T 10 -q -O .annovar_date http://www.openbioinformatics.org/annovar/download/annovar_date"; + if (system ($sc)) { + printerr "Failed\n"; + return; + } else { + if (not open (AVDATE, ".annovar_date")) { + printerr "Cannot access version information\n"; + } else { + printerr "Done\n"; + @webcontent = <AVDATE>; #$LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $'; + close (AVDATE); + unlink (".annovar_date"); + } + } + } elsif ($method eq 'lwp') { + my $http = LWP::UserAgent->new (timeout=>10); + $http->env_proxy; + my $response = $http->get("http://www.openbioinformatics.org/annovar/download/annovar_date"); + if ($response->is_success) { + printerr "Done\n"; + $_ = $response->decoded_content; + @webcontent = split (/\n/, $_); + } else { + printerr "Failed\n"; + return; + } + } + + $webdate = 
$webcontent[0]; + $webdate =~ s/[\r\n]+$//; + $webdate1 = $webdate; + $webdate1 =~ s/\-//g; #remove the - sign in webdate + if ($curdate < $webdate1) { + printerr "----------------------------UPDATE AVAILABLE------------------------------\n"; + printerr "--------------------------------------------------------------------------\n"; + printerr "WARNING: A new version of ANNOVAR (dated $webdate) is available!\n"; + printerr " Download from http://www.openbioinformatics.org/annovar/\n"; + + if (@webcontent >= 2) { + printerr "Changes made in the $webdate version:\n"; + for my $i (1 .. @webcontent-1) { + if ($webcontent[$i] =~ m/^(\d{4})\-(\d{2})\-(\d{2})[\r\n]+$/) { + $webdate = "$1-$2-$3"; + $webdate1 = "$1$2$3"; + if ($curdate >= $webdate1) { #the current version is more recent than this date + last; + } else { + printerr "Changes made in the $webdate version:\n"; + } + } else { + printerr " * $webcontent[$i]"; + } + } + } + printerr "--------------------------------------------------------------------------\n"; + printerr "--------------------------------------------------------------------------\n"; + } +} + +=head1 SYNOPSIS + + annotate_variation.pl [arguments] <query-file|table-name> <database-location> + + Optional arguments: + -h, --help print help message + -m, --man print complete documentation + -v, --verbose use verbose output + + Arguments to download databases or perform annotations + --downdb download UCSC Genome Browser annotation database + --geneanno annotate variants by functional consequences on genes + --regionanno annotate variants by targetting specific genomics regions + --filter filter variants based on a position list + --webfrom <string> specify the source of database (default usually works fine) + + Arguments to control input and output files + --outfile <file> output file prefix + --zerostart input query file uses half-open zero-start coordinate + --dbtype <string> database type + --buildver <string> genome build version (default: hg18 for 
human) + --gff3dbfile <file> specify the GFF3 DB file used in region-based annotation + --genericdbfile <file> specify the generic DB file used in filter-based annotation + --vcfdbfile <file> specify the DB file in VCF format in filter-based annotation + --bedfile <file> specify a BED file in region-based annotation + --time print out local time during program run + --separate separately print out all function of a variant (default: one line per variant) + --colsWanted <string> specify which columns to output in -regionanno by comma-delimited numbers + --comment print out comment line (those starting with #) in output files + --scorecolumn <int> the column with scores in database file (for region-based annotation) + --exonsort sort the exon number in output line (for gene-based annotation) + --transcript_function use transcript name rather than gene name in gene-based annotation output + + Arguments to fine-tune the annotation procedure + --batchsize <int> batch size for processing variants per batch (default: 5m) + --genomebinsize <int> bin size to speed up search (default: 100k for -geneanno, 10k for -regionanno) + --expandbin <int> check nearby bin to find neighboring genes (default: 2m/genomebinsize) + --neargene <int> distance threshold to define upstream/downstream of a gene + --score_threshold <float> minimum score of DB regions to use in annotation + --normscore_threshold <float> minimum normalized score of DB regions to use in annotation + --rawscore output includes the raw score (not normalized score) in UCSC Browser Track + --minqueryfrac <float> minimum percentage of query overlap to define match to DB (default: 0) + --splicing_threshold <int> distance between splicing variants and exon/intron boundary (default: 2) + --maf_threshold <float> filter 1000G variants with MAF above this threshold (default: 0) + --sift_threshold <float> SIFT threshold for deleterious prediction (default: 0.05) + --precedence <string> comma-delimited to specify precedence of 
variant function (default: exonic>intronic...) + + Arguments to control memory usage + --memfree <int> ensure minimum amount of free system memory (default: 100000, in the order of kb) + --memtotal <int> limit total amount of memory used by ANNOVAR (default: 0, unlimited, in the order of kb) + --chromosome <string> examine these specific chromosomes in database file + + + Function: annotate a list of genetic variants against genome annotation + databases saved at local disk. + + Example: #download gene annotation database (for hg18 build) and save to humandb/ directory + annotate_variation.pl -downdb gene humandb/ + annotate_variation.pl -buildver mm9 -downdb mce30way mousedb/ + annotate_variation.pl -downdb snp130 humandb/ + + #gene-based annotation of variants in the varlist file (by default --geneanno is ON) + annotate_variation.pl ex1.human humandb/ + + #region-based annotate variants + annotate_variation.pl -regionanno -dbtype mce44way ex1.human humandb/ + annotate_variation.pl -regionanno -dbtype gff3 -gff3dbfile tfbs.gff3 ex1.human humandb/ + + #filter rare or unreported variants (in 1000G/dbSNP) or predicted deleterious variants + annotate_variation.pl -filter -dbtype 1000g_ceu -maf 0.01 ex1.human humandb/ + annotate_variation.pl -filter -dbtype snp130 ex1.human humandb/ + annotate_variation.pl -filter -dbtype avsift ex1.human humandb/ + + Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $ + +=head1 OPTIONS + +=over 8 + +=item B<--help> + +print a brief usage message and detailed explanation of options. + +=item B<--man> + +print the complete manual of the program. + +=item B<--verbose> + +use verbose output. + +=item B<--downdb> + +download annotation databases from UCSC Genome Browser, Ensembl, 1000 Genomes +Project or other resources. The annotation files in this database are required +for the functional annotation of variants. + +=item B<--geneanno> + +perform gene-based annotation. 
For each variant, examine whether it hit exon, +intron, intergenic region, or close to a transcript, or hit a non-coding RNA +gene, or is located in an untranslated region. In addition, for an exonic variant, +determine whether it causes splicing change, non-synonymous amino acid change, +synonymous amino acid change or frameshift changes. + +=item B<--regionanno> + +perform region-based annotation. For each variant, examine whether it overlaps +with a specific genomic region, such as the most conserved elements, the +predicted transcription factor binding sites, the specific cytogenetic bands, the +evolutionarily conserved RNA secondary structures and so on. + +=item B<--filter> + +perform filter-based annotation. For each variant, filter it against a +variation database, such as the 1000 Genomes Project database and the dbSNP +database, and identify a subset that have not been reported in these databases +as novel variants. + +=item B<--outfile> + +specify the output file prefix. Several output files will be generated using +this prefix and different suffixes. A directory name can also be specified as +part of the argument, so that the output files can be written to a different +directory than the current directory. + +=item B<--zerostart> + +utilize the half-open zero-start coordinate system that is used by many UCSC +Genome Browser annotation tables. By default, the 1-based coordinate system will +be used. + +=item B<--dbtype> + +specify the database type to be used in gene-based, region-based or filter-based +annotations. For gene-based annotation, by default refGene annotations from the +UCSC Genome Browser will be used for annotating variants. However, users can +switch to utilize Ensembl annotations instead, or use the UCSC Gene annotations. +In general, RefSeq gene annotations are more conservative, and UCSC Gene +annotations are most liberal with many predicted genes and non-coding RNAs. 
For +region-based annotations, users can select any UCSC annotation database (by +providing the database name), or alternatively select a Generic Feature Format +version 3 (GFF3) formatted file for annotation (by providing 'gff3' as the +--dbtype and providing the --gff3dbfile argument). For filter-based annotations, +users can select a dbSNP file, a 1000G file, a generic format file (with simple +columns including chr, start, end, reference, observed, score), a VCF format +(which is the current popular format for variants exchange), or an avsift format +(which is identical to the generic format but is provided for convenience). + +=item B<--buildver> + +genome build version to use. By default, the hg18 build for human genome is +used. The build version will be used by ANNOVAR to identify corresponding database files +automatically, for example, when gene-based annotation is used for hg18 build, +ANNOVAR will search for the hg18_refGene.txt file, but if the hg19 is used as +--buildver, ANNOVAR will examine hg19_refGene.txt instead. + +=item B<--gff3dbfile> + +specify the GFF3-formatted database file used in the region-based annotation. + +=item B<--genericdbfile> + +specify the generic format database file used in the filter-based annotation. + +=item B<--vcfdbfile> + +specify the database file in VCF format in the filter-based annotation. VCF has +been a popular format for summarizing SNP and indel calls in a population of +samples, and has been adopted by 1000 Genomes Project in their most recent data +release. + +=item B<--time> + +print out the local time during execution of the program + +=item B<--separate> + +for gene-based annotation, separate the effects of each variant, so that each +effect (intronic, exonic, splicing) is printed in one output line. By default, +all effects are printed in the same line, in the comma-separated form of +'UTR3,UTR5' or 'exonic,splicing'. 
+ +=item B<--colsWanted> + +specify which columns are desired in the output for -regionanno. By default, +ANNOVAR intelligently selects the columns based on the DB type. However, users +can use a list of comma-delimited numbers, or use 'all', or use 'none', to +request custom output columns. + +=item B<--comment> + +specify that the program should include comment lines in the output files. +Comment lines are defined as any line starting with #. By default, these lines +are not recognized as valid ANNOVAR input and are therefore written to the +INVALID_INPUT file. This argument can be very useful to keep column headers in +the output file, if the input file uses a comment line to flag the column headers +(usually the first line in the input file). + +=item B<--scorecolumn> + +specify the column with desired output scores in UCSC database file (for +region-based annotation). The default usually works okay. + +=item B<--exonsort> + +sort the exon number in output line in the exonic_variant_function file during +gene-based annotation. If a mutation affects multiple transcripts, the ones with +the smaller exon number will be printed before the transcript with larger exon +number in the output. + +=item B<--batchsize> + +this argument specifies the batch size for processing variants by gene-based +annotation. Normally 5 million variants (usually one human genome will have +about 3-5 million variants depending on ethnicity) are annotated as a batch, to +reduce the amount of memory. The users can adjust the parameters: larger values +make the program slightly faster, at the expense of slightly larger memory +requirements. In a 64bit computer, the default settings usually take 1GB memory +for gene-based annotation for human genome for a typical query file, but this +depends on the complexity of the query (note that the query has a few required +fields, but may have many optional fields and those fields need to be read and +kept in memory). 
+ +=item B<--genomebinsize> + +the bin size of genome to speed up search. By default 100kb is used for gene- +based annotation, so that variant annotation focused on specific bins only +(based on the start-end site of a given variant), rather than searching the +entire chromosomes for each variant. By default 10kb is used for region-based +annotation. The filter-based annotations look for variants directly so no bin is +used. + +=item B<--expandbin> + +expand bin to both sides to find neighboring genes/regions. For gene-based +annotation, ANNOVAR tries to find nearby genes for any intergenic variant, with +a maximum number of nearby bins to search. By default, ANNOVAR will +automatically set this argument to search 2 megabases to the left and right of +the variant in genome. + +=item B<--neargene> + +the distance threshold to define whether a variant is in the upstream or +downstream region of a gene. By default 1 kilobase from the start or end site of +a transcript is defined as upstream or downstream, respectively. This is useful, +for example, when one wants to identify variants that are located in the +promoter regions of genes across the genome. + +=item B<--score_threshold> + +the minimum score to consider when examining region-based annotations on UCSC +Genome Browser tables. Some tables do not have such scores and this argument +will not be effective. + +=item B<--normscore_threshold> + +the minimum normalized score to consider when examining region-based annotations +on UCSC Genome Browser tables. The normalized score is calculated by UCSC, +ranging from 0 to 1000, to make visualization easier. Some tables do not have +such scores and this argument will not be effective. + +=item B<--rawscore> + +for region-based annotation, print out raw scores from UCSC Genome Browser +tables, rather than normalized scores. By default, normalized scores are printed +in the output files. 
Normalized scores are compiled by UCSC Genome Browser for +each track, and they usually range from 0 to 1000, but there are some +exceptions. + +=item B<--minqueryfrac> + +The minimum fraction of overlap between a query and a database record to decide +on their match. By default, any overlap is regarded as a match, but this may not +work best when query consist of large copy number variants. + +=item B<--splicing_threshold> + +distance between splicing variants and exon/intron boundary, to claim that a +variant is a splicing variant. By default, 2bp is used. ANNOVAR is relatively +more stringent than some other software to claim variant as regulating splicing. +In addition, if a variant is an exonic variant, it will not be reported as +splicing variant even if it is within 2bp to an exon/intron boundary. + +=item B<--maf_threshold> + +the minor allele frequency (MAF) threshold to be used in the filter-based +annotation for the 1000 Genomes Project databases. By default, any variant +annotated in the 1000G will be used in filtering. + +=item B<--memfree> + +the minimum amount of free system memory that ANNOVAR should ensure to have. By +default, if ANNOVAR takes too much memory such that only 100Mb system memory is +available, ANNOVAR will stop reading annotation database into memory, and will +start annotation procedure, and then clear the memory, and then read the next +block of annotation database again. This argument ensures that ANNOVAR will not +attempt to use virtual memory in the system (which makes ANNOVAR extremely +slow). + +=item B<--memtotal> + +the total amount of memory that ANNOVAR should use at most. By default, this +value is zero, meaning that there is no limit on that. Decreasing this threshold +reduce the memory requirement by ANNOVAR, but may increase the execution time. + +=item B<--chromosome> + +examine these specific chromosomes in database file. The argument takes comma- +delimited values, and the dash can be correctly recognized. 
For example, 5-10,X +represents chromosome 5 through chromosome 10 plus chromosome X. + +=back + +=head1 DESCRIPTION + +ANNOVAR is a software tool that can be used to functionally annotate a list of genetic variants, +possibly generated from next-generation sequencing experiments. For example, +given a whole-genome resequencing data set for a human with specific diseases, +typically around 3 million SNPs and around half a million insertions/deletions +will be identified. Given this massive amount of data (and candidate disease- +causing variants), it is necessary to have a fast algorithm that scans the data +and identifies a prioritized subset of variants that are most likely functional +for follow-up Sanger sequencing studies and functional assays. + +Currently, these various types of functional annotations produced by ANNOVAR can +be (1) gene-based annotations (the default behavior), such as exonic variants, +intronic variants, intergenic variants, downstream variants, UTR variants, +splicing site variants, etc. For exonic variants, ANNOVAR will try to predict +whether each of the variants is non-synonymous SNV, synonymous SNV, +frameshifting change, nonframeshifting change. (2) region-based annotation, to +identify whether a given variant overlaps with a specific type of genomic +region, for example, predicted transcription factor binding site or predicted +microRNAs. (3) filter-based annotation, to filter a list of variants so that only +those not observed in variation databases (such as 1000 Genomes Project and +dbSNP) are printed out. + +Currently, I am expanding the functionality of ANNOVAR on (1) Fusion gene +detection from large deletions, where a deletion joins the reading frame of two +genes (same orientation of transcription) together to create a new gene. (2) +Assignment of functional importance score to each observed mutation in the +genome. 
This will be extremely important for the development of association +tests for rare variants, and for prioritization of variants in downstream +functional studies after a successful genome-wide association studies (GWAS). + +=over 8 + +=item * B<variant file format> + +A sample variant file contains one variant per line, with the fields being chr, +start, end, reference allele, observed allele, other information. The other +information can be anything (for example, it may contain sample identifiers for +the corresponding variant.) An example is shown below: + + 16 49303427 49303427 C T rs2066844 R702W (NOD2) + 16 49314041 49314041 G C rs2066845 G908R (NOD2) + 16 49321279 49321279 - C rs2066847 c.3016_3017insC (NOD2) + 16 49290897 49290897 C T rs9999999 intronic (NOD2) + 16 49288500 49288500 A T rs8888888 intergenic (NOD2) + 16 49288552 49288552 T - rs7777777 UTR5 (NOD2) + 18 56190256 56190256 C T rs2229616 V103I (MC4R) + +=item * B<database file format: UCSC Genome Browser annotation database> + +Most but not all of the gene annotation databases are directly downloaded from +UCSC Genome Browser, so the file format is identical to what was used by the +genome browser. The users can check Table Browser (for example, human hg18 table +browser is at http://www.genome.ucsc.edu/cgi-bin/hgTables?org=Human&db=hg18) to +see what fields are available in the annotation file. Note that even for the +same species (such as humans), the file format might be different between +different genome builds (such as between hg16, hg17 and hg18). ANNOVAR will try +to be smart about guessing file format, based on the combination of the -- +buildver argument and the number of columns in the input file. In general, the +database file format should not be something that users need to worry about. 
+ +=item * B<database file format: GFF3 format for gene-based annotations> + +As of June 2010, ANNOVAR cannot perform gene-based annotations using GFF3 input +files, and any annotation on GFF3 is region-based. However, this is expected to +be changed in the future. + +=item * B<database file format: GFF3 format for region-based annotations> + +Currently, region-based annotations can support the Generic Feature Format +version 3 (GFF3) formatted files. The GFF3 has become the de facto gold +standard for many model organism databases, such that many users may want to +take a custom annotation database and run ANNOVAR on it, and it would be the +most convenient if the custom file is made with GFF3 format. + +=item * B<database file format: generic format for filter-based annotations> + +The 'generic' format is designed for filter-based annotation that looks for +exact variants. The format is almost identical to the ANNOVAR input format, with +chr, start, end, reference allele, observed allele and scores (higher scores are +regarded as better). + +=item * B<database file format: VCF format for filter-based annotations> + +The 1000 Genomes Project now provides their variant annotations in VCF format, so +I implemented the functionality to directly interrogate VCF files. A VCF file +may contain summary information for variants (for example, this variant has MAF +of 5% in this population), or it may contain the actual variant calls for each +individual in a specific population. As of March 2010, the files from 1000G website +only contain the first type of information (that is, alleles and their +frequencies in population). For the purpose of simplicity, ANNOVAR only +interrogates the first type of information. + +=item * B<database file format: avsift for filter-based annotations> + +avsift refers to a file that ANNOVAR developers compiled for fast annotation of +SIFT scores for non-synonymous variants in the human genome. 
It conforms to the +generic format described above. However, users can directly specify '--dbtype +avsift' in command line to perform avsift annotations, making it more convenient +for users. Alternatively, users can use '--dbtype generic -genericdbfile +hg18_avsift.txt' for the annotation, and the effects are usually the same. + +=item * B<sequence file format> + +ANNOVAR can directly examine FASTA-formatted sequence files. For mRNA sequences, +the name of the sequences are the mRNA identifier. For genomic sequences, the +name of the sequences in the files are usually chr1, chr2, chr3, etc, so that +ANNOVAR knows which sequence corresponds to which chromosome. Unfortunately, +UCSC uses things like chr6_random to annotate un-assembled sequences, as opposed +to using the actual contig identifiers. This causes some issues (depending on +how reads alignment algorithms works), but in general should not be something +that user need to worry about. If the users absolutely care about the exact +contigs rather than chr*_random, then they will need to re-align the short reads +at chr*_random to a different FASTA file that contains the contigs, and then +execute ANNOVAR on the newly identified variants. + +=item * B<invalid input> + +If the query file contains input lines with invalid format, ANNOVAR will skip +such line and continue with the annotation on next lines. These invalid input +lines will be written to a file with suffix invalid_input. Users should manually +examine this file and identify sources of error. + +=back + +ANNOVAR is freely available to the academic community for non-commercial use. For +questions or comments, please contact kai@openbioinformatics.org. + +=cut \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gfapts/inc/annovar/convert2annovar.pl Fri Jun 29 10:20:55 2012 -0400 @@ -0,0 +1,1778 @@ +#!/usr/bin/perl +use warnings; +use strict; +use Getopt::Long; +use Pod::Usage; + +our $VERSION = '$Revision: 466 $'; +our $LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $'; + +our ($verbose, $help, $man); +our ($variantfile); +our ($outfile, $format, $includeinfo, $snpqual, $snppvalue, $coverage, $maxcoverage, $chr, $chrmt, $altcov, $fraction, $species, $filterword, $confraction, $allallele); + +our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-'); + + +#--species is declared with '=s' because its value is later compared against the string 'human' +GetOptions('verbose'=>\$verbose, 'help|h'=>\$help, 'man'=>\$man, 'outfile=s'=>\$outfile, 'format=s'=>\$format, 'includeinfo'=>\$includeinfo, + 'snpqual=f'=>\$snpqual, 'snppvalue=f'=>\$snppvalue, 'coverage=i'=>\$coverage, 'maxcoverage=i'=>\$maxcoverage, 'chr=s'=>\$chr, 'chrmt=s'=>\$chrmt, + 'fraction=f'=>\$fraction, 'altcov=i'=>\$altcov, + 'species=s'=>\$species, 'filter=s'=>\$filterword, 'confraction=f'=>\$confraction, 'allallele!'=>\$allallele) or pod2usage (); + +$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT); +$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT); +@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT); +@ARGV == 1 or pod2usage ("Syntax error"); + +($variantfile) = @ARGV; + +$chrmt ||= 'M'; + +if (not $format) { + $format = 'pileup'; + print STDERR "NOTICE: the default --format argument is set as 'pileup'\n"; +} + +if (defined $outfile) { + #three-argument open so that special characters in the file name are never interpreted as an open mode + open (STDOUT, '>', $outfile) or die "Error: cannot write to output file $outfile: $!\n"; +} + +defined $snpqual and $format eq 'pileup' || $format eq 'vcf4' || pod2usage ("Error in argument: the --snpqual is supported only for the 'pileup' or 'vcf4' format"); +defined $snppvalue and $format eq 'gff3-solid' || pod2usage ("Error in 
argument: the --snppvalue is supported only for the 'gff3-solid' format"); +if (not defined $snpqual and $format eq 'pileup') { + $snpqual = 20; + print STDERR "NOTICE: the default --snpqual argument for pileup format is set as 20\n"; +} + +if (not defined $snppvalue) { + $snppvalue = 1; #by default, do not use any of the P-value cutoff in filtering out GFF3-SOLID files (this is different from handling pileup files) +} + +if (not defined $coverage) { + $coverage = 0; +} + +if (defined $fraction) { + $format eq 'pileup' or $format eq 'vcf4' or pod2usage ("Error in argument: the '--fraction' argument is supported for the pileup or vcf4 format only"); + $format eq 'vcf4' and print STDERR "NOTICE: the --fraction argument works ONLY on indels for vcf4 format\n"; + $fraction >= 0 and $fraction <=1 or pod2usage ("Error in argument: the --fraction argument must be between 0 and 1 inclusive"); +} else { + $fraction = 0; +} + +if (defined $confraction) { + $format eq 'vcf4' and print STDERR "NOTICE: the --confraction argument works ONLY on indels for vcf4 format\n"; + #both bounds are checked on --confraction itself + $confraction >= 0 and $confraction <=1 or pod2usage ("Error in argument: the --confraction argument must be between 0 and 1 inclusive"); +} else { + $confraction = 0; +} + +if (defined $altcov) { + $format eq 'pileup' or pod2usage ("Error in argument: the '--altcov' argument is supported for the '--format pileup' only"); + $altcov < $coverage or pod2usage ("Error in argument: the --altcov argument must be less than --coverage"); + $altcov > 0 or pod2usage ("Error in argument: the --altcov argument must be a positive integer"); +} + +if (defined $species) { + $format eq 'gff3-solid' or pod2usage ("Error in argument: the '--species' argument is only necessary for the '--format gff3-solid'"); +} + +if ($allallele) { + $format eq 'vcf4' or pod2usage ("Error in argument: the '--allallele' argument is only supported for the '--format vcf4'"); +} + +if ($format eq 'pileup') { + convertPileup ($variantfile); +} elsif 
($format eq 'cg') { + convertCG ($variantfile); +} elsif ($format eq 'gff3-solid') { + convertGFF3SolidSNP ($variantfile); +} elsif ($format eq 'soap') { + print STDERR "WARNING: the support for '--format soap' is not well developed yet and may contain bugs for indel analysis.\n"; + convertSOAP ($variantfile); +} elsif ($format eq 'maq') { + print STDERR "WARNING: the support for '--format maq' is not well developed yet and may contain bugs.\n"; + convertMAQSNP ($variantfile); +} elsif ($format eq 'casava') { + if (not defined $chr) { + pod2usage ("Error in argument: please supply --chr argument for the '--format casava'"); + } + convertCASAVA ($variantfile, $chr); +} elsif ($format eq 'vcf4') { + convertVCF4 ($variantfile); +} elsif ($format eq 'annovar') { + convertANNOVAR ($variantfile); +} else { + pod2usage ("Error in argument: the --format $format is not currently supported. Please contact ANNOVAR developer for adding the support"); +} + + +sub convertPileup { + my ($variantfile) = @_; + my ($countline, $countvar, $counthom, $counthet, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0/; + + if ($variantfile eq 'stdin') { + *VAR = *STDIN; + } else { + open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; + } + print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, SNP quality, total reads, reads with mutation\n"; + + while (<VAR>) { + s/[\r\n]+$//; + $countline++; + my $hom = 'hom'; + my @field = split (/\t/, $_); + @field >= 10 or die "Error: invalid record found in pileupfile $variantfile (at least 10 fields expected): <$_>\n"; + my ($chr, $pos, $wt, $call, @other) = @field; + my ($cons_qual, $snp_quality, $readcount, $readallele) = @field[4,5,7,8]; + $chr =~ s/^chr//; + $wt = uc $wt; #wt may or may not in upper case, it depends on the input FASTA file + $call = uc $call; #usually call is in upper case + $readallele = uc $readallele; #lower case indicate the opposite strand + + $includeinfo 
or @other = (); #unless -includeinfo is set, the other will not be printed + + $snp_quality >= $snpqual or next; #quality of the variant call + $readcount >= $coverage or next; #read count of the variant call + $maxcoverage and $readcount <= $maxcoverage || next; #maximum coverage of the variant call + + if ($wt eq '*') { #indel + #example: + #1 970271 * +C/+C 39 106 44 5 +C * 1 4 0 0 0 + #1 1548977 * */+CCG 29 29 42 3 * +CCG 2 1 0 0 0 + #1 1674810 * */+C 24 24 42 6 * +C 5 1 0 0 0 + #1 968466 * -CT/-CT 53 339 55 5 -CT * 5 0 0 0 0 + #1 1093600 * -GAAA/* 29 29 53 3 -GAAA * 1 2 0 0 0 + #1 1110101 * */-A 41 41 17 6 * -A 5 1 0 0 0 + #1 1215395 * */-TC 26 26 32 4 * -TC 3 1 0 0 0 + my @obs = split (/\//, $call); #example: '+AG/+AG' as homozygotes, '*/-TA' or '*/+T' as heterozygotes + @obs == 2 or die "Error: pileup record contains more than two alternative alleles: <$_>\n"; + my ($end, $ref, $alt); + my ($indelreadcount); #number of reads supporting the indel + + + if ($obs[0] eq $obs[1]) { + #something weird may occur in SamTools output: 22 15231121 * */* 360 0 32 158 * +GA 156 2 0 0 0 + $obs[0] eq '*' and next; + + #for deletions, SAMTOOLS represent deletion using a location before the first deleted base in the reference sequence coordinate system + #for example, a deletion in Samtools output is "1 109266688 * */-CTT 1429 1429 58 43 * -CTT 24 19 0 0 0" + #the correct ANNOVAR input (for rs35029887) should be "1 109266689 109266691 CTT - het 1429" + #insertions are fine without change; for example, check rs5745466 in Genome Browser; samtools report "1 76119508 * +AT/+AT" + #for this insertion, ANNOVAR input file (for rs5745466) becomes "1 76119508 76119508 - AT hom 1601" + + if ($obs[0] =~ m/^\-/) { + $pos++; #add 1 to position in deletion + } + + $indelreadcount = calculateIndelReadCount ($obs[0], \@field); + $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold + defined $altcov and $indelreadcount >= $altcov || 
next; + + ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]); + print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n"; + $counthom++; + } else { + $hom = 'het'; + if ($obs[0] =~ m/^[\-\+]/) { + $obs[0] =~ m/^\-/ and $pos++; + ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]); + $indelreadcount = calculateIndelReadCount ($obs[0], \@field); + $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold + defined $altcov and $indelreadcount >= $altcov || next; + + print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n"; + $counthet++; + } + if ($obs[1] =~ m/^[\-\+]/) { + $obs[1] =~ m/^\-/ and $pos++; + ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[1]); + $indelreadcount = calculateIndelReadCount ($obs[1], \@field); + $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold + defined $altcov and $indelreadcount >= $altcov || next; + + print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n"; + $counthet++; + } + } + $countindel++; + } else { + #1 798494 G A 36 36 58 3 AAA bbb + #1 798677 T K 33 33 52 26 ,$.,,G.GG,.,......,..G,,... 
b`bbBaJIbFbZWaTNQbb_VZcbbb + #1 856182 G A 42 42 50 5 AAAAA B\bbb + #1 861034 A M 48 48 49 14 ,$,.,..,cc.c.,c bbBbb`]BFbHbBB + #1 864289 T K 22 22 56 6 .g,,g, BH^_BB + + $wt eq $call and next; #this is not a SNP + my $obs = $iupac{$call} or die "Error: invalid best call ($call) in <$_>\n"; + my @obs = split (//, $obs); + @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; + if ($obs[0] ne $obs[1]) { + $hom = 'het'; + } + + + if ($obs[0] eq $wt) { #obs[0] is guaranteed to be an alternative allele + @obs = @obs[1,0]; + } + if ($wt eq 'A' and $obs[0] eq 'G' or $wt eq 'G' and $obs[0] eq 'A' or $wt eq 'C' and $obs[0] eq 'T' or $wt eq 'T' and $obs[0] eq 'C') { + unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) { + $countti++; + } + + } else { + unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) { + $counttv++; + } + } + + my $mutallelecount; + + if ($obs[1] eq $wt) { #het SNP + if ($chr eq $chrmt) { + $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount); + } + $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele); + $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold + defined $altcov and $mutallelecount >= $altcov || next; + + print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n"; + $counthet++; + } elsif ($obs[1] ne $obs[0]) { #het SNP but both differ from reference allele + if ($chr eq $chrmt) { + $hom = calculateAllelicFraction ($obs[1], $field[8], $readcount); + } + $mutallelecount = calculateMutAlleleCount ($obs[1], $readallele); + $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold + defined $altcov and $mutallelecount >= $altcov || next; + + print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[1], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n"; + if ($chr eq 
$chrmt) { + $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount); + } + $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele); + $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold + defined $altcov and $mutallelecount >= $altcov || next; + + print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n"; + $counthet++; + $counthet++; + } else { #homo SNP + if ($chr eq $chrmt) { + $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount); + } + $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele); + $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold + defined $altcov and $mutallelecount >= $altcov || next; + + print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n"; + $counthom++; + } + $countsnp++; + } + $countvar++; + } + my $triallelic = $countsnp-$countti-$counttv; + print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n"; + print STDERR "NOTICE: Among ${\($counthet+$counthom)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n"; + print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n"; +} + +sub calculateIndelReadCount { + my ($obs, $field) = @_; + #make sure to use upper case in the comparison, for example: + #chr10 130133 * */-ca 189 189 59 31 * -ca 27 4 0 0 0 + if ($obs eq uc $field->[8]) { + return $field->[10]; + } elsif ($obs eq uc $field->[9]) { + return $field->[11]; + } else { + die "Error: invalid record in pileup file (indel counts cannot be inferred): <$obs> vs <@$field>\n"; + } +} + +sub calculateMutAlleleCount 
{ + my ($allele, $string) = @_; #they should be already both in upper case + $string =~ s/\^.//g; #^ is followed by mapping quality + $string =~ s/\$//g; + $string =~ s/[+-]1[^\d]//g; #1 followed by a non-number + $string =~ s/[+-]2..//g; + $string =~ s/[+-]3...//g; + $string =~ s/[+-]4....//g; + $string =~ s/[+-]5.....//g; + $string =~ s/[+-]6......//g; + $string =~ s/[+-]7.......//g; + $string =~ s/[+-]8........//g; + $string =~ s/[+-]9.........//g; + $string =~ s/[+-]10..........//g; + + #make sure to use upper case letter + my @string = split (//, uc $string); + my $count = 0; + for my $i (0 .. @string-1) { + $allele eq $string[$i] and $count++; + } + return $count; +} + +sub calculateAllelicFraction { + my ($obs, $readbase, $readcount) = @_; + my @readbase = split (//, $readbase); + my $count=0; + for my $i (0 .. @readbase-1) { + uc $obs eq uc $readbase[$i] and $count++; + } + my $hom = $count/$readcount; + length ($hom) > 5 and $hom > 0.001 and $hom = sprintf ("%.3f", $hom); + return $hom; +} + +sub recalculateEndRefObs { #recalculate end position, reference allele and observed allele + my ($end, $ref, $obs) = @_; + if ($obs =~ m/^\-(\w+)/) { #deletion + $end += (length ($1)-1); + $ref = $1; + $obs = '-'; + } elsif ($obs =~ m/^\+(\w+)/) { #insertion + $ref = '-'; + $obs = $1; + } else { + die "Error: cannot handle $end, $ref, $obs\n"; + } + return ($end, $ref, $obs); +} + +sub convertCG { + my ($variantfile) = @_; + + my ($foundheader, $countline, @field); + my ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/; + open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; + print STDERR "NOTICE: Converting variants from $variantfile\n"; + while (<VAR>) { + s/[\r\n]+$//; + $countline++; + if (m/^>locus/) { + $foundheader++; + } + if (not $foundheader) { + $countline > 50 and die "Error: invalid CG-var file format for $variantfile (>locus record is not found within the first 50 
lines)\n"; + next; + } + my ($locus, $ploidy, $haplo, $chr, $start, $end, $vartype, $ref, $obs, $score, $haplolink, $xref) = split (/\t/, $_); + $chr =~ s/^chr//; + $vartype eq 'ins' or $start++; #CG use zero-start, half-open coordinate. Insertion does not need to be processed (example, "16476 2 2 chr1 751820 751820 ins T 49 dbsnp:rs59038458") + $obs eq '' and $obs = '-'; + $ref eq '' and $ref = '-'; + + #the alternatives are grouped so that ^ and $ anchor every one of them (ungrouped, only "snp" is start-anchored and only "sub" is end-anchored, so "ins"/"del"/"delins" would match anywhere in the string) + if ($vartype =~ m/^(?:snp|ins|del|delins|sub)$/) { #in new versions of the files, "sub" is used instead of "delins". + #$chr eq 'M' and next; #ignore chrM markers as they are not diploid + if ($chr eq $prechr and $start eq $prestart and $end eq $preend and $obs eq $preobs) { #homozygous mutation + print $chr, "\t", $start, "\t", $end, "\t", $ref, "\t", $obs, "\t", $vartype, "\t", ($score+$prescore)/2, "\t", "hom\t", $xref, "\n"; + ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/; + } else { + #the buffered record was not repeated by the current line, so it is written out as het before the current line takes its place in the buffer + if ($prestart and $preend) { + print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n"; + } + ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = ($chr, $start, $end, $vartype, $ref, $obs, $score, $xref); + } + } + } + #flush the last buffered record (it had no matching partner line, so it is reported as het) + if ($prestart and $preend) { + print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n"; + } + print STDERR "NOTICE: Done with $countline lines\n"; +} + +#convertGFF3SolidSNP: read a SOLiD GFF3 variant file and print one tab-delimited line per reported variant allele to STDOUT; notices and warnings go to STDERR +sub convertGFF3SolidSNP { + my ($variantfile) = @_; + my ($countline, $countvar, $countallvar, @other) = (0, 0, 0); + my ($unknown_count); #count of variants with 'unknown' variation type + + open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; + $_ = <VAR>; + s/[\r\n]+$//; + m/^##gff-version\s+3/ or die "Error: invalid first line in GFF3 file ('##gff-version 3' expected): <$_>\n"; + $_ = <VAR>; + s/[\r\n]+$//; + 
m/^##solid-gff-version/ or print STDERR "WARNING: problematic second line in GFF3-SOLiD file ('##solid-gff-version' expected): <$_>\n"; + + print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, variant score (P-value), total clipped normal coverage reads, total reads with mutated allele\n"; + + while (<VAR>) { + s/[\r\n]+$//; + $countline++; + m/^##/ and next; #header of comment lines + m/^#/ and next; #header of results lines + + my @field = split (/\t/, $_); + #a record must have exactly 9 tab-delimited columns; the attribute string is read from the last one + @field == 9 or die "Error: invalid record found in $variantfile (9 fields expected): <$_>\n"; + my ($chr, $program, $type, $pos, $end, $score, $attribute) = @field[0,1,2,3,4,5,8]; #score is the P-value for the SNP calls + $chr eq 'chr_name' and next; #header line + + if ($score ne '.') { + $score >=0 and $score <=1 or die "Error: invalid score record found in file (0-1 range expected): <$_>\n"; + $score <= $snppvalue or next; + } + + if ($species and $species eq 'human') { + $chr eq '23' and $chr = 'X'; + $chr eq '24' and $chr = 'Y'; + $chr eq '25' and $chr = 'M'; + } + + $includeinfo and @other = ($attribute); #unless -includeinfo is set, the other will not be printed + + my ($readcount, $mutallelecount) = ('.', '.'); #total coverage, coverage for mutated alleles + + if ($type eq 'unknown') { + #SOLiD GFF3 may have unknown variation types + #chr1 AB_SOLiD Small Indel Tool unknown 3833062 3833153 1 . . 
ID=5483;len=no_call;allele-call-pos=3833062;allele-call=/CCAC;allele-pos=3833057;alleles=atccatccacccatc/aTCCATCCACCCACCCATC/NO_CALL;allele-counts=REF,2,2;tight_chrom_pos=none;loose_chrom_pos=3833058-3833069;no_nonred_reads=3;coverage_ratio=8.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_50_10_r,L1_1_50_15_r,L1_1_50_15_r,L1_1_50_12_r;bead_ids=1018_196_970,699_1263_465,220_513_923,2022_1532_1071;overall_qvs=4,6,2,50;no_mismatches=5,4,2,0;read_pos=27,29,31,13;from_end_pos=23,21,19,37;strands=+,+,+,+;tags=R3,F3,F3,F3;indel_sizes=-92,-112,4,4;non_indel_no_mismatches=0,0,8,0;unmatched-lengths=50,50,50,50;ave-unmatched=50.0000;anchor-match-lengths=48,49,49,49;ave-anchor-length=48.7500;read_seqs=G23223321322112233223100132013201320110011001322332,T33223321322112233223100132013201320110013021322332,T33223321322112233223100132013201320110011001322332,T31001320132013201100110013223322113030332233113032;base_qvs=;non_indel_seqs=T21322332211221121322332230321212121223322332233221,G12020202202020012001200213022002130012332310122030,G12020202202020012001000210022012110312331331122030,G22111012101031010100002002321020002202121121313021;non_indel_qvs= + $unknown_count++; + next; #do not count this one! + } + + if ($program eq 'SOLiD_diBayes' or $program eq 'AB_SOLiD SNP caller') { #SNP variants + #detailed description can be found at http://solidsoftwaretools.com/gf/download/docmanfileversion/208/866/DiBayes_Documentation_v1.2.pdf + #chr1 SOLiD_diBayes SNP 559817 559817 0.094413 . . genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag= + #chr1 SOLiD_diBayes SNP 714068 714068 0.000000 . . 
genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag= + #chr1 SOLiD_diBayes SNP 714835 714835 0.041579 . . genotype=R;reference=A;coverage=5;refAlleleCounts=3;refAlleleStarts=3;refAlleleMeanQV=18;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=20;diColor1=02;diColor2=20;het=1;flag= + + $pos == $end or die "Error: invalid record found in GFF3-SOLiD file: start and end discordant: <$_>\n"; + + my ($wt, $call); + + if ($attribute =~ m/ref_base=(\w)/) { + $wt = $1; + } elsif ($attribute =~ m/reference=(\w)/) { + $wt = $1; + } else { + die "Error: invalid record found in GFF3-SOLiD file (ref_base/reference was not found): <$_>\n"; + } + + if ($attribute =~ m/consen_base=(\w)/) { + $call = $1; + } elsif ($attribute =~ m/genotype=(\w)/) { + $call = $1; + } else { + die "Error: invalid record found in GFF3-SOLiD file (consen_base was not found): <$_>\n"; + } + + if ($attribute =~ m/coverage=(\d+)/) { + $readcount = $1; + $readcount >= $coverage or next; #read count of the variant call + $maxcoverage and $readcount <= $maxcoverage || next; + } + if ($attribute =~ m/novelAlleleCounts=(\d+)/) { + $mutallelecount = $1; + $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold + defined $altcov and $mutallelecount >= $altcov || next; + } + + my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n"; + my @obs = split (//, $obs); + @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; + if ($obs[0] eq $wt and $obs[1] eq $wt) { + die "Error: reference alleles are identical to observed alleles: <$_>\n"; + } elsif ($obs[0] eq $wt) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + } elsif ($obs[1] eq $wt) { + print $chr, "\t", 
$pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + } elsif ($obs[1] ne $obs[0]) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + $countallvar++; + } else { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + } + } elsif ($program eq 'AB_CNV_PIPELINE') { #CNV + if ($attribute =~ m/copynum=(\d+)/ or $attribute =~ m/copynumber=(\d+)/) { + if ($1 < 2) { + print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n"; + } elsif ($1 > 2) { + print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n"; + } + } else { + print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n"; + } + } elsif ($program eq 'AB_SOLiD Large Indel Tool') { #CNV + #http://solidsoftwaretools.com/gf/download/docmanfileversion/182/780/Large_Indel_Documentation_v1.0.0.pdf + ## [FIELDS] (1) chromosome (2) version (3) indel type (4) breakpoint start (5) breakpoint end (6) p-value (7) NA (8) NA (9) attributes + #chr10 AB_SOLiD Large Indel Tool insertion 151910 151969 2.77548e-11 . . dev=-71;avgDev=-1.63884;zygosity=HOMOZYGOUS;nRef=0;nDev=14;refDev=0;devDev=-1.60924;refVar=0;devVar=0.0159438;beadIds=1750_720_1641,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279, + #chr10 AB_SOLiD Large Indel Tool insertion 154109 154729 2.1559e-11 . . 
dev=-66;avgDev=-1.51253;zygosity=HOMOZYGOUS;nRef=0;nDev=15;refDev=0;devDev=-1.02864;refVar=0;devVar=0.133236;beadIds=1728_1671_1739,1250_231_25,811_783_1090,1035_908_491,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279, + my ($call, @call, $zygosity); + if ($attribute =~ m#zygosity=HEMIZYGOUS#) { + $zygosity = 'het'; + } elsif ($attribute =~ m#zygosity=HOMOZYGOUS#) { + $zygosity = 'hom'; + } else { + $zygosity = 'unk'; + } + if ($type eq 'insertion') { + #the true boundary is unknown (start is different from end) so we cannot use "-" to represent reference allele. + print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", 0, "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n"; + } elsif ($type eq 'deletion') { + print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n"; + } + } elsif ($program eq 'AB_SOLiD Small Indel Tool') { #small indels + #typical simple insertion and deletions + #chr1 AB_SOLiD Small Indel Tool deletion 1352612 1352614 1 . . ID=1290;del_len=3;allele-call-pos=1352612;allele-call=cct/;allele-pos=1352610;alleles=cccctccat/cCCCAT;allele-counts=REF,2;tight_chrom_pos=1352612-1352614;loose_chrom_pos=1352612-1352614;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_3_r,L1_1_25_8_r;bead_ids=1470_2000_506,822_1710_1767;overall_qvs=18,19;no_mismatches=3,3;read_pos=6,13;from_end_pos=19,12;strands=-,+;tags=R3,R3;indel_sizes=-3,-3;non_indel_no_mismatches=1,-1;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=24,99;ave-anchor-length=61.5000;read_seqs=G0112310001100003120031200,G0300213000011000132110021;base_qvs=;non_indel_seqs=T2120033002022200220000002,;non_indel_qvs= + #chr1 AB_SOLiD Small Indel Tool insertion_site 1311162 1311162 1 . . 
ID=1249;ins_len=1;allele-call-pos=1311162;allele-call=/G;allele-pos=1311161;alleles=gaggggggg/GAGGGGGGGG/NO_CALL;allele-counts=REF,3,1;tight_chrom_pos=none;loose_chrom_pos=1311160-1311169;no_nonred_reads=3;coverage_ratio=4.6667;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_50_10_r,L1_1_25_2_r,L1_1_25_3_r;bead_ids=850_837_429,1160_878_181,404_1050_1881,1084_64_1343;overall_qvs=20,56,25,25;no_mismatches=3,2,2,1;read_pos=11,22,11,11;from_end_pos=14,28,14,14;strands=+,-,-,-;tags=R3,F3,F3,F3;indel_sizes=1,1,1,1;non_indel_no_mismatches=-1,1,0,1;unmatched-lengths=25,50,25,25;ave-unmatched=31.2500;anchor-match-lengths=99,49,24,24;ave-anchor-length=49.0000;read_seqs=G1020001130221020000000020,T03223323210110021000000022122030100020221222222122,T0102210000000221223301000,T0102210000000221220301000;base_qvs=;non_indel_seqs=,G21202030032202013220021321131212021000122300013132,G1331133120001221220120120,G1331133120001221220120220;non_indel_qvs= + + #sometimes, allele-call is ambiguous that requires a "block substitution" representation (although they were annotated as insertion or deletion by SOLiD, they should be treated as block substitution by ANNOVAR) + #sometimes, mutiple allele calls may be present at the same site + #chr1 AB_SOLiD Small Indel Tool deletion 1209357 1209360 1 . . 
ID=1101;del_len=4;allele-call-pos=1209357;allele-call=ggtggg/TT;allele-pos=1209355;alleles=ggggtgggggggtt/gGTTGGGGTT/gGTGTTTTGCCTT/NO_CALL;allele-counts=REF,3,1,1;tight_chrom_pos=none;loose_chrom_pos=1209357-1209363;no_nonred_reads=4;coverage_ratio=3.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=0.9888;run_names=L1_1_25_1_r,L1_1_25_2_r,L1_1_25_4_r,L1_1_25_3_r,L1_1_25_7_r;bead_ids=1017_1024_53,1493_1896_615,1794_647_1473,307_1116_687,773_1492_1671;overall_qvs=24,24,28,24,8;no_mismatches=2,3,2,3,2;read_pos=14,9,14,9,15;from_end_pos=11,16,11,16,10;strands=-,+,-,+,+;tags=F3,R3,F3,F3,F3;indel_sizes=-4,-4,-4,-4,3;non_indel_no_mismatches=1,0,0,0,0;unmatched-lengths=25,25,25,25,25;ave-unmatched=25.0000;anchor-match-lengths=24,24,24,24,24;ave-anchor-length=24.0000;read_seqs=T2221100101000101000221100,G0001122000100000101001020,T2221100101000101000221100,T1112200010100010100112000,T1011220000111000130200001;base_qvs=;non_indel_seqs=G0312033221312111022200300,T0111113210210112100001130,G0312133221312111022200300,G0231003132222112000012020,G3121331033101113122312020;non_indel_qvs= + #chr1 AB_SOLiD Small Indel Tool deletion 1209436 1209436 1 . . ID=1103;del_len=1;allele-call-pos=1209436;allele-call=ag/A/G;allele-pos=1209434;alleles=tgagggggtt/tGAGGGGTT/tGGGGGGTT;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1209436-1209441;no_nonred_reads=2;coverage_ratio=5.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_25_2_r;bead_ids=1315_1584_2005,1706_194_437;overall_qvs=28,21;no_mismatches=0,3;read_pos=9,7;from_end_pos=16,18;strands=-,-;tags=R3,R3;indel_sizes=-1,-1;non_indel_no_mismatches=-1,0;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=99,24;ave-anchor-length=61.5000;read_seqs=G3001010000011001010000001,G3010100022110010111000110;base_qvs=;non_indel_seqs=,T1112003220020013202122300;non_indel_qvs= + #chr1 AB_SOLiD Small Indel Tool insertion_site 1424540 1424540 1 . . 
ID=1376;ins_len=3;allele-call-pos=1424540;allele-call=tt/CCCAC;allele-pos=1424537;alleles=ttttttg/TTTCCCACTG/NO_CALL;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1424536-1424543;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_7_r,L1_1_50_16_r;bead_ids=703_437_370,1089_878_1744;overall_qvs=1,9;no_mismatches=3,4;read_pos=5,35;from_end_pos=20,15;strands=-,-;tags=R3,F3;indel_sizes=3,3;non_indel_no_mismatches=2,0;unmatched-lengths=25,50;ave-unmatched=37.5000;anchor-match-lengths=24,47;ave-anchor-length=35.5000;read_seqs=G2032002200200000000000020,T30100102220312202103112130230322210121100200002100;base_qvs=;non_indel_seqs=T2121120003012303000000000,G22213300221101011121030022002222300220322213303102;non_indel_qvs= + my ($call, @call, $zygosity); + if ($attribute =~ m#experimental-zygosity=HEMIZYGOUS#) { + $zygosity = 'het'; + } elsif ($attribute =~ m#experimental-zygosity=HOMOZYGOUS#) { + $zygosity = 'hom'; + } else { + $zygosity = 'unk'; + } + $score = '.'; #by default, score=1 in the output + + #no_nonred_reads: Number of reads with unique start positions (non-redundant reads). + #coverage_ratio: Clipped normal coverage/number of non-redundant reads.Clipped coverage is where the parts of the read that are within 5 bp at either end are not counted as a part of coverage. 
+ if ($attribute =~ m/no_nonred_reads=(\d+);coverage_ratio=([\d\.]+)/) { + $readcount = int ($1*$2); + $readcount >= $coverage or next; #clipped normal read count of the variant call (this could even be lower than mut allele count) + $maxcoverage and $readcount <= $maxcoverage || next; + } else { + $readcount = '.'; + } + if ($attribute =~ m/allele-counts=REF,(\d+)/) { + $mutallelecount = $1; + } + + if ($attribute =~ m#allele\-call=([\w\/]+)#) { + @call = split (/\//, $1); + + if (@call == 1) { #a simple deletion + print $chr, "\t", $pos, "\t", $end, "\t", $call[0], "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + } elsif ($call[0] eq '') { #a simple insertion (single or multiple allele) + for my $i (1 .. @call-1) { + print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + $i > 1 and $countallvar++; + } + } else { #an indel that may have several alleles, or may require a block substitution representation + for my $i (1 .. 
@call-1) { + print $chr, "\t", $pos, "\t", $pos+length($call[0])-1, "\t", $call[0], "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + $i > 1 and $countallvar++; + } + } + } else { + $call = '0'; + print $chr, "\t", $pos, "\t", $end, "\t", $call, "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n"; + } + } else { + die "Error: unrecognizable genotype calling program encountered (valid types are SOLiD_diBayes, AB_CNV_PIPELINE, AB_SOLiD Large Indel Tool, AB_SOLiD Small Indel Tool): <$_>\n"; + } + + $countvar++; #variation positions + $countallvar++; #all variants (maybe several at one variation position) + } + print STDERR "NOTICE: Finished processing $variantfile with $countline input lines\n"; + print STDERR "NOTICE: Wrote variants in $countvar variation positions ($countallvar variants considering multi-allelic ones)\n"; + $unknown_count and print STDERR "WARNING: $unknown_count variants with 'unknown' variation type were skipped\n"; +} + + + +sub convertSOAP { + my ($variantfile) = @_; + my ($countline, $countvar, @other); + + open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; + + while (<VAR>) { + s/[\r\n]+$//; + $countline++; + + my @field = split (/\t/, $_); + if (@field == 18) { #snp file + my ($chr, $pos, $wt, $call, @other) = @field; + $chr =~ s/^chr//; + + $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed + + my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n"; + my @obs = split (//, $obs); + @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; + if ($obs[0] eq $wt and $obs[1] eq $wt) { + die "Error: reference alleles are identical to observed alleles: <$_>\n"; + } elsif ($obs[0] eq $wt) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; + } elsif 
($obs[1] eq $wt) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; + } elsif ($obs[1] ne $obs[0]) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; + $countvar++; + } else { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n"; + } + } elsif (@field == 6) { #indel file + my ($chr, $pos, $strand, $indellen, $call, $homo) = @field; + $homo eq 'Homo' and $homo = 'hom'; + $homo eq 'Hete' and $homo = 'het'; + $chr =~ s/^chr//; + + $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed + + if ($indellen =~ m/^\+(\d+)$/) { #insertion + length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n"; + print join("\t", $chr, $pos, $pos, '-', $call, $homo), "\n"; + } elsif ($indellen =~ m/^\-(\d+)$/) { #deletion + length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n"; + print join("\t", $chr, $pos, $pos+$1-1, $call, '-', $homo), "\n"; + } else { + die "Error: invalid record found in SOAPindel file: <$_>\n"; + } + } else { + die "Error: invalid record found in $variantfile (18 or 6 fields expected, observed ${\(scalar @field)} fields): <$_>\n"; + } + $countvar++; + } + print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n"; +} + +#convertANNOVAR: copy a tab-delimited ANNOVAR-format file ($variantfile) to STDOUT line by line, and report line, variant, SNP, transition and transversion counts on STDERR +sub convertANNOVAR { + my ($variantfile) = @_; + my ($countline, $countvar, $countsnp) = (0, 0, 0); #start at 0 so the summary is well defined for empty input or input without SNPs + my ($countti, $counttv) = (0, 0); + + open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; + + while (<VAR>) { + $countline++; + + (my $line = $_) =~ s/[\r\n]+$//; #split a copy without the line ending so that a 5-column record yields "G" rather than "G\n" for the observed allele; $_ is left untouched + my @field = split (/\t/, $line); + my ($chr, $start, $end, $ref, $obs) = @field; + if ($ref =~ m/^[ACGT]$/ and $obs =~ m/^[ACGT]$/) { + #transitions are A<->G and C<->T; every other single-base change is counted as a transversion + if ($ref eq 'A' and $obs eq 'G' or $ref eq 'G' and $obs eq 'A' or $ref eq 'C' and $obs eq 
'T' or $ref eq 'T' and $obs eq 'C') { + $countti++; + } else { + $counttv++; + } + $countsnp++; + } + + print; + $countvar++; + } + print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n"; + print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions\n"; +} + +sub convertMAQSNP { + my ($variantfile) = @_; + my ($countline, $countvar, @other); + + open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; + + while (<VAR>) { + s/[\r\n]+$//; + $countline++; + + my @field = split (/\t/, $_); + if (@field == 12) { #SNPs + my ($chr, $pos, $wt, $call, @other) = @field; + $chr =~ s/^chr//; + + $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed + + my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n"; + my @obs = split (//, $obs); + @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; + if ($obs[0] eq $wt and $obs[1] eq $wt) { + die "Error: reference alleles are identical to observed alleles: <$_>\n"; + } elsif ($obs[0] eq $wt) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; + } elsif ($obs[1] eq $wt) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; + } elsif ($obs[1] ne $obs[0]) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; + $countvar++; + } else { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n"; + } + $countvar++; + } elsif (@field == 13) { #indels; the deletion start site do not need changes; the duplication start site need additional investigation by ANNOVAR developers + my ($chr, $pos, $type, $numread, $call, @other) = @field; + $chr =~ 
s/^chr//; + + $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed + + my @obs = split (/:/, $call); + @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n"; + if ($obs[0] =~ m/^\-(\d+)/) { + my $len = $1; + print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[1], "\t", '-', "\t", "het\t", join ("\t", @other), "\n"; + } elsif ($obs[0] =~ m/^(\d+)/) { + my $len = $1; + print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; + } + $countvar++; + } else { + die "Error: invalid record found in $variantfile (12 or 13 fields expected, observed ${\(scalar @field)} fields): <$_>\n"; + } + } + print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n"; +} + +sub convertCASAVA { + my ($variantfile, $chr) = @_; + my ($countline, $countvar, @other); + + my ($intype); + my ($pos_index, $call_index, $reference_index, $type_index, $score_index, $total_index, $used_index); + my ($ref_indel_index, $quality_index, $maxgtype_index, $bp1_reads_index, $ref_reads_index, $indel_reads_index, $other_reads_index); + open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n"; + + while (<VAR>) { + s/[\r\n]+$//; + $countline++; + my @field; + + if (m/^#/) { + s/^#//; + if (s/^\$\sCOLUMNS\s//) { + @field = split (/\s+/, $_); + } else { + @field = split (/\t/, $_); + } + if (m/\bposition\b/ or m/\bpos\b/) { + for my $i (0 .. 
@field-1) { + if ($field[$i] eq 'position' or $field[$i] eq 'pos') { + $pos_index = $i; + } elsif ($field[$i] eq 'modified_call') { + $intype = 'snp'; + print STDERR "NOTICE: Automatically detected input type as $intype\n"; + $call_index = $i; + } elsif ($field[$i] eq 'reference') { + $reference_index = $i; + } elsif ($field[$i] eq 'type') { + $type_index = $i; + } elsif ($field[$i] eq 'score') { + $score_index = $i; + } elsif ($field[$i] eq 'total') { + $total_index = $i; + } elsif ($field[$i] eq 'used') { + $used_index = $i; + } elsif ($field[$i] eq 'ref/indel') { + $intype = 'indel'; + print STDERR "NOTICE: Automatically detected input type as $intype\n"; + $ref_indel_index = $i; + } elsif ($field[$i] eq 'Q(indel)') { + $quality_index = $i; + } elsif ($field[$i] eq 'max_gtype') { + $maxgtype_index = $i; + } elsif ($field[$i] eq 'bp1_reads') { + $bp1_reads_index = $i; + } elsif ($field[$i] eq 'ref_reads') { + $ref_reads_index = $i; + } elsif ($field[$i] eq 'indel_reads') { + $indel_reads_index = $i; + } elsif ($field[$i] eq 'other_reads') { + $other_reads_index = $i; + } + } + } + next; + } + + $intype or die "Error: unable to recognize the correct type of the input file (make sure that header line is present in $variantfile)\n"; + @field = split (/\t/, $_); + + if ($intype eq 'snp') { #SNPs + defined $pos_index and defined $reference_index and defined $call_index or die "Error: unalbe to find the position, reference and modified_call column header in $variantfile\n"; + my ($pos, $wt, $obs) = @field[$pos_index, $reference_index, $call_index]; + my (@other); + defined $pos and defined $wt and defined $obs or die; + if ($includeinfo) { + defined $score_index and push @other, $field[$score_index]; + defined $total_index and push @other, $field[$total_index]; + defined $used_index and push @other, $field[$used_index]; + defined $type_index and push @other, $field[$type_index]; + } + + length ($obs) == 1 and $obs .= $obs; + my @obs = split (//, $obs); + @obs == 2 or 
die "Error: observed allele $obs should correspond to two nucleotide alleles: <$_>\n"; + if ($obs[0] eq $wt and $obs[1] eq $wt) { + die "Error: reference alleles are identical to observed alleles: <$_>\n"; + } elsif ($obs[0] eq $wt) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; + } elsif ($obs[1] eq $wt) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; + } elsif ($obs[1] ne $obs[0]) { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n"; + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n"; + $countvar++; + } else { + print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n"; + } + $countvar++; + } elsif ($intype eq 'indel') { #indels + defined $pos_index and defined $ref_indel_index and defined $maxgtype_index or die "Error: unable to find the pos, ref_indel and max_gtype column header in $variantfile\n"; + my ($pos, $call, $hom, @other) = @field[$pos_index, $ref_indel_index, $maxgtype_index]; + if ($includeinfo) { + defined $quality_index and push @other, $field[$quality_index]; + defined $maxgtype_index and push @other, $field[$maxgtype_index]; + defined $bp1_reads_index and push @other, $field[$bp1_reads_index]; + defined $ref_reads_index and push @other, $field[$ref_reads_index]; + defined $indel_reads_index and push @other, $field[$indel_reads_index]; + defined $other_reads_index and push @other, $field[$other_reads_index]; + } + + #hg19 coordinate below; insertion needs position adjustment!!! 
deletion is fine + #948847 1I CCTCAGGCTT -/A ATAATAGGGC 969 hom 47 het 22 0 16 6 A 1 2 + #978604 2D CACTGAGCCC CT/-- GTGTCCTTCC 251 hom 20 het 8 0 4 4 CT 1 0 + #1276974 4I CCTCATGCAG ----/ACAC ACACATGCAC 838 hom 39 het 18 0 14 4 AC 2 4 + #1289368 2D AGCCCGGGAC TG/-- GGAGCCGCGC 1376 hom 83 het 33 0 25 9 TG 1 0 + #185137455 11I10M2I TATGTGTCCT -----------TTTTTTATTT--/AAATGATAGACTTTTTTTTTTAA ATTTCAGAAA 1126 het 988 hom 45 20 24 7 N/A 0 0 + #1276931 2D41M4I CACACACATG CACACACACGCACACACGTGCAATGTGAAAACACCTCATGCAG----/--CACACACGCACACACGTGCAATGTGAAAACACCTCATGCAGACAC ACACATGCAC 548 hom 16 het 8 0 11 11 N/A 0 0 + + my @obs = split (/\//, $call); + @obs == 2 or die "Error: observed indel allele $call should correspond to two alleles: <$_>\n"; + if ($obs[0] =~ m/^\-+$/) { #insertion + my $len = length ($obs[0]); + print $chr, "\t", $pos-1, "\t", $pos-1, "\t", '-', "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n"; + } elsif ($obs[1] =~ m/^\-+$/) { #deletion + my $len = length ($obs[0]); + print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[0], "\t", '-', "\t", $hom, "\t", join ("\t", @other), "\n"; + } elsif (length ($obs[0]) eq length ($obs[1])) { #block substitution + $obs[0] =~ s/\-//g; + $obs[1] =~ s/\-//g; + print $chr, "\t", $pos, "\t", $pos+length($obs[0])-1, "\t", $obs[0], "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n"; + } else { + die "Error: invalid record found in indel line: <$_>\n"; + } + $countvar++; + } else { + die "Error: invalid record found in $variantfile (11 or 15 fields expected, observed ${\(scalar @field)} fields): <$_>\n"; + } + } + print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n"; +} +

# convertVCF4 ($variantfile)
#
# Read a VCF version 4 file and print one ANNOVAR input line per variant call
# to STDOUT. A three-line summary is printed to STDERR at the end.
#
# Argument:
#   $variantfile   path of the VCF4 file; it is opened literally for reading.
#
# File-level options read by this subroutine:
#   $filterword    keep only records whose FILTER column contains this word
#   $coverage      skip records whose INFO DP is below this value
#   $maxcoverage   skip records whose INFO DP is above this value
#   $snpqual       skip SNVs whose INFO QD is below this value
#   $fraction      skip indels whose AC1/DP ratio is below this value
#   $confraction   skip indels whose AC1/AC2 ratio is below this value
#   $includeinfo   append the complete input line to every output line
#   $allallele     also print the second ALT allele of multi-allelic records
# A record that lacks the INFO tag a threshold relies on is never filtered
# by that threshold.
#
# Output columns (tab-delimited):
#   SNV:   chr start end ref alt zygosity QUAL [DP] [MQ] [QD] [input line]
#   indel: chr start end ref alt [zygosity] QUAL [DP] [AC1] [MQ] [input line]
# Bracketed columns appear only when the value is available. For indels the
# zygosity column is omitted when there is no sample column or its genotype
# is not recognised.
#
# Dies when the file cannot be opened, when a record has fewer than 8
# tab-delimited fields, or when the ##fileformat header declares a VCF
# version below 4.
#
# Input format notes
# ------------------
#format description: http://www.1000genomes.org/wiki/Analysis/vcf4.0
#standard VCF4 should have 8 columns, but some software may produce more columns (for example, for genotype calls). The first 8 columns should follow the specification
#
#example of VCF4 generated by GATK SNP caller
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
#1 55 . T G 34.82 . DP=2;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=14.16;MQ0=0;QD=17.41;SB=-10.00 GT:DP:GL:GQ 0/1:1:-6.66,-0.30,-0.00:1.76
#1 2646 . G A 40.91 . DP=4;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=7.50;MQ0=3;QD=10.23;SB=-10.00 GT:DP:GL:GQ 0/1:1:-7.27,-0.30,-0.00:1.76
#
#example of VCF4 generated by GATK indel caller
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
#1 2525324 . G GC . PASS AC=5,5;DP=12;MM=4.8,3.7142856;MQ=29.0,42.285713;NQSBQ=33.0,46.463768;NQSMM=0.24,0.20289855;SC=0,5,1,6 GT 0/1
#1 3553372 . GC G . PASS AC=6,6;DP=6;MM=0.8333333,0.0;MQ=60.0,0.0;NQSBQ=63.533333,0.0;NQSMM=0.0,0.0;SC=0,6,0,0 GT 1/0
#1 6093011 . CG C . PASS AC=31,31;DP=32;MM=0.7096774,2.0;MQ=59.64516,60.0;NQSBQ=64.192184,39.666668;NQSMM=0.0,0.11111111;SC=23,8,0,1 GT 1/0
#
#example of VCF4 generated by 1000G
#CHROM POS ID REF ALT QUAL FILTER INFO
#1 533 . G C . PASS AA=.;AC=6;AN=120;DP=423
#1 41342 . T A . PASS AA=.;AC=29;AN=120;DP=188
#1 41791 . G A . PASS AA=.;AC=5;AN=120;DP=192
#1 44449 . T C . PASS AA=C;AC=2;AN=120;DP=166
#1 44539 rs2462492 C T . PASS AA=T;AC=2;AN=120;DP=131
#
#example of VCF4 generated by 1000G
#CHROM POS ID REF ALT QUAL FILTER INFO
#1 1000153 . TCACA T 100 PASS AF=0.115095;HP=1;NF=16;NR=13;NS=52;CA=0;DP=615
#1 1000906 . CA C 48 PASS AF=0.0772696;HP=1;NF=2;NR=9;NS=51;CA=0;DP=281
#1 1000950 rs60561655;-/G CG C 100 PASS AF=0.447771;HP=5;DB;NF=10;NR=20;NS=50;CA=M;DP=291
#1 1010786 rs36095298;-/G,mills,venter A AG 100 PASS AF=0.774334;HP=1;DB;NF=21;NR=27;NS=51;CA=0;DP=306
#1 1026158 . T TGGGGG 100 PASS AF=0.115637;HP=1;NF=5;NR=2;NS=52;CA=0;DP=591
#
#reserved VCF4 sub-fields in the INFO field
# * AA ancestral allele
# * AC allele count in genotypes, for each ALT allele, in the same order as listed
# * AF allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes
# * AN total number of alleles in called genotypes
# * BQ RMS base quality at this position
# * CIGAR cigar string describing how to align an alternate allele to the reference allele
# * DB dbSNP membership
# * DP combined depth across samples, e.g. DP=154
# * END end position of the variant described in this record (esp. for CNVs)
# * H2 membership in hapmap2
# * MQ RMS mapping quality, e.g. MQ=52
# * MQ0 Number of MAPQ == 0 reads covering this record
# * NS Number of samples with data
# * SB strand bias at this position
# * SOMATIC indicates that the record is a somatic mutation, for cancer genomics
# * VALIDATED validated by follow-up experiment
#
#SAMtools/BCFtools specific information
#SAMtools/BCFtools may write the following tags in the INFO field in VCF/BCF.
#Tag Description
#I16 16 integers:
#1 #reference Q13 bases on the forward strand 2 #reference Q13 bases on the reverse strand
#3 #non-ref Q13 bases on the forward strand 4 #non-ref Q13 bases on the reverse strand
#5 sum of reference base qualities 6 sum of squares of reference base qualities
#7 sum of non-ref base qualities 8 sum of squares of non-ref base qualities
#9 sum of ref mapping qualities 10 sum of squares of ref mapping qualities
#11 sum of non-ref mapping qualities 12 sum of squares of non-ref mapping qualities
#13 sum of tail distance for ref bases 14 sum of squares of tail distance for ref bases
#15 sum of tail distance for non-ref bases 16 sum of squares of tail distance for non-ref
#INDEL Indicating the variant is an INDEL.
#DP The number of reads covering or bridging POS.
#DP4 Number of 1) forward ref alleles; 2) reverse ref; 3) forward non-ref; 4) reverse non-ref alleles, used in variant calling. Sum can be smaller than DP because low-quality bases are not counted.
#PV4 P-values for 1) strand bias (exact test); 2) baseQ bias (t-test); 3) mapQ bias (t); 4) tail distance bias (t)
#FQ Consensus quality. If positive, FQ equals the phred-scaled probability of there being two or more different alleles. If negative, FQ equals the minus phred-scaled probability of all chromosomes being identical. Notably, given one sample, FQ is positive at hets and negative at homs.
#AF1 EM estimate of the site allele frequency of the strongest non-reference allele.
#CI95 Equal-tail (Bayesian) credible interval of the site allele frequency at the 95% level.
#PC2 Phred-scaled probability of the alternate allele frequency of group1 samples being larger (,smaller) than of group2 samples.
#PCHI2 Posterior weighted chi^2 P-value between group1 and group2 samples. This P-value is conservative.
#QCHI2 Phred-scaled PCHI2
#RP Number of permutations yielding a smaller PCHI2
#
#example of triallelic variants generated by mpileup/bcftools
#1 156706559 . A C,G 114 . DP=20;AF1=1;CI95=1,1;DP4=0,0,1,19;MQ=60;FQ=-63 GT:PL:GQ 1/2:237,126,90,162,0,138:99
#6 31129642 . A G,C 76 . DP=31;AF1=1;CI95=1,1;DP4=0,0,28,3;MQ=60;FQ=-75 GT:PL:GQ 1/2:255,194,146,164,0,119:99
#1 11297762 . T C,A 98 . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78 GT:PL:GQ 1/1:131,51,0,120,28,117:99
#
#example VCF4 indel records:
#20 2 . TCG T . PASS DP=100
#Chr1 5473 . AT ATT 23.5 . INDEL;DP=16;AF1=0.5;CI95=0.5,0.5;DP4=4,2,3,1;MQ=42;PV4=1,0.41,0.042,0.24
#Chr1 6498 . ATTTT ATTTTT 53.5 . INDEL;DP=9;AF1=1;CI95=1,1;DP4=0,0,5,3;MQ=28
sub convertVCF4 {
    my ($variantfile) = @_;

    my ($countline, $countvar, $counthom, $counthet, $countunknown, $countindel, $countsnp, $countti, $counttv) = (0) x 9;
    my %is_transition = map { $_ => 1 } qw/AG GA CT TC/;    #keyed by REF.ALT

    #tally one printed call in the summary counters
    my $count_zygosity = sub {
        my ($zygosity) = @_;
        if ($zygosity eq 'het') {
            $counthet++;
        } elsif ($zygosity eq 'hom') {
            $counthom++;
        } else {
            $countunknown++;
        }
    };

    open (my $var_fh, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";

    while (my $line = <$var_fh>) {
        $countline++;

        if ($line =~ m/^##fileformat=VCFv(\d+)\./) {
            #capture only the major version so the comparison is purely numeric;
            #die (not exit) so that the caller sees a failing exit status
            $1 < 4 and die "ERROR: Input file is not in VCF version 4 format but is $line";
        }
        if ($line =~ m/^##UnifiedGenotyper/) {
            print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK UnifiedGenotyper\n";
            print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, RMS mapping quality, quality by depth\n";
            $fraction and print STDERR "WARNING: the --fraction argument will be ignored for GATK SNP calls!!!\n";
            $confraction and print STDERR "WARNING: the --confraction argument will be ignored for GATK SNP calls!!!\n";
        }
        if ($line =~ m/^##IndelGenotyper/) {
            print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK IndelGenotyper\n";
            print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, read count supporting indel call, RMS mapping quality\n";
        }

        $line =~ m/^#/ and next;        #skip comment lines
        $line =~ s/[\r\n]+$//;          #delete trailing new lines
        my $otherinfo = $line;          #this is the complete line (when -includeinfo is set, the entire line will be included in output file)

        my @field = split (/\t/, $line);
        @field >= 8 or die "Error: invalid record found in VCF4 file (at least 8 tab-delimited fields expected): <$line>\n";
        my ($chr, $start, undef, $ref_allele, $mut_allele, $quality_score, $filter, $info, undef, $sample) = @field;
        my $mut_allele2;

        if ($filterword) {              #ensure that the filter field contains the filterword
            $filter =~ m/\b$filterword\b/i or next;
        }

        #sometimes the alleles are not in the same case
        #chr1 1869771 1869774 actc aCTctc 43.5 13 INDEL;DP=13;AF1=0.5;CI95=0.5,0.5;DP4=0,4,4,0;MQ=37;PV4=0.029,0.45,1,0.46
        $ref_allele = uc $ref_allele;
        $mut_allele = uc $mut_allele;

        $mut_allele eq '.' and next;    #no variant call was made at this position

        if ($mut_allele =~ m/([^,]+),([^,]+)/) {    #only the first two ALT alleles are considered
            ($mut_allele, $mut_allele2) = ($1, $2);
        }

        #read depth filters apply to SNVs and indels alike; the two thresholds are independent of each other
        my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/;
        if (defined $unfiltered_read_depth) {
            $coverage and $unfiltered_read_depth < $coverage and next;
            $maxcoverage and $unfiltered_read_depth > $maxcoverage and next;
        }

        if (length ($ref_allele) == 1 and length ($mut_allele) == 1) {     ### output snv
            my ($MappingQuality) = $info =~ /MQ=([^;]+)/;
            my ($QualityByDepth) = $info =~ /QD=([^;]+)/;

            if ($snpqual) {             #the QD was used here as quality score
                defined $QualityByDepth and $QualityByDepth < $snpqual and next;
            }

            my $zygosity = _vcf4_zygosity ($sample);
            defined $zygosity or $zygosity = 'unknown';
            $count_zygosity->($zygosity);

            #the subject is called as homozygous for the first alternative allele (genotype 1/1. i.e. C/C), but since there was one read containing A, samtools still keep both alleles in the VCF file (but gives a very low probabilities for it).
            #1 11297762 . T C,A 98 . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78 GT:PL:GQ 1/1:131,51,0,120,28,117:99
            if ($mut_allele2 and $zygosity eq 'hom') {
                $mut_allele2 = '';
            }

            if (not $mut_allele2) {     #multi-allelic sites are counted neither as transition nor as transversion
                if ($is_transition{$ref_allele.$mut_allele}) {
                    $countti++;
                } else {
                    $counttv++;
                }
            }

            my @alleles = ($mut_allele);
            $allallele and $mut_allele2 and push @alleles, $mut_allele2;

            my @extra = ($zygosity, $quality_score, grep { defined $_ } ($unfiltered_read_depth, $MappingQuality, $QualityByDepth));
            $includeinfo and push @extra, $otherinfo;

            for my $allele (@alleles) {
                print join ("\t", $chr, $start, $start, $ref_allele, $allele, @extra), "\n";
            }
            $countsnp++;
        } elsif (length ($ref_allele) > 1 or length ($mut_allele) > 1) {   ### output indel
            my ($indel_read_depth1, $indel_read_depth2) = $info =~ /AC=([^,;]+),([^,;]+)/;    #number of reads supporting consensus indel, any indel

            if (defined $indel_read_depth1 and defined $unfiltered_read_depth) {
                #a zero denominator means the ratio cannot be evaluated, so the record is kept
                $unfiltered_read_depth > 0 and $indel_read_depth1/$unfiltered_read_depth < ($fraction || 0) and next;    #do not meet minimum alternative allele fraction threshold
                $indel_read_depth2 > 0 and $indel_read_depth1/$indel_read_depth2 < ($confraction || 0) and next;
            }

            my ($MappingQuality) = $info =~ /MQ=([^;]+),/;    #only the comma-separated (GATK indel) form is reported
            my $zygosity = _vcf4_zygosity ($sample);

            my @alleles = ($mut_allele);
            $allallele and $mut_allele2 and push @alleles, $mut_allele2;

            my @extra = ((defined $zygosity ? $zygosity : ()), $quality_score, grep { defined $_ } ($unfiltered_read_depth, $indel_read_depth1, $MappingQuality));
            $includeinfo and push @extra, $otherinfo;

            #every allele is converted from the untouched REF so that the second line does not depend on the first
            for my $allele (@alleles) {
                print join ("\t", $chr, _vcf4_indel_fields ($start, $ref_allele, $allele), @extra), "\n";
                defined $zygosity and $count_zygosity->($zygosity);
            }
            $countindel++;
        }
        $countvar++;
    }
    close ($var_fh);

    my $triallelic = $countsnp-$countti-$counttv;
    print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
    print STDERR "NOTICE: Among ${\($counthet+$counthom+$countunknown)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
    print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n";
}

# Classify the genotype at the start of a VCF sample column: 'het' for 0/1 or
# 1/0, 'hom' for 1/1, 'unknown' for any other x/y genotype, and undef when the
# column is missing or does not start with a slash-separated genotype.
sub _vcf4_zygosity {
    my ($sample) = @_;
    defined $sample or return;
    $sample =~ m#^(?:0/1|1/0)# and return 'het';
    $sample =~ m#^1/1# and return 'hom';
    $sample =~ m#^./.# and return 'unknown';
    return;
}

# Turn a VCF indel (POS, REF, ALT) into the ANNOVAR (start, end, ref, alt)
# columns: the shared leading bases of a pure deletion or insertion are
# removed and replaced by '-'; anything else is a block substitution.
sub _vcf4_indel_fields {
    my ($start, $ref, $alt) = @_;
    my $ref_end = $start + length ($ref) - 1;

    if (length ($ref) > length ($alt)) {
        if (substr ($ref, 0, length ($alt)) eq $alt) {      #deletion
            return ($start + length ($alt), $ref_end, substr ($ref, length ($alt)), '-');
        }
    } elsif (substr ($alt, 0, length ($ref)) eq $ref) {     #insertion
        return ($ref_end, $ref_end, '-', substr ($alt, length ($ref)));
    }
    return ($start, $ref_end, $ref, $alt);                  #block substitution
}

=head1 SYNOPSIS + + convert2annovar.pl [arguments] <variantfile> + + Optional arguments: + -h, --help print help message + -m, --man print complete documentation + -v, --verbose use verbose output + --format <string> input format (default: pileup) + --outfile <file> output file name (default: STDOUT) + --snpqual <float> quality score threshold in pileup file (default: 20) + --snppvalue <float> SNP P-value threshold in GFF3-SOLiD file (default: 1) + --coverage <int> read coverage threshold in pileup file (default: 0) + --maxcoverage <int> maximum coverage threshold (default: none) + --includeinfo include supporting information in output + --chr <string> specify the chromosome (for CASAVA format) + --chrmt <string> chr identifier for mitochondria (default: M) + --altcov <int> alternative allele coverage threshold (for pileup format) + --fraction <float> minimum allelic fraction to claim a mutation (for pileup/vcf4_indel format) + --species <string> if human, convert chr23/24/25 to X/Y/M (for gff3-solid format) + --filter <string> output variants with this filter (case insensitive, for vcf4 format) + --confraction <float> minimum consensus indel / all indel fraction (for vcf4_indel format) + --allallele print all alleles when multiple calls are present (for vcf4 format) + + Function: convert variant call file generated from various software programs + into ANNOVAR input format + + Example: convert2annovar.pl -format pileup -outfile variant.query variant.pileup + convert2annovar.pl -format cg -outfile variant.query variant.cg + convert2annovar.pl -format gff3-solid -outfile variant.query variant.snp.gff + convert2annovar.pl -format soap variant.snp > variant.avinput + convert2annovar.pl -format maq variant.snp > variant.avinput + convert2annovar.pl -format casava -chr 1 variant.snp > variant.avinput + convert2annovar.pl -format vcf4 variantfile > variant.avinput + convert2annovar.pl -format vcf4 -filter pass variantfile > variant.avinput + 
Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $ + +=head1 OPTIONS + +=over 8 + +=item B<--help> + +print a brief usage message and detailed explanation of options. + +=item B<--man> + +print the complete manual of the program. + +=item B<--verbose> + +use verbose output. + +=item B<--format> + +the format of the input files. + +=item B<--outfile> + +specify the output file name. By default, output is written to STDOUT. + +=item B<--snpqual> + +quality score threshold in the pileup file, such that variant calls with lower +quality scores will not be printed out in the output file. When VCF4 file is +used, this argument works on the Quality-by-Depth measure, rather than the raw +quality measure. + +=item B<--coverage> + +read coverage threshold in the pileup file, such that variants calls generated +with lower coverage will not be printed in the output file. + +=item B<--includeinfo> + +specify that the output should contain additional information in the input line. +By default, only the chr, start, end, reference allele, observed allele and +homozygosity status are included in output files. + +=item B<--chr> + +specify the chromosome for CASAVA format + +=item B<--chrmt> + +specify the name of mitochondria chromosome (default is MT) + +=item B<--altcov> + +the minimum coverage of the alternative (mutated) allele to be printed out in +output + +=item B<--fraction> + +specify the minimum fraction of alternative allele, to print out the mutation. +For example, a site has 10 reads, 3 supports alternative allele. A -fraction of +0.4 will not allow the mutation to be printed out. + +=item B<--species> + +specify the species from which the sequencing data is obtained. For the GFF3- +SOLiD format, when species is human, the chromosome 23, 24 and 25 will be +converted to X, Y and M, respectively. + +=item B<--filter> + +for VCF4 file, only print out variant calls with this filter annotated. 
For +example, if using GATK VariantFiltration walker, you will see PASS, +GATKStandard, HARD_TO_VALIDATE, etc in the filter field. Using 'pass' as a +filter is recommended in this case. + +=item B<--confraction> + +consensus indel fraction, calculated as reads supporting consensus indels divided +by reads supporting any indels + +=item B<--allallele> + +print all alleles for mutations at a locus, rather than the first allele, if the +input VCF4 file contains multiple alternative alleles for a mutation. By +default, this option is off. When it is on, two lines will be printed out in the +output, and both will have the same quality scores as VCF4 does not provide +separate quality scores for individual alleles. + +=back + +=head1 DESCRIPTION + +This program is used to convert variant call file generated from various +software programs into ANNOVAR input format. Currently, the program can handle +Samtools genotype-calling pileup format, Solid GFF format, Complete Genomics +variant format, SOAP format. These formats are described below. + +=over 8 + +=item * B<pileup format> + +The pileup format can be produced by the Samtools genotyping calling subroutine. +Note that the phrase pileup format can be used in several instances, and here I +am only referring to the pileup files that contain the actual genotype calls. + +Using SamTools, given an alignment file in BAM format, a pileup file with +genotype calls can be produced by the command below: + + samtools pileup -vcf ref.fa aln.bam> raw.pileup + samtools.pl varFilter raw.pileup > final.pileup + +ANNOVAR will automatically filter the pileup file so that only SNPs reaching a +quality threshold are printed out (default is 20, use --snpqual argument to +change this). Most likely, users may want to also apply a coverage threshold, +such that SNP calls from only a few reads are not considered. This can be +achieved using the -coverage argument (default value is 0). 
+ +An example of pileup files for SNPs is shown below: + + chr1 556674 G G 54 0 60 16 a,.....,...,.... (B%A+%7B;0;%=B<: + chr1 556675 C C 55 0 60 16 ,,..A..,...,.... CB%%5%,A/+,%.... + chr1 556676 C C 59 0 60 16 g,.....,...,.... .B%%.%.?.=/%...1 + chr1 556677 G G 75 0 60 16 ,$,.....,...,.... .B%%9%5A6?)%;?:< + chr1 556678 G K 60 60 60 24 ,$.....,...,....^~t^~t^~t^~t^~t^~t^~t^~t^~t B%%B%<A;AA%??<=??;BA%B89 + chr1 556679 C C 61 0 60 23 .....a...a....,,,,,,,,, %%1%&?*:2%*&)(89/1A@B@@ + chr1 556680 G K 88 93 60 23 ..A..,..A,....ttttttttt %%)%7B:B0%55:7=>>A@B?B; + chr1 556681 C C 102 0 60 25 .$....,...,....,,,,,,,,,^~,^~. %%3%.B*4.%.34.6./B=?@@>5. + chr1 556682 A A 70 0 60 24 ...C,...,....,,,,,,,,,,. %:%(B:A4%7A?;A><<999=<< + chr1 556683 G G 99 0 60 24 ....,...,....,,,,,,,,,,. %A%3B@%?%C?AB@BB/./-1A7? + +The columns are chromosome, 1-based coordinate, reference base, consensus base, +consensus quality, SNP quality, maximum mapping quality of the reads covering +the sites, the number of reads covering the site, read bases and base qualities. + +An example of pileup files for indels is shown below: + + seq2 156 * +AG/+AG 71 252 99 11 +AG * 3 8 0 + +ANNOVAR automatically recognizes both SNPs and indels in pileup file, and process them correctly. + +=item * B<GFF3-SOLiD format> + +The SOLiD provides a GFF3-compatible format for SNPs, indels and structural +variants. A typical example file is given below: + + ##gff-version 3 + ##solid-gff-version 0.3 + ##source-version 2 + ##type DNA + ##date 2009-03-13 + ##time 0:0:0 + ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 + ##reference-file + ##input-files Yoruban_snp_10x.txt + ##run-path + chr_name AB_SOLiD SNP caller SNP coord coord 1 . . 
coverage=# cov;ref_base=ref;ref_score=score;ref_confi=confi;ref_single=Single;ref_paired=Paired;consen_base=consen;consen_score=score;consen_confi=conf;consen_single=Single;consen_paired=Paired;rs_id=rs_id,dbSNP129 + 1 AB_SOLiD SNP caller SNP 997 997 1 . . coverage=3;ref_base=A;ref_score=0.3284;ref_confi=0.9142;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6716;consen_confi=0.9349;consen_single=0/0;consen_paired=2/2 + 1 AB_SOLiD SNP caller SNP 2061 2061 1 . . coverage=2;ref_base=G;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.8985;consen_single=0/0;consen_paired=2/2 + 1 AB_SOLiD SNP caller SNP 4770 4770 1 . . coverage=2;ref_base=A;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=G;consen_score=1.0000;consen_confi=0.8854;consen_single=0/0;consen_paired=2/2 + 1 AB_SOLiD SNP caller SNP 4793 4793 1 . . coverage=14;ref_base=A;ref_score=0.0723;ref_confi=0.8746;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6549;consen_confi=0.8798;consen_single=0/0;consen_paired=9/9 + 1 AB_SOLiD SNP caller SNP 6241 6241 1 . . coverage=2;ref_base=T;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.7839;consen_single=0/0;consen_paired=2/2 + +Newer version of ABI BioScope now use diBayes caller, and the output file is given below: + + ##gff-version 3 + ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 + ##List of SNPs. 
Date Sat Dec 18 10:30:45 2010 Stringency: medium Mate Pair: 1 Read Length: 50 Polymorphism Rate: 0.003000 Bayes Coverage: 60 Bayes_Single_SNP: 1 Filter_Single_SNP: 1 Quick_P_Threshold: 0.997000 Bayes_P_Threshold: 0.040000 Minimum_Allele_Ratio: 0.150000 Minimum_Allele_Ratio_Multiple_of_Dicolor_Error: 100 + ##1 chr1 + ##2 chr2 + ##3 chr3 + ##4 chr4 + ##5 chr5 + ##6 chr6 + ##7 chr7 + ##8 chr8 + ##9 chr9 + ##10 chr10 + ##11 chr11 + ##12 chr12 + ##13 chr13 + ##14 chr14 + ##15 chr15 + ##16 chr16 + ##17 chr17 + ##18 chr18 + ##19 chr19 + ##20 chr20 + ##21 chr21 + ##22 chr22 + ##23 chrX + ##24 chrY + ##25 chrM + # source-version SOLiD BioScope diBayes(SNP caller) + #Chr Source Type Pos_Start Pos_End Score Strand Phase Attributes + chr1 SOLiD_diBayes SNP 221367 221367 0.091151 . . genotype=R;reference=G;coverage=3;refAlleleCounts=1;refAlleleStarts=1;refAlleleMeanQV=29;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=27;diColor1=11;diColor2=33;het=1;flag= + chr1 SOLiD_diBayes SNP 555317 555317 0.095188 . . genotype=Y;reference=T;coverage=13;refAlleleCounts=11;refAlleleStarts=10;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=00;diColor2=22;het=1;flag= + chr1 SOLiD_diBayes SNP 555327 555327 0.037582 . . genotype=Y;reference=T;coverage=12;refAlleleCounts=6;refAlleleStarts=6;refAlleleMeanQV=19;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=12;diColor2=30;het=1;flag= + chr1 SOLiD_diBayes SNP 559817 559817 0.094413 . . genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag= + chr1 SOLiD_diBayes SNP 714068 714068 0.000000 . . 
genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag= + The file conforms to standard GFF3 specifications, but the last column is solid- + specific and it gives certain parameters for the SNP calls. + +An example of the short indel format by GFF3-SOLiD is given below: + + ##gff-version 3 + ##solid-gff-version 0.3 + ##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49 + ##type DNA + ##date 2009-01-26 + ##time 18:33:20 + ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 + ##reference-file + ##input-files ../../mp-results/JOAN_20080104_1.pas,../../mp-results/BARB_20071114_1.pas,../../mp-results/BARB_20080227_2.pas + ##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-w2x25-2x-4x-8x-10x/2x + ##Filter-settings: max-ave-read-pos=none,min-ave-from-end-pos=9.1,max-nonreds-4filt=2,min-insertion-size=none,min-deletion-size=none,max-insertion-size=none,max-deletion-size=none,require-called-indel-size?=T + chr1 AB_SOLiD Small Indel Tool deletion 824501 824501 1 . . del_len=1;tight_chrom_pos=824501-824502;loose_chrom_pos=824501-824502;no_nonred_reads=2;no_mismatches=1,0;read_pos=4,6;from_end_pos=21,19;strands=+,-;tags=R3,F3;indel_sizes=-1,-1;read_seqs=G3021212231123203300032223,T3321132212120222323222101;dbSNP=rs34941678,chr1:824502-824502(-),EXACT,1,/GG + chr1 AB_SOLiD Small Indel Tool insertion_site 1118641 1118641 1 . . ins_len=3;tight_chrom_pos=1118641-1118642;loose_chrom_pos=1118641-1118642;no_nonred_reads=2;no_mismatches=0,1;read_pos=17,6;from_end_pos=8,19;strands=+,+;tags=F3,R3;indel_sizes=3,3;read_seqs=T0033001100022331122033112,G3233112203311220000001002 + +The keyword deletion or insertion_site is used in the fourth column to indicate +that file format. 
+ +An example of the medium CNV format by GFF3-SOLiD is given below: + + ##gff-version 3 + ##solid-gff-version 0.3 + ##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49 + ##type DNA + ##date 2009-01-27 + ##time 15:54:36 + ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 + ##reference-file + ##input-files big_d20e5-del12n_up-ConsGrp-2nonred.pas.sum + ##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-results-lmp-e5/big_d20e5-indel_950_2050 + chr1 AB_SOLiD Small Indel Tool deletion 3087770 3087831 1 . . del_len=62;tight_chrom_pos=none;loose_chrom_pos=3087768-3087773;no_nonred_reads=2;no_mismatches=2,2;read_pos=27,24;from_end_pos=23,26;strands=-,+;tags=F3,F3;indel_sizes=-62,-62;read_seqs=T11113022103331111130221213201111302212132011113022,T02203111102312122031111023121220311111333012203111 + chr1 AB_SOLiD Small Indel Tool deletion 4104535 4104584 1 . . del_len=50;tight_chrom_pos=4104534-4104537;loose_chrom_pos=4104528-4104545;no_nonred_reads=3;no_mismatches=0,4,4;read_pos=19,19,27;from_end_pos=31,31,23;strands=+,+,-;tags=F3,R3,R3;indel_sizes=-50,-50,-50;read_seqs=T31011011013211110130332130332132110110132020312332,G21031011013211112130332130332132110132132020312332,G20321302023001101123123303103303101113231011011011 + chr1 AB_SOLiD Small Indel Tool insertion_site 2044888 2044888 1 . . ins_len=18;tight_chrom_pos=2044887-2044888;loose_chrom_pos=2044887-2044889;no_nonred_reads=2;bead_ids=1217_1811_209,1316_908_1346;no_mismatches=0,2;read_pos=13,15;from_end_pos=37,35;strands=-,-;tags=F3,F3;indel_sizes=18,18;read_seqs=T31002301231011013121000101233323031121002301231011,T11121002301231011013121000101233323031121000101231;non_indel_no_mismatches=3,1;non_indel_seqs=NIL,NIL + chr1 AB_SOLiD Small Indel Tool insertion_site 74832565 74832565 1 . . 
ins_len=16;tight_chrom_pos=74832545-74832565;loose_chrom_pos=74832545-74832565;no_nonred_reads=2;bead_ids=1795_181_514,1651_740_519;no_mismatches=0,2;read_pos=13,13;from_end_pos=37,37;strands=-,-;tags=F3,R3;indel_sizes=16,16;read_seqs=T33311111111111111111111111111111111111111111111111,G23311111111111111111111111111111111111111311011111;non_indel_no_mismatches=1,0;non_indel_seqs=NIL,NIL + +An example of the large indel format by GFF3-SOLiD is given below: + + ##gff-version 3 + ##solid-gff-version 0.3 + ##source-version ??? + ##type DNA + ##date 2009-03-13 + ##time 0:0:0 + ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 + ##reference-file + ##input-files /data/results5/yoruban_strikes_back_large_indels/LMP/five_mm_unique_hits_no_rescue/5_point_6x_del_lib_1/results/NA18507_inter_read_indels_5_point_6x.dat + ##run-path + chr1 AB_SOLiD Large Indel Tool insertion_site 1307279 1307791 1 . . deviation=-742;stddev=7.18;ref_clones=-;dev_clones=4 + chr1 AB_SOLiD Large Indel Tool insertion_site 2042742 2042861 1 . . deviation=-933;stddev=8.14;ref_clones=-;dev_clones=3 + chr1 AB_SOLiD Large Indel Tool insertion_site 2443482 2444342 1 . . deviation=-547;stddev=11.36;ref_clones=-;dev_clones=17 + chr1 AB_SOLiD Large Indel Tool insertion_site 2932046 2932984 1 . . deviation=-329;stddev=6.07;ref_clones=-;dev_clones=14 + chr1 AB_SOLiD Large Indel Tool insertion_site 3166925 3167584 1 . . deviation=-752;stddev=13.81;ref_clones=-;dev_clones=14 + +An example of the CNV format by GFF3-SOLiD if given below: + + ##gff-version 3 + ##solid-gff-version 0.3 + ##source-version ??? + ##type DNA + ##date 2009-03-13 + ##time 0:0:0 + ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141 + ##reference-file + ##input-files Yoruban_cnv.coords + ##run-path + chr1 AB_CNV_PIPELINE repeat_region 1062939 1066829 . . . 
fraction_mappable=51.400002;logratio=-1.039300;copynum=1;numwindows=1 + chr1 AB_CNV_PIPELINE repeat_region 1073630 1078667 . . . fraction_mappable=81.000000;logratio=-1.409500;copynum=1;numwindows=2 + chr1 AB_CNV_PIPELINE repeat_region 2148325 2150352 . . . fraction_mappable=98.699997;logratio=-1.055000;copynum=1;numwindows=1 + chr1 AB_CNV_PIPELINE repeat_region 2245558 2248109 . . . fraction_mappable=78.400002;logratio=-1.042900;copynum=1;numwindows=1 + chr1 AB_CNV_PIPELINE repeat_region 3489252 3492632 . . . fraction_mappable=59.200001;logratio=-1.119900;copynum=1;numwindows=1 + chr1 AB_CNV_PIPELINE repeat_region 5654415 5657276 . . . fraction_mappable=69.900002;logratio=1.114500;copynum=4;numwindows=1 + chr1 AB_CNV_PIPELINE repeat_region 9516165 9522726 . . . fraction_mappable=65.850006;logratio=-1.316700;numwindows=2 + chr1 AB_CNV_PIPELINE repeat_region 16795117 16841025 . . . fraction_mappable=44.600002;logratio=1.880778;copynum=7;numwindows=9 + +The keyword repeat_region is used here, although it actually refers to CNVs. + +An example of the inversion format by GFF3-SOLiD is given below: + + ##gff-version 3 + ##solid-gff-version 0.2 + ##generated by SOLiD inversion tool + chr10 AB_SOLiD inversion 46443107 46479585 268.9 . . left=chr10:46443107-46443146;right=chr10:46479583-46479585;leftscore=295.0;rightscore=247.0;count_AAA_further_left=117;count_AAA_left=3;count_AAA_right=3;count_AAA_further_right=97;left_min_count_AAA=chr10:46443107-46443112;count_AAA_min_left=0;count_AAA_max_left=3;right_min_count_AAA=chr10:46479585-46479585;count_AAA_min_right=1;count_AAA_max_right=3;homozygous=UNKNOWN + chr4 AB_SOLiD inversion 190822813 190850112 214.7 . . 
left=chr4:190822813-190822922;right=chr4:190850110-190850112;leftscore=140.0;rightscore=460.0;count_AAA_further_left=110;count_AAA_left=78;count_AAA_right=74;count_AAA_further_right=77;left_min_count_AAA=chr4:190822813-190822814;count_AAA_min_left=69;count_AAA_max_left=77;right_min_count_AAA=chr4:190850110-190850112;count_AAA_min_right=74;count_AAA_max_right=74;homozygous=NO + chr6 AB_SOLiD inversion 168834969 168837154 175.3 . . left=chr6:168834969-168835496;right=chr6:168836643-168837154;leftscore=185.4;rightscore=166.2;count_AAA_further_left=67;count_AAA_left=43;count_AAA_right=40;count_AAA_further_right=59;left_min_count_AAA=chr6:168835058-168835124,chr6:168835143-168835161,chr6:168835176-168835181,chr6:168835231-168835262;count_AAA_min_left=23;count_AAA_max_left=29;right_min_count_AAA=chr6:168836643-168836652;count_AAA_min_right=23;count_AAA_max_right=31;homozygous=NO + +The program should be able to recognize all the above GFF3-SOLiD format +automatically, and handle them accordingly. + +=item * B<Complete Genomics format> + +This format is provided by the Complete Genomics company to their customers. The +file var-[ASM-ID].tsv.bz2 includes a description of all loci where the assembled +genome differs from the reference genome. + +An example of the Complete Genomics format is shown below: + + #BUILD 1.5.0.5 + #GENERATED_AT 2009-Nov-03 19:52:21.722927 + #GENERATED_BY dbsnptool + #TYPE VAR-ANNOTATION + #VAR_ANN_SET /Proj/Pipeline/Production_Data/REF/HUMAN-F_06-REF/dbSNP.csv + #VAR_ANN_TYPE dbSNP + #VERSION 0.3 + + >locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef + 1 2 all chr1 0 959 no-call = ? + 2 2 all chr1 959 972 = = = + 3 2 all chr1 972 1001 no-call = ? + 4 2 all chr1 1001 1008 = = = + 5 2 all chr1 1008 1114 no-call = ? + 6 2 all chr1 1114 1125 = = = + 7 2 all chr1 1125 1191 no-call = ? + 8 2 all chr1 1191 1225 = = = + 9 2 all chr1 1225 1258 no-call = ? 
+ 10 2 all chr1 1258 1267 = = = + 12 2 all chr1 1267 1275 no-call = ? + 13 2 all chr1 1275 1316 = = = + 14 2 all chr1 1316 1346 no-call = ? + 15 2 all chr1 1346 1367 = = = + 16 2 all chr1 1367 1374 no-call = ? + 17 2 all chr1 1374 1388 = = = + 18 2 all chr1 1388 1431 no-call = ? + 19 2 all chr1 1431 1447 = = = + 20 2 all chr1 1447 1454 no-call = ? + +The following information is provided in documentation from Complete Genomics, that describes the var-ASM format. + + 1. locus. Identifier of a particular genomic locus + 2. ploidy. The ploidy of the reference genome at the locus (= 2 for autosomes, 2 for pseudoautosomal regions on the sex chromosomes, 1 for males on the non-pseudoautosomal parts of the sex chromosomes, 1 for mitochondrion, '?' if varType is 'no-ref' or 'PAR-called-in-X'). The reported ploidy is fully determined by gender, chromosome and location, and is not inferred from the sequence data. + 3. haplotype. Identifier for each haplotype at the variation locus. For diploid genomes, 1 or 2. Shorthand of 'all' is allowed where the varType field is one of 'ref', 'no-call', 'no-ref', or 'PAR-called-in-X'. Haplotype numbering does not imply phasing; haplotype 1 in locus 1 is not necessarily in phase with haplotype 1 in locus 2. See hapLink, below, for phasing information. + 4. chromosome. Chromosome name in text: 'chr1','chr2', ... ,'chr22','chrX','chrY'. The mitochondrion is represented as 'chrM'. The pseudoautosomal regions within the sex chromosomes X and Y are reported at their coordinates on chromosome X. + 5. begin. Reference coordinate specifying the start of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information. + 6. end. Reference coordinate specifying the end of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information. + 7. varType. 
Type of variation, currently one of: + snp: single-nucleotide polymorphism + ins: insertion + del: deletion + sub: Substitution of one or more reference bases with the bases in the allele column + 'ref' : no variation; the sequence is identical to the reference sequence on the indicated haplotype + no-call-rc: 'no-call reference consistent' one or more bases are ambiguous, but the allele is potentially consistent with the reference + no-call-ri: 'no-call reference inconsistent' one or more bases are ambiguous, but the allele is definitely inconsistent with the reference + no-call: an allele is completely indeterminate in length and composition, i.e. alleleSeq = '?' + no-ref: the reference sequence is unspecified at this locus. + PAR-called-in-X: this locus overlaps one of the pseudoautosomal regions on the sex chromosomes. The called sequence is reported as diploid sequence on Chromosome X; on chromosome Y the sequence is reported as varType = 'PAR-called-in-X'. + 8. reference. The reference sequence for the locus of variation. Empty when varType is ins. A value of '=' indicates that the user must consult the reference for the sequence; this shorthand is only used in regions where no haplotype deviates from the reference sequence. + 9. alleleSeq. The observed sequence at the locus of variation. Empty when varType is del. '?' is used to indicate 0 or more unknown bases within the sequence; 'N' is used to indicate exactly one unknown base within the sequence. '=' is used as shorthand to indicate identity to the reference sequence for non-variant sequence, i.e. when varType is 'ref'. + 10. totalScore. A score corresponding to a single variation and haplotype, representing the confidence in the call. + 11. hapLink. Identifier that links a haplotype at one locus to haplotypes at other loci. Currently only populated for very proximate variations that were assembled together. Two calls that share a hapLink identifier are expected to be on the same haplotype. + 12. xRef. 
Field containing external variation identifiers, currently only populated for variations corroborated directly by dbSNP. Format: dbsnp:[rsID], with multiple entries separated by the semicolon (;). + +In older versions of the format specification, the sub keyword used to be insdel +keyword. ANNOVAR takes care of this. + +=item * B<SOAPsnp format> + +An example of the SOAP SNP caller format is shown below: + + chr8 35782 A R 1 A 27 1 2 G 26 1 2 5 0.500000 2.00000 1 5 + chr8 35787 G R 0 G 25 4 6 A 17 2 4 10 0.266667 1.60000 0 5 + +The following information is provided in documentation from BGI who developed +SOAP suite. It differs slightly from the description at the SOAPsnp website, and +presumably the website is outdated. + + Format description:(left to right) + 1. Chromosome name + 2. Position of locus + 3. Nucleotide at corresponding locus of reference sequence + 4. Genotype of sequencing sample + 5. Quality value + 6. nucleotide with the highest probability(first nucleotide) + 7. Quality value of the nucleotide with the highest probability + 8. Number of supported reads that can only be aligned to this locus + 9. Number of all supported reads that can be aligned to this locus + 10. Nucleotide with higher probability + 11. Quality value of nucleotide with higher probability + 12. Number of supported reads that can only be aligned to this locus + 13. Number of all supported reads that can be aligned to this locus + 14. Total number of reads that can be aligned to this locus + 15. Order and quality value + 16. Estimated copy number for this locus + 17. Presence of this locus in the dbSNP database. 1 refers to presence and 0 refers to inexistence + 18. The distance between this locus and another closest SNP + +=item * B<SOAPindel format> + +The current version of ANNOVAR handles SoapSNP and SoapIndel automatically via a +single argument '--format soap'. 
An example of SOAP indel caller format is shown +below: + + chr11 44061282 - +2 CT Hete + chr11 45901572 + +1 C Hete + chr11 48242562 * -3 TTC Homo + chr11 57228723 * +4 CTTT Homo + chr11 57228734 * +4 CTTT Homo + chr11 57555685 * -1 C Hete + chr11 61482191 - +3 TCC Hete + chr11 64608031 * -1 T Homo + chr11 64654936 * +1 C Homo + chr11 71188303 + -1 T Hete + chr11 75741034 + +1 T Hete + chr11 76632438 * +1 A Hete + chr11 89578266 * -2 AG Homo + chr11 104383261 * +1 T Hete + chr11 124125940 + +4 CCCC Hete + chr12 7760052 * +1 T Homo + chr12 8266049 * +3 ACG Homo + +As of September 2010, I have not found any documentation describing this format. + +=item * B<SOAPsv format> + +An example is given below: + + Chr2 Deletion 42894 43832 43167 43555 388 0-0-0 FR 41 + +An explanation of the structural variation format is given below: + + Format description (from left to right) + 1. Chromosome name + 2. Type of structure variation + 3. Minimal value of start position in cluster + 4. Maximal value of end position in cluster + 5. Estimated start position of this structure variation + 6. Estimated end position of this structure variation + 7. Length of SV + 8. Breakpoint of SV (only for insertion) + 9. Unusual matching mode (F refers to align with forward sequence, R refers + to align with reverse + sequence) + 10. Number of paired-end reads which support this structure variation + +=item * B<MAQ format> + +MAQ can perform alignment and generate genotype calls, including SNP calls and +indel calls. 
The format is described below: + +For indel header: The output is TAB delimited with each line consisting of chromosome, start +position, type of the indel, number of reads across the indel, size of the indel +and inserted/deleted nucleotides (separated by colon), number of indels on the +reverse strand, number of indels on the forward strand, 5' sequence ahead of the +indel, 3' sequence following the indel, number of reads aligned without indels +and three additional columns for filters. + +An example is below: + + chr10 110583 - 2 -2:AG 0 1 GCGAGACTCAGTATCAAAAAAAAAAAAAAAAA AGAAAGAAAGAAAAAGAAAAAAATAGAAAGAA 1 @2, @72, @0, + chr10 120134 - 8 -2:CA 0 1 CTCTTGCCCGCTCACACATGTACACACACGCG CACACACACACACACACATCAGCTACCTACCT 7 @65,62,61,61,45,22,7, @9,12,13,13,29,52,67, @0,0,0,0,0,0,0, + chr10 129630 - 1 -1:T 1 0 ATGTTGTGACTCTTAATGGATAAGTTCAGTCA TTTTTTTTTAGCTTTTAACCGGACAAAAAAAG 0 @ @ @ + chr10 150209 - 1 4:TTCC 1 0 GCATATAGGGATGGGCACTTTACCTTTCTTTT TTCCTTCCTTCCTTCCTTCCCTTTCCTTTCCT 0 @ @ @ + chr10 150244 - 2 -4:TTCT 0 1 CTTCCTTCCTTCCTTCCCTTTCCTTTCCTTTC TTCTTTCTTTCTTTCTTTCTTTTTTTTTTTTT 0 @ @ @ + chr10 159622 - 1 3:AGG 0 1 GAAGGAGGAAGGACGGAAGGAGGAAGGAAGGA AGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGA 0 @ @ @ + chr10 206372 - 2 2:GT 1 0 ATAATAGTAACTGTGTATTTGATTATGTGTGC GTGTGTGTGTGTGTGTGTGTGTGTGCGTGCTT 1 @37, @37, @8, + chr10 245751 - 11 -1:C 0 1 CTCATAAATACAAGTCATAATGAAAGAAATTA CCACCATTTTCTTATTTTCATTCATTTTTAGT 10 @69,64,53,41,30,25,22,14,5,4, @5,10,21,33,44,49,52,60,69,70, @0,0,0,0,0,0,0,0,0,0, + chr10 253066 - 1 2:TT 0 1 TATTGATGAGGGTGGATTATACTTTAGAACAC TATTCAAACAGTTCTTCCACATATCTCCCTTT 0 @ @ @ + chr10 253455 - 2 -3:AAA 1 0 GTTGCACTCCAGCCTGGCGAGATTCTGTCTCC AAAAAAAAAAAAAAAAATTGTTGTGAAATACA 1 @55, @19, @4, + +For snp output file: Each line consists of chromosome, position, reference base, +consensus base, Phred-like consensus quality, read depth, the average number of +hits of reads covering this position, the highest mapping quality of the reads +covering the position, the minimum consensus 
quality in the 3bp flanking regions +at each side of the site (6bp in total), the second best call, log likelihood +ratio of the second best and the third best call, and the third best call. + +An example is below: + + chr10 83603 C T 28 12 2.81 63 34 Y 26 C + chr10 83945 G R 59 61 4.75 63 62 A 47 G + chr10 83978 G R 47 40 3.31 63 62 A 21 G + chr10 84026 G R 89 22 2.44 63 62 G 49 A + chr10 84545 C T 54 9 1.69 63 30 N 135 N + chr10 85074 G A 42 5 1.19 63 38 N 108 N + chr10 85226 A T 42 5 1.00 63 42 N 107 N + chr10 85229 C T 42 5 1.00 63 42 N 112 N + chr10 87518 A G 39 4 3.25 63 38 N 9 N + chr10 116402 T C 39 4 1.00 63 38 N 76 N + + +=item * B<CASAVA format> + +An example of Illumina CASAVA format is given below: + + #position A C G T modified_call total used score reference type + 14930 3 0 8 0 GA 11 11 29.10:11.10 A SNP_het2 + 14933 4 0 7 0 GA 11 11 23.50:13.70 G SNP_het1 + 14976 3 0 8 0 GA 11 11 24.09:9.10 G SNP_het1 + 15118 2 1 4 0 GA 8 7 10.84:6.30 A SNP_het2 + +An example of the indels is given below: + + # ** CASAVA depth-filtered indel calls ** + #$ CMDLINE /illumina/pipeline/install/CASAVA_v1.7.0/libexec/CASAVA-1.7.0/filterIndelCalls.pl--meanReadDepth=2.60395068970547 --indelsCovCutoff=-1 --chrom=chr1.fa /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0000.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0001.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0002.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0003.txt 
/data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0004.txt + #$ CHROMOSOME chr1.fa + #$ MAX_DEPTH undefined + # + #$ COLUMNS pos CIGAR ref_upstream ref/indel ref_downstream Q(indel) max_gtype Q(max_gtype) max2_gtype bp1_reads ref_reads indel_reads other_reads repeat_unit ref_repeat_count indel_repeat_count + 948847 1I CCTCAGGCTT -/A ATAATAGGGC 969 hom 47 het 22 0 16 6 A 1 2 + 978604 2D CACTGAGCCC CT/-- GTGTCCTTCC 251 hom 20 het 8 0 4 4 CT 1 0 + 1276974 4I CCTCATGCAG ----/ACAC ACACATGCAC 838 hom 39 het 18 0 14 4 AC 2 4 + 1289368 2D AGCCCGGGAC TG/-- GGAGCCGCGC 1376 hom 83 het 33 0 25 9 TG 1 0 + +=item * B<VCF4 format> + +VCF4 can be used to describe both population-level variation information, or for +reads derived from a single individual. + +One example of the indel format for one individual is given below: + + ##fileformat=VCFv4.0 + ##IGv2_bam_file_used=MIAPACA2.alnReAln.bam + ##INFO=<ID=AC,Number=2,Type=Integer,Description="# of reads supporting consensus indel/any indel at the site"> + ##INFO=<ID=DP,Number=1,Type=Integer,Description="total coverage at the site"> + ##INFO=<ID=MM,Number=2,Type=Float,Description="average # of mismatches per consensus indel-supporting read/per reference-supporting read"> + ##INFO=<ID=MQ,Number=2,Type=Float,Description="average mapping quality of consensus indel-supporting reads/reference-supporting reads"> + ##INFO=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads"> + ##INFO=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads"> + ##INFO=<ID=SC,Number=4,Type=Integer,Description="strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads"> + ##IndelGenotyperV2="" + 
##reference=hg18.fa + ##source=IndelGenotyperV2 + #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Miapaca_trimmed_sorted.bam + chr1 439 . AC A . PASS AC=5,5;DP=7;MM=7.0,3.0;MQ=23.4,1.0;NQSBQ=23.98,25.5;NQSMM=0.04,0.0;SC=2,3,0,2 GT 1/0 + chr1 714048 . T TCAAC . PASS AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.266666,21.932203;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT 0/1 + chr1 714049 . G GC . PASS AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.233334,21.83051;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT 0/1 + chr1 813675 . A AATAG . PASS AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=25.74,25.166666;NQSMM=0.0,0.033333335;SC=4,1,1,2 GT 0/1 + chr1 813687 . AGAGAGAGAGAAG A . PASS AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=24.54,25.2;NQSMM=0.02,0.06666667;SC=4,1,1,2 GT 1/0 + + +=back + +The code was written by Dr. Kai Wang and modified by Dr. Germán Gastón Leparc. +Various users have provided sample input files for many SNP calling software, for +the development of conversion subroutines. We thank these users for their +continued support to improve the functionality of the script. + +For questions or comments, please contact kai@openbioinformatics.org. + +=cut
# gfapts/inc/perlmod/ngsutil.pm
# Variant-call helper routines; both subs below are exported by default.
package ngsutil;
use Exporter;
our @ISA = qw[ Exporter ];
our @EXPORT = qw[ &explode_varcall &varscan ];
use strict;
use warnings FATAL => qw[ numeric uninitialized ];
use List::Util qw[ sum ];

# explode_varcall($POS, $REF, $ALT)
#
# Splits one variant call (position, reference allele string, alternate
# allele string) into one or more elementary events.
#
# Returns two array references:
#   \@POS - flat list of (start, end) pairs, one pair per event
#   \@VAR - flat list of (ref, alt) pairs, one pair per event; an empty
#           allele is written as '-'
#
# Two cases are handled:
#   * the shorter allele has length 1 (SNV, or an indel with one leading
#     base): the leading base is dropped for indels and a single event
#     is returned;
#   * both alleles are longer than one base: the leading base is dropped,
#     the alleles are compared position by position, and every run of
#     mismatches and every run of unpaired bases becomes its own event.
sub explode_varcall {
    my($POS, $REF, $ALT)=@_;
    my($START, $END)=($POS, $POS);
    my $N=0;                          # mismatch counter (second case only)
    my(@length, @range, @idx, @VAR, @POS);
    push @length, length($_) foreach ($REF, $ALT);
    @range=sort{ $a<=>$b } @length;   # (shorter length, longer length)
    if($range[0]==1){
        if($range[1]!=1){
            # Indel: strip the leading base; an emptied allele becomes '-'.
            foreach ($REF, $ALT){
                $_=substr($_, 1);
                $_=~s/^$/-/;
            }
            # REF was the longer allele: shift the start past the stripped
            # base and extend the end over the remaining reference bases.
            if($length[0]!=1){
                $END+=$length[0]-1;
                $START++;
            }
        }
        push @POS, $START, $END;
        push @VAR, $REF, $ALT;
    }else{
        my @N=();
        my($i, $VAR);                 # $VAR is the per-position mask string
        # After the leading base is stripped below, each value here is the
        # last valid index into the corresponding allele string.
        $_-=2 foreach (@length, @range);
        $_++ foreach ($START, $END);
        $_=substr($_, 1) foreach ($REF, $ALT);
        # Padding that makes up the length difference between the alleles.
        my $indel='-' x ($range[1]-$range[0]);
        # Left-aligned mask: '-' past the end of the shorter allele,
        # 0 for a mismatch, 1 for a match.
        $VAR.=($_>$range[0])?('-'):((substr($REF, $_, 1) ne substr($ALT, $_, 1))?0:1) for 0 .. $range[1];
        $N++ while $VAR =~ /0/g;
        if($length[0]<$length[1]){
            # REF is shorter: also build the right-aligned mask and use it
            # only when it has strictly fewer mismatches than the
            # left-aligned one.
            @VAR=($VAR);
            @N=($N);
            $N=0;
            undef($VAR);
            $VAR.=($_>$range[0])?('-'):((substr($REF, $length[0]-$_, 1) ne substr($ALT, $length[1]-$_, 1))?0:1) for reverse 0 .. $range[1];
            $N++ while $VAR =~ /0/g;
            if($N>=$N[0]){ $N=shift(@N); $VAR=shift(@VAR); }
            else{ $REF=$indel . $REF; }
        }else{ $ALT.=$indel; }
        # Record [offset, length] for every run of mismatches, then for
        # every run of '-' in the mask.
        foreach (qw[ 0 \- ]){
            push @idx, [ $-[0], $+[0]-$-[0] ] while ($VAR =~ /$_+/g);
        }
        @{$_}=() foreach (\@VAR, \@POS);
        foreach my $k (@idx){
            push @VAR, substr($_, ${$k}[0], ${$k}[1]) || '-' foreach ($REF, $ALT);
            push @POS, ${$k}[0], sum(@{$k})-1;
        }
        $_+=$START foreach @POS;      # mask offsets -> coordinates
        $_=~s/\-+/\-/ foreach @VAR;   # collapse padding runs to one '-'
        # When the ref allele of an event is '-', its end equals its start.
        for($i=0; $i<$#POS; $i+=2){ $POS[$i+1]=$POS[$i] if $VAR[$i] eq '-'; }
    }
    return(\@POS, \@VAR);
}

# varscan($kname, $fpath, $href)
#
# Reads the whitespace-delimited file $fpath and annotates the entries of
# %$href in place.
#
#   $kname - key under which the annotation is stored in each entry
#   $fpath - path of the file to scan; lines starting with '#' are skipped
#   $href  - hash ref keyed by "col1:col2:col3" of the file; each value is
#            a hash ref holding at least the keys 'ref' and 'alt'
#
# A line annotates an entry when its first three columns form an existing
# key and its 4th / 5th columns occur as literal substrings of the
# entry's 'ref' / 'alt' values. The remaining columns, joined with ':',
# are stored under $kname. Lines with fewer than five columns (including
# blank lines) are ignored.
#
# Dies if the file cannot be opened. Returns the result of close().
sub varscan {
    my($kname, $fpath, $href)=@_;
    # Three-argument open on a lexical handle: the path is never parsed
    # for a mode and no global filehandle is touched.
    open(my $in, '<', $fpath) or die "$fpath: $!";
    # A named line variable keeps the caller's $_ intact.
    while(my $line=<$in>){
        next if $line =~ /^#/;
        chomp $line;
        my @buffer=split /\s+/, $line;
        next if @buffer<5;            # cannot be compared; skip instead of dying
        my $k=join(':', @buffer[0..2]);
        next if !exists $$href{$k};
        # Literal substring test: allele tokens such as '*', '+' or '.'
        # from the file must not be interpreted as regex syntax.
        next if index($$href{$k}->{ref}, $buffer[3])<0;
        next if index($$href{$k}->{alt}, $buffer[4])<0;
        splice(@buffer, 0, 5);
        $$href{$k}->{$kname}=join(':', @buffer);
    }
    return close $in;
}
1;