comparison gfapts/inc/annovar/annotate_variation.pl @ 0:f753b30013e6 draft

Uploaded
author rdaveau
date Fri, 29 Jun 2012 10:20:55 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f753b30013e6
1 #!/usr/bin/perl
2 use warnings;
3 use strict;
4 use Pod::Usage;
5 use Getopt::Long;
6 use File::Spec;
7 use Cwd;
8
9 our $VERSION = '$Revision: 466 $';
10 our $LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';
11
12 our ($verbose, $help, $man);
13 our ($queryfile, $dbloc);
14 our ($outfile, $separate, $batchsize, $dbtype, $neargene, $genomebinsize, $geneanno, $regionanno, $filter, $downdb, $buildver, $score_threshold, $normscore_threshold, $minqueryfrac, $expandbin, $splicing_threshold,
15 $maf_threshold, $chromosome, $zerostart, $rawscore, $memfree, $memtotal, $sift_threshold, $gff3dbfile, $genericdbfile, $vcfdbfile, $time, $wget, $precedence,
16 $webfrom, $colsWanted, $comment, $scorecolumn, $transfun, $exonsort, $avcolumn, $bedfile);
17 our (%valichr, $dbtype1);
18 our (@precedence, @colsWanted, @avcolumn);
19 sub printerr; #declare a subroutine
20
21 our %codon1 = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
22 our %codon3 = (TTT=>"Phe", TTC=>"Phe", TCT=>"Ser", TCC=>"Ser", TAT=>"Tyr", TAC=>"Tyr", TGT=>"Cys", TGC=>"Cys", TTA=>"Leu", TCA=>"Ser", TAA=>"*", TGA=>"*", TTG=>"Leu", TCG=>"Ser", TAG=>"*", TGG=>"Trp", CTT=>"Leu", CTC=>"Leu", CCT=>"Pro", CCC=>"Pro", CAT=>"His", CAC=>"His", CGT=>"Arg", CGC=>"Arg", CTA=>"Leu", CTG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", ATT=>"Ile", ATC=>"Ile", ACT=>"Thr", ACC=>"Thr", AAT=>"Asn", AAC=>"Asn", AGT=>"Ser", AGC=>"Ser", ATA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", ATG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GTT=>"Val", GTC=>"Val", GCT=>"Ala", GCC=>"Ala", GAT=>"Asp", GAC=>"Asp", GGT=>"Gly", GGC=>"Gly", GTA=>"Val", GTG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly");
23 our %codonfull = (TTT=>"Phenylalanine", TTC=>"Phenylalanine", TCT=>"Serine", TCC=>"Serine", TAT=>"Tyrosine", TAC=>"Tyrosine", TGT=>"Cysteine", TGC=>"Cysteine", TTA=>"Leucine", TCA=>"Serine", TAA=>"Stop", TGA=>"Stop", TTG=>"Leucine", TCG=>"Serine", TAG=>"Stop", TGG=>"Tryptophan", CTT=>"Leucine", CTC=>"Leucine", CCT=>"Proline", CCC=>"Proline", CAT=>"Histidine", CAC=>"Histidine", CGT=>"Arginine", CGC=>"Arginine", CTA=>"Leucine", CTG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", ATT=>"Isoleucine", ATC=>"Isoleucine", ACT=>"Threonine", ACC=>"Threonine", AAT=>"Asparagine", AAC=>"Asparagine", AGT=>"Serine", AGC=>"Serine", ATA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", ATG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GTT=>"Valine", GTC=>"Valine", GCT=>"Alanine", GCC=>"Alanine", GAT=>"Aspartic acid", GAC=>"Aspartic acid", GGT=>"Glycine", GGC=>"Glycine", GTA=>"Valine", GTG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine");
24 our %codonr1 = (UUU=>"F", UUC=>"F", UCU=>"S", UCC=>"S", UAU=>"Y", UAC=>"Y", UGU=>"C", UGC=>"C", UUA=>"L", UCA=>"S", UAA=>"*", UGA=>"*", UUG=>"L", UCG=>"S", UAG=>"*", UGG=>"W", CUU=>"L", CUC=>"L", CCU=>"P", CCC=>"P", CAU=>"H", CAC=>"H", CGU=>"R", CGC=>"R", CUA=>"L", CUG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", AUU=>"I", AUC=>"I", ACU=>"T", ACC=>"T", AAU=>"N", AAC=>"N", AGU=>"S", AGC=>"S", AUA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", AUG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GUU=>"V", GUC=>"V", GCU=>"A", GCC=>"A", GAU=>"D", GAC=>"D", GGU=>"G", GGC=>"G", GUA=>"V", GUG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
25 our %codonr3 = (UUU=>"Phe", UUC=>"Phe", UCU=>"Ser", UCC=>"Ser", UAU=>"Tyr", UAC=>"Tyr", UGU=>"Cys", UGC=>"Cys", UUA=>"Leu", UCA=>"Ser", UAA=>"*", UGA=>"*", UUG=>"Leu", UCG=>"Ser", UAG=>"*", UGG=>"Trp", CUU=>"Leu", CUC=>"Leu", CCU=>"Pro", CCC=>"Pro", CAU=>"His", CAC=>"His", CGU=>"Arg", CGC=>"Arg", CUA=>"Leu", CUG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", AUU=>"Ile", AUC=>"Ile", ACU=>"Thr", ACC=>"Thr", AAU=>"Asn", AAC=>"Asn", AGU=>"Ser", AGC=>"Ser", AUA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", AUG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GUU=>"Val", GUC=>"Val", GCU=>"Ala", GCC=>"Ala", GAU=>"Asp", GAC=>"Asp", GGU=>"Gly", GGC=>"Gly", GUA=>"Val", GUG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly");
26 our %codonrfull = (UUU=>"Phenylalanine", UUC=>"Phenylalanine", UCU=>"Serine", UCC=>"Serine", UAU=>"Tyrosine", UAC=>"Tyrosine", UGU=>"Cysteine", UGC=>"Cysteine", UUA=>"Leucine", UCA=>"Serine", UAA=>"Stop", UGA=>"Stop", UUG=>"Leucine", UCG=>"Serine", UAG=>"Stop", UGG=>"Tryptophan", CUU=>"Leucine", CUC=>"Leucine", CCU=>"Proline", CCC=>"Proline", CAU=>"Histidine", CAC=>"Histidine", CGU=>"Arginine", CGC=>"Arginine", CUA=>"Leucine", CUG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", AUU=>"Isoleucine", AUC=>"Isoleucine", ACU=>"Threonine", ACC=>"Threonine", AAU=>"Asparagine", AAC=>"Asparagine", AGU=>"Serine", AGC=>"Serine", AUA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", AUG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GUU=>"Valine", GUC=>"Valine", GCU=>"Alanine", GCC=>"Alanine", GAU=>"Aspartic acid", GAC=>"Aspartic acid", GGU=>"Glycine", GGC=>"Glycine", GUA=>"Valine", GUG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine");
27 our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-');
28
29 processArguments (); #process program arguments, set up default values, check for errors, check for existence of db files
30 if ($geneanno) {
31 annotateQueryByGene (); #generate gene-based annoations (classify variants into intergenic, introgenic, non-synonymous, synonymous, UTR, frameshift, etc)
32 } elsif ($regionanno) {
33 annotateQueryByRegion (); #generate region-based annotations (most conserved elements, transcription factor binding sites, etc)
34 } elsif ($filter) {
35 filterQuery (); #generate filter-based annotations (identify variants not reported in variation databases)
36 } elsif ($downdb) {
37 downloadDB (); #download annotation databases from Internet
38 }
39
40 sub processArguments {
41 my @command_line = @ARGV; #command line argument
42 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'outfile=s'=>\$outfile, 'separate'=>\$separate,
43 'batchsize=s'=>\$batchsize, 'dbtype=s'=>\$dbtype, 'neargene=i'=>\$neargene, 'genomebinsize=s'=>\$genomebinsize,
44 'geneanno'=>\$geneanno, 'regionanno'=>\$regionanno, , 'filter'=>\$filter, 'downdb'=>\$downdb, 'buildver=s'=>\$buildver, 'score_threshold=f'=>\$score_threshold,
45 'normscore_threshold=i'=>\$normscore_threshold, 'minqueryfrac=f'=>\$minqueryfrac, 'expandbin=i'=>\$expandbin, 'splicing_threshold=i'=>\$splicing_threshold,
46 'maf_threshold=f'=>\$maf_threshold, 'chromosome=s'=>\$chromosome, 'zerostart'=>\$zerostart, 'rawscore'=>\$rawscore, 'memfree=i'=>\$memfree,
47 'memtotal=i'=>\$memtotal, 'sift_threshold=f'=>\$sift_threshold, 'gff3dbfile=s'=>\$gff3dbfile, 'genericdbfile=s'=>\$genericdbfile, 'vcfdbfile=s'=>\$vcfdbfile,
48 'time'=>\$time, 'wget!'=>\$wget, 'precedence=s'=>\$precedence, 'webfrom=s'=>\$webfrom, 'colsWanted=s'=>\$colsWanted, 'comment'=>\$comment,
49 'scorecolumn=i'=>\$scorecolumn, 'transcript_function'=>\$transfun, 'exonsort'=>\$exonsort, 'avcolumn=s'=>\$avcolumn, 'bedfile=s'=>\$bedfile) or pod2usage ();
50
51 $help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
52 $man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
53 @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
54 @ARGV == 2 or pod2usage ("Syntax error");
55
56 ($queryfile, $dbloc) = @ARGV;
57
58 $dbloc =~ s/[\\\/]$//; #delete the trailing / or \ sign as part of the directory name
59 if (defined $batchsize) {
60 $batchsize =~ s/k$/000/;
61 $batchsize =~ s/m$/000000/;
62 $batchsize =~ m/^\d+$/ or pod2usage ("Error: the --batchsize argument must be a positive integer (suffix of k or m is okay)");
63 } else {
64 $batchsize = 5_000_000;
65 }
66 if (defined $genomebinsize) {
67 $genomebinsize =~ s/k$/000/;
68 $genomebinsize =~ s/m$/000000/;
69 $genomebinsize =~ m/^\d+$/ or pod2usage ("Error: the --genomebinsize argument must be a positive integer (suffix of k or m is okay)");
70 $genomebinsize > 1000 or pod2suage ("Error: the --genomebinsize argument must be larger than 1000");
71 } else {
72 if ($geneanno) {
73 $genomebinsize = 100_000; #gene usually span large genomic regions
74 } else {
75 $genomebinsize = 10_000; #MCE, TFBS, miRNA, etc are small genomic regions
76 }
77 }
78
79 $verbose ||= 0; #when it is not specified, it is zero
80 $neargene ||= 1_000; #for upstream/downstream annotation of variants, specify the distance threshold between variants and genes
81 $expandbin ||= int(2_000_000/$genomebinsize); #for gene-based annotations, when intergenic variants are found, expand to specified number of nearby bins to find closest genes
82 $outfile ||= $queryfile; #specify the prefix of output file names
83
84 #set up log file
85 if ($downdb) {
86 if (not -d $dbloc) {
87 mkdir ($dbloc) or die "Error: the directory $dbloc does not exist and cannot be created\n";
88 }
89 my $errfile = File::Spec->catfile ($dbloc, "annovar_downdb.log");
90 open (LOG, ">$errfile") or die "Error: cannot write LOG information to log file $errfile: $!\n";
91 } else {
92 open (LOG, ">$outfile.log") or die "Error: cannot write LOG information to log file $outfile.log: $!\n";
93 }
94 print LOG "ANNOVAR Version:\n\t", q/$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $/, "\n";
95 print LOG "ANNOVAR Information:\n\tFor questions, comments, documentation, bug reports and program update, please visit http://www.openbioinformatics.org/annovar/\n";
96 print LOG "ANNOVAR Command:\n\t$0 @command_line\n";
97 print LOG "ANNOVAR Started:\n\t", scalar (localtime), "\n";
98
99 my $num = 0;
100 $geneanno and $num++;
101 $downdb and $num++;
102 $filter and $num++;
103 $regionanno and $num++;
104 $num <= 1 or pod2usage ("Error in argument: please specify only one of --geneanno, -regionanno, --downdb, --filter");
105 if (not $num) {
106 $geneanno++;
107 printerr "NOTICE: The --geneanno operation is set to ON by default\n";
108 }
109
110 my %dbtype1 = ('gene'=>'refGene', 'refgene'=>'refGene', 'knowngene'=>'knownGene', 'ensgene'=>'ensGene', 'band'=>'cytoBand', 'cytoband'=>'cytoBand', 'tfbs'=>'tfbsConsSites', 'mirna'=>'wgRna',
111 'mirnatarget'=>'targetScanS', 'segdup'=>'genomicSuperDups', 'omimgene'=>'omimGene', 'gwascatalog'=>'gwasCatalog',
112 '1000g_ceu'=>'CEU.sites.2009_04', '1000g_yri'=>'YRI.sites.2009_04', '1000g_jptchb'=>'JPTCHB.sites.2009_04',
113 '1000g2010_ceu'=>'CEU.sites.2010_03', '1000g2010_yri'=>'YRI.sites.2010_03', '1000g2010_jptchb'=>'JPTCHB.sites.2010_03',
114 '1000g2010jul_ceu'=>'CEU.sites.2010_07', '1000g2010jul_yri'=>'YRI.sites.2010_07', '1000g2010jul_jptchb'=>'JPTCHB.sites.2010_07',
115 '1000g2010nov_all'=>'ALL.sites.2010_11',
116 );
117
118 if ($geneanno) {
119 $dbtype ||= 'refGene';
120 $dbtype1 = $dbtype1{$dbtype} || $dbtype;
121 #$dbtype1 =~ m/^(refGene|knownGene|ensGene)$/ or pod2usage ("Error: the gene-based annotation procedure currently only support -dbtype of refGene, knownGene and ensGene"); #commented 2011feb18
122 } elsif ($regionanno) {
123 defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --regionanno operation)");
124 $dbtype1 = $dbtype1{$dbtype} || $dbtype;
125 if ($dbtype =~ m/^mce(\d+)way/) { #added 2010Feb16
126 $dbtype1 = "phastConsElements$1way";
127 }
128 if ($dbtype1 eq 'gff3') {
129 defined $gff3dbfile or pod2usage ("Error in argument: please specify --gff3dbfile for the --dbtype of 'gff3'");
130 }
131 } elsif ($filter) {
132 defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --filter operation)");
133 $dbtype =~ m/^avsift|generic|1000g_(ceu|yri|jptchb)|1000g2010_(ceu|yri|jptchb)|1000g20\d\d[a-z]{3}_[a-z]+|snp\d+|vcf|(ljb_\w+)$/ or pod2usage ("Error in argument: the specified --dbtype $dbtype is not valid for --filter operation (valid ones are '1000g_ceu', '1000g2010_yri', 'snp129', 'avsift', 'vcf', 'generic', etc)");
134 $dbtype1 = $dbtype1{$dbtype} || $dbtype;
135 if ($dbtype1 eq 'generic') {
136 defined $genericdbfile or pod2usage ("Error in argument: please specify --genericdbfile for the --dbtype of 'generic'");
137 }
138 if ($dbtype eq 'vcf') {
139 defined $vcfdbfile or pod2usage ("Error in argument: please specify --vcfdbfile for the --dbtype of 'vcf'");
140 }
141 } elsif ($downdb) {
142 defined $dbtype and pod2usage ("Error in argument: please do not specify --dbtype for the --downdb operation");
143 $dbtype1 = $dbtype1{$queryfile} || $queryfile;
144 }
145
146 if (not $buildver) {
147 $buildver = 'hg18';
148 printerr "NOTICE: The --buildver is set as 'hg18' by default\n";
149 }
150
151 if ($score_threshold) {
152 $score_threshold > 0 or pod2usage ("Error in argument: the --score_threshold must be a positive number (you specified $score_threshold)");
153 $geneanno || $downdb and pod2usage ("Error in argument: the --score_threshold is not useful for --geneanno or --downdb operations");
154 }
155 if ($normscore_threshold) {
156 $normscore_threshold <= 1000 or pod2usage ("Error in argument: the --normscore_threshold must be between 0 and 1000 (you specified $normscore_threshold)");
157 $regionanno or pod2usage ("Error in argument: the --score_threshold is supported only for the --regionanno operation");
158 }
159
160
161 if (defined $sift_threshold) {
162 $filter or pod2usage ("Error in argument: the --sift_threshold is supported only for the --filter operation");
163 $dbtype1 eq 'avsift' or pod2usage ("Error in argument: the --sift_threshold argument can be used only if '--dbtype avsift' is used");
164 $sift_threshold >= 0 and $sift_threshold <= 1 or pod2usage ("Error in argument: the --sift_threshold must be between 0 and 1 inclusive");
165 } else {
166 $sift_threshold = 0.05;
167 }
168
169 #operation-specific argument
170 if (defined $splicing_threshold) {
171 $geneanno or pod2usage ("Error in argument: the --splicing_threshold is supported only for the --geneanno operation");
172 } else {
173 $splicing_threshold = 2; #for splicing annotation, specify the distance threshold between variants and exon/intron boundaries
174 }
175 if (defined $maf_threshold) {
176 $filter or pod2usage ("Error in argument: the --maf_threshold is supported only for the --filter operation");
177 } else {
178 $maf_threshold = 0; #for filter-based annotations on 1000 Genomes Project data, specify the MAF threshold to be used in filtering
179 }
180 if (defined $minqueryfrac) {
181 $regionanno or pod2usage ("Error in argument: the --minqueryfrac is supported only for the --regionanno operation");
182 } else {
183 $minqueryfrac = 0; #minimum query overlap to declare a "match" with database records
184 }
185 if (defined $gff3dbfile) {
186 $dbtype eq 'gff3' or pod2usage ("Error in argument: the --gff3dbfile argument can be used only if '--dbtype gff3' is used");
187 $geneanno or $regionanno or pod2usage ("Error in argument: the --gff3dbfile argument is supported only for the --geneanno or --regionanno operation");
188 }
189 if (defined $bedfile) {
190 $dbtype eq 'bed' or pod2usage ("Error in argument: the --bedfile argument can be used only if '--dbtype bed' is used");
191 $regionanno or pod2usage ("Error in argument: the --bedfile argument is supported only for the --regionanno operation");
192 }
193 if (defined $genericdbfile) {
194 $filter or pod2usage ("Error in argument: the --genericdbfile argument is supported only for the --filter operation");
195 }
196 if (defined $wget) {
197 $downdb or pod2usage ("Error in argument: the --wget argument is supported only for the --downdb operation");
198 } else {
199 $wget = 1; #by default, use wget for downloading files from Internet
200 }
201 if (defined $precedence) {
202 $geneanno or pod2usage ("Error in argument: the --precedence argument is supported only for the --geneanno operation");
203 @precedence = split (/,/, $precedence);
204 @precedence >= 2 or pod2usage ("Error in argument: the --precedence argument should be comma delimited");
205 for my $i (0 .. @precedence-1) {
206 $precedence[$i] =~ m/^(exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing|ncrna)$/ or pod2usage ("Error in argument: the --precedence argument contains invalid keywords (valid ones are exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing)");
207 }
208 }
209
210 if (defined $colsWanted) {
211 $regionanno or pod2usage ("Error in argument: the --colWanted argument is supported only for the --geneanno operation");
212 if (lc $colsWanted eq 'all') {
213 @colsWanted = ('all');
214 } elsif (lc $colsWanted eq 'none') {
215 @colsWanted = ('none');
216 } else {
217 @colsWanted = split (/,/, $colsWanted);
218 for my $i (0 .. @colsWanted-1) {
219 $colsWanted[$i]=~m/^\d+$/ or pod2usage ("Error in argument: the --colsWanted argument ($colsWanted) must be a list of comma delimited numbers or be 'all' or be 'none'");
220 }
221 }
222 }
223
224 if (defined $scorecolumn) {
225 $regionanno or pod2usage ("Error in argument: the --scorecolumn argument is supported only for the --regionanno operation");
226 }
227
228 if ($exonsort) {
229 $geneanno or pod2usage ("Error in argument: the --exonsort argument is supported only for the --geneanno operation");
230 }
231
232 if (defined $avcolumn) {
233 $avcolumn =~ m/^\d+,\d+,\d+,\d+,\d+$/ or pod2usage ("Error in argument: the --avcolumn argument must be five integer numbers separated by comma");
234 @avcolumn = split (/,/, $avcolumn);
235 @avcolumn = map {$_-1} @avcolumn;
236 } else {
237 @avcolumn = (0..4); #by default, the first five columns are the required AVINPUT information
238 }
239
240 if (defined $webfrom) {
241 if ($webfrom ne 'ucsc' and $webfrom ne 'annovar') {
242 $webfrom =~ m#^(http://|ftp://)# or pod2usage ("Error: the --webfrom argument needs to be 'ucsc', 'annovar', or a URL");
243 }
244 }
245
246 $maf_threshold >= 0 and $maf_threshold <= 0.5 or pod2usage ("Error in argument: the --maf_threshold must be between 0 and 0.5 (you specified $maf_threshold)");
247 $minqueryfrac >= 0 and $minqueryfrac <= 1 or pod2usage ("Error in argument: the --minqueryfrac must be between 0 and 1 (you specified $minqueryfrac)");
248 $memfree and $memfree >= 100_000 || pod2usage ("Error in argument: the --memfree argument must be at least 100000 (in the order of kilobytes)");
249 $memtotal and $memtotal >= 100_000 || pod2usage ("Error in argument: the --memtotal argument must be at least 100000 (in the order of kilobytes)");
250
251 if ($chromosome) {
252 my @chr = split (/,/, $chromosome);
253 for my $i (0 .. @chr-1) {
254 if ($chr[$i] =~ m/^(\d+)-(\d+)$/) {
255 for my $j ($1 .. $2) {
256 $valichr{$j}++;
257 }
258 } else {
259 $valichr{$chr[$i]}++;
260 }
261 }
262 printerr "NOTICE: These chromosomes in database will be examined: ", join (",", sort keys %valichr), "\n";
263 }
264 }
265
266
267 sub annotateQueryByGene {
268 my ($queryfh); #query file handle
269 my ($totalquerycount, $totalinvalidcount, $batchcount) = qw/0 0 1/;
270 open ($queryfh, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n";
271
272 open (OUT, ">$outfile.variant_function") or die "Error: cannot write to output file $outfile.variant_function: $!\n";
273 open (EXONIC, ">$outfile.exonic_variant_function") or die "Error: cannot write to output file $outfile.exonic_variant_function: $!\n";
274 open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n";
275
276 my ($genedb, $geneidmap, $cdslen, $mrnalen) = readUCSCGeneAnnotation ($dbloc);
277
278 $time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n";
279 while (1) {
280 my ($linecount, $invalidcount) = newprocessNextQueryBatchByGene ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen);
281 $totalquerycount += $linecount;
282 $totalinvalidcount += $invalidcount;
283 $linecount == $batchsize or last;
284 $batchcount++;
285 printerr "NOTICE: Begin processing batch $batchcount (each batch contains $batchsize variants)\n";
286 }
287 close (INVALID);
288 close (EXONIC);
289 close (OUT);
290 close ($queryfh);
291 $time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n";
292
293 $totalinvalidcount or unlink ("$outfile.invalid_input"); #delete the file as it is empty
294 printerr "NOTICE: Finished gene-based annotation on $totalquerycount genetic variants in $queryfile";
295 $totalinvalidcount and printerr " (including $totalinvalidcount with invalid format written to $outfile.invalid_input)";
296 printerr "\n";
297 printerr "NOTICE: Output files were written to $outfile.variant_function, $outfile.exonic_variant_function\n";
298 }
299
300 sub newprocessNextQueryBatchByGene {
301 my ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen) = @_;
302 my (%refseqvar);
303
304 my ($chr, $start, $end, $ref, $obs);
305 my ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2);
306 my ($invalid);
307 my ($linecount, $invalidcount) = qw/0 0/;
308
309 for my $i (1 .. $batchsize) { #process up to batchsize variants
310 my $nextline = <$queryfh>; #read the next line in variant file
311 defined $nextline or last;
312 $nextline =~ s/[\r\n]+$//;
313
314 if ($nextline =~ m/^#/ and $comment) { #comment line start with #, do not include this is $linecount
315 print OUT "#comment\t$nextline\n";
316 next;
317 }
318
319 $linecount++; #linecount does not include the comment line
320 $invalid = 0;
321
322 my @nextline = split (/\s+/, $nextline);
323 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
324 if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
325 $invalid++;
326 } else {
327 ($ref, $obs) = (uc $ref, uc $obs);
328 $zerostart and $start++;
329 $chr =~ s/^chr//;
330 if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
331 $invalid++;
332 } elsif ($ref eq '-' and $obs eq '-' #both are empty allele
333 or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code
334 or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code
335 or $start =~ m/[^\d]/ #start is not a number
336 or $end =~ m/[^\d]/ #end is not a number
337 or $start > $end #start is more than end
338 or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref
339 or $ref eq '-' and $start != $end #length mismatch for insertion
340 ) {
341 $invalid++;
342 }
343 }
344
345
346
347 if ($invalid) {
348 print INVALID $nextline, "\n"; #invalid record found
349 $invalidcount++;
350 next;
351 }
352
353 my (%intronic, %utr5, %utr3, %exonic, %upstream, %downstream, %ncrna, %intergenic, %splicing);
354 my $foundgenic; #variant found in genic region (between start and end position of a gene in genome)
355 my ($distl, $distr, $genel, $gener); #for intergenic variant, the distance and gene name to the left and right side of gene
356 my $bin1 = int ($start/$genomebinsize)-1; #start bin
357 $bin1 < 0 and $bin1=0;
358 my $bin2 = int ($end/$genomebinsize)+1; #end bin (usually same as start bin, unless the query is really big that spans multiple megabases)
359
360 while (not exists $genedb->{$chr, $bin1} and $bin1 > int ($start/$genomebinsize)-$expandbin) { #examine at least 5 bins (by default 5Mb) to the left to make sure that a gene is found in the bin
361 $bin1 > 0 or last;
362 $bin1--;
363 }
364
365 while (not exists $genedb->{$chr, $bin2} and $bin2 < int ($end/$genomebinsize)+$expandbin) { #examine at least 5 bins (by default 5Mb) to the right to make sure that a gene is found in the bin
366 $bin2++;
367 }
368
369 my (%seen);
370 for my $nextbin ($bin1 .. $bin2) {
371 exists $genedb->{$chr, $nextbin} or next; #this genome bin has no annotated gene (a complete intergenic region)
372 for my $nextgene (@{$genedb->{$chr, $nextbin}}) { #when $genedb->{$chr, $nextbin} is undefined, this automatically create an array!!!
373 ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2) = @$nextgene;
374 defined $name2 or printerr "WARNING: name2 field is not provided for transcript $name (start=$txstart end=$txend)\n" and $name2='';
375 $seen{$name, $txstart} and next; #name and txstart uniquely identify a transcript and chromosome position (sometimes same transcript may map to two nearby positions, such as nearby segmental duplications)
376 $seen{$name, $txstart}++; #a transcript may be in two adjacent bins, so if one is already scanned, there is no need to work on it again
377
378 if ($transfun) { #variant_function output contains transcript name, rather than gene name
379 $name2 = $name;
380 }
381
382 if (not $foundgenic) { #this variant has not hit a genic region yet
383 if ($start > $txend) {
384 defined $distl or $distl = $start-$txend and $genel=$name2;
385 $distl > $start-$txend and $distl = $start-$txend and $genel=$name2; #identify left closest gene
386 }
387
388 if ($end < $txstart) {
389 defined $distr or $distr = $txstart-$end and $gener=$name2;
390 $distr > $txstart-$end and $distr = $txstart-$end and $gener=$name2; #identify right closest gene
391 }
392 }
393
394 if ($end < $txstart) {
395 #query ---
396 #gene <-*----*->
397 $foundgenic and last; #if found a genic annotation already, end the search of the bins
398 if ($end > $txstart - $neargene) {
399 if ($dbstrand eq '+') {
400 $upstream{$name2}++;
401 } else {
402 $downstream{$name2}++;
403 }
404 } else {
405 last; #if transcript is too far away from end, end the search of the bins
406 }
407 } elsif ($start > $txend) {
408 #query ---
409 #gene <-*----*->
410 if (not $foundgenic and $start < $txend + $neargene) {
411 if ($dbstrand eq '+') {
412 $downstream{$name2}++;
413 } else {
414 $upstream{$name2}++;
415 }
416 }
417 } elsif ($cdsstart == $cdsend+1) { #non-coding RNA (could be microRNA, or could be due to lack of CDS annotation for mRNA such as NR_026730 or BC039000). Previously we already did cdsstart++ so here the cdsstart is more than cdsend
418 if ($start >= $txstart and $start <= $txend or $end >= $txstart and $end <= $txend or $start <= $txstart and $end >= $txend) {
419 $ncrna{$name2}++;
420 $foundgenic++;
421 }
422 } else { #query overlaps with coding region of gene
423 my ($lenintron) = (0); #cumulative intron length at a given exon
424 my ($rcdsstart, $rvarstart, $rvarend); #start of coding and variant in reference mRNA sequence
425 my @exonstart = @$exonstart;
426 my @exonend = @$exonend;
427 my $foundexonic;
428 if ($dbstrand eq '+') { #forward strand, search from left to right (first exon to last exon)
429 for my $k (0 .. @exonstart-1) {
430 $k and $lenintron += ($exonstart[$k]-$exonend[$k-1]-1); #calculate cumulative intron length
431 if ($cdsstart >= $exonstart[$k]) { #calculate CDS start accurately by considering intron length
432 $rcdsstart = $cdsstart-$txstart-$lenintron+1;
433 }
434
435 #splicing calculation
436 if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
437 $splicing{$name2}++; #when query start site is close to exon start or exon end
438 }
439 if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) {
440 $splicing{$name2}++; #when query end site is close to exon start or exon end
441 }
442 if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) {
443 $splicing{$name2}++; #when query encompass the exon/intron boundary
444 }
445
446 if ($start < $exonstart[$k]) {
447 if ($end >= $exonstart[$k]) { #exonic
448 $rvarstart = $exonstart[$k]-$txstart-$lenintron+1;
449
450 for my $m ($k .. @exonstart-1) {
451 $m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1);
452 if ($end < $exonstart[$m]) {
453 #query --------
454 #gene <--**---******---****---->
455 $rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1);
456 last;
457 } elsif ($end <= $exonend[$m]) {
458 #query -----------
459 #gene <--**---******---****---->
460 $rvarend = $end-$txstart-$lenintron+1;
461 last;
462 }
463 }
464 if (not defined $rvarend) {
465 $rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion
466 }
467
468 #here the trick begins to differentiate UTR versus coding exonic
469 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
470 #query ----
471 #gene <--*---*->
472 $utr5{$name2}++; #positive strand for UTR5
473 } elsif ($start > $cdsend) {
474 #query ----
475 #gene <--*---*->
476 $utr3{$name2}++; #positive strand for UTR3
477 } else {
478 $exonic{$name2}++;
479 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline]; #refseq CDS start, refseq variant start
480 }
481 $foundgenic++;
482 last;
483 } elsif ($k and $start > $exonend[$k-1]) { #intronic
484 $intronic{$name2}++;
485 $foundgenic++;
486 last;
487 }
488 } elsif ($start <= $exonend[$k]) { #exonic
489 $rvarstart = $start-$txstart-$lenintron+1;
490
491 for my $m ($k .. @exonstart-1) {
492 $m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1);
493 if ($end < $exonstart[$m]) {
494 #query ------
495 #gene <--**---******---****---->
496 $rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1);
497 last;
498 } elsif ($end <= $exonend[$m]) {
499 #query -----------
500 #gene <--**---******---****---->
501 $rvarend = $end-$txstart-$lenintron+1;
502 last;
503 }
504 }
505 if (not defined $rvarend) {
506 $rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion
507 }
508
509 #here is the trick begins to differentiate UTR versus coding exonic
510 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
511 #query ----
512 #gene <--*---*->
513 $utr5{$name2}++; #positive strand for UTR5
514 } elsif ($start > $cdsend) {
515 #query ----
516 #gene <--*---*->
517 $utr3{$name2}++; #positive strand for UTR3
518 } else {
519 $exonic{$name2}++;
520 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline]; #queryindex, refseq CDS start, refseq variant start
521 }
522 $foundgenic++;
523 last;
524 }
525 }
526 } elsif ($dbstrand eq '-') { #process negative strand (in the future, this should be fused to the paragraph above for positive strands; for now, I keep them separate for easier debugging)
527 for (my $k = @exonstart-1; $k>=0; $k--) {
528 $k < @exonstart-1 and $lenintron += ($exonstart[$k+1]-$exonend[$k]-1);
529 if ($cdsend <= $exonend[$k]) { #calculate CDS start accurately by considering intron length
530 $rcdsstart = $txend-$cdsend-$lenintron+1;
531 }
532
533 #splicing calculation
534 if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
535 $splicing{$name2}++;
536 }
537 if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) {
538 $splicing{$name2}++;
539 }
540 if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) {
541 $splicing{$name2}++;
542 }
543
544 if ($end > $exonend[$k]) {
545 if ($start <= $exonend[$k]) {
546 $rvarstart = $txend-$exonend[$k]-$lenintron+1;
547
548 for (my $m = $k; $m >= 0; $m--) {
549 $m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1);
550 if ($start > $exonend[$m]) {
551 #query --------
552 #gene <--**---******---****---->
553 #$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1); #commented out 2011feb18
554 $rvarend = $txend-$exonstart[$m+1]+1-$lenintron + ($exonstart[$m+1]-$exonend[$m]-1); #fixed this 2011feb18
555 last; #finsih the cycle!!!!!!!!!!!!!!!!!!!
556 } elsif ($start >= $exonstart[$m]) { #start within exons
557 #query ----
558 #gene <--**---******---****---->
559 $rvarend = $txend-$start-$lenintron+1;
560 last;
561 }
562 }
563 if (not defined $rvarend) { #if rvarend is not found, then the whole tail of gene is covered
564 $rvarend = $txend-$txstart-$lenintron+1;
565 }
566
567 #here is the trick begins to differentiate UTR versus coding exonic
568 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
569 #query ----
570 #gene <--*---*->
571 $utr3{$name2}++; #negative strand for UTR5
572 } elsif ($start > $cdsend) {
573 #query ----
574 #gene <--*---*->
575 $utr5{$name2}++; #negative strand for UTR3
576 } else {
577 $exonic{$name2}++;
578 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline];
579 }
580 $foundgenic++;
581 last;
582 } elsif ($k < @exonstart-1 and $end < $exonstart[$k+1]) {
583 $intronic{$name2}++;
584 $foundgenic++;
585 last;
586 }
587 } elsif ($end >= $exonstart[$k]) {
588 $rvarstart = $txend-$end-$lenintron+1; #all the rvarstart, rvarend are with respect to the cDNA sequence (so rvarstart corresponds to end of variants)
589
590 for (my $m = $k; $m >= 0; $m--) {
591 $m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1);
592 if ($start > $exonend[$m]) {
593 #query ----
594 #gene <--**---******---****---->
595 #$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1); #commented out 2011feb18 due to bug (10 42244567 42244600 CACCTTTGCTTGATATGATAATATAGTGCCAAGG - hetero)
596 $rvarend = $txend-$exonstart[$m+1]+1 - $lenintron + ($exonstart[$m+1]-$exonend[$m]-1); #fixed this 2011feb18
597 last; #finish the circle of counting exons!!!!!
598 } elsif ($start >= $exonstart[$m]) { #the start is right located within exon
599 #query -------
600 #gene <--**---******---****---->
601 $rvarend = $txend-$start-$lenintron+1;
602 last; #finish the cycle
603 }
604 }
605 if (not defined $rvarend) { #if rvarend is not found, then the whole tail of gene is covered
606 $rvarend = $txend-$txstart-$lenintron+1;
607 }
608
609 #here the trick begins to differentiate UTR versus coding exonic
610 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
611 #query ----
612 #gene <--*---*->
613 $utr3{$name2}++; #negative strand for UTR5
614 } elsif ($start > $cdsend) {
615 #query ----
616 #gene <--*---*->
617 $utr5{$name2}++; #negative strand for UTR3
618 } else {
619 $exonic{$name2}++;
620 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline];
621 }
622 $foundgenic++;
623 last;
624 }
625 }
626 }
627 }
628 }
629 }
630 $foundgenic or $intergenic{$name2}++;
631 $i =~ m/000000$/ and printerr "NOTICE: Finished analyzing $i query variants\n";
632
633
634 my (@txname, %genename);
635
636 if ($separate) { #separately print out each effect on one line
637 if (%exonic or %splicing or %intronic or %utr5 or %utr3 or %ncrna or %upstream or %downstream) {
638 %exonic and print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n";
639 %splicing and $end-$start+1<=$splicing_threshold and print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n";
640 %intronic and print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n";
641 %utr5 and print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n";
642 %utr3 and print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n";
643 %ncrna and print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n";
644 %upstream and print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n";
645 %downstream and print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n";
646 } elsif (%intergenic) {
647 $genel ||= "NONE";
648 $gener ||= "NONE";
649 $distl ||= "NONE";
650 $distr ||= "NONE";
651 print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n";
652 } else {
653 die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n";
654 }
655 } else {
656 if (@precedence) {
657 my $foundmatch;
658 for my $i (0 .. @precedence-2) {
659 $precedence[$i] eq 'exonic' and %exonic and $foundmatch++;
660 $precedence[$i] eq 'splicing' and %splicing and $foundmatch++;
661 $precedence[$i] eq 'intronic' and %intronic and $foundmatch++;
662 $precedence[$i] eq 'utr5' and %utr5 and $foundmatch++;
663 $precedence[$i] eq 'utr3' and %utr3 and $foundmatch++;
664 $precedence[$i] eq 'ncrna' and %ncrna and $foundmatch++;
665 $precedence[$i] eq 'upstream' and %upstream and $foundmatch++;
666 $precedence[$i] eq 'downstream' and %downstream and $foundmatch++;
667 $precedence[$i] eq 'intergenic' and %intergenic and $foundmatch++;
668 if ($foundmatch) {
669 for my $j ($i+1 .. @precedence-1) {
670 $precedence[$j] eq 'exonic' and %exonic = ();
671 $precedence[$j] eq 'splicing' and %splicing = ();
672 $precedence[$j] eq 'intronic' and %intronic = ();
673 $precedence[$j] eq 'utr5' and %utr5 = ();
674 $precedence[$j] eq 'utr3' and %utr3 = ();
675 $precedence[$j] eq 'ncrna' and %ncrna = ();
676 $precedence[$j] eq 'upstream' and %upstream = ();
677 $precedence[$j] eq 'downstream' and %downstream = ();
678 $precedence[$j] eq 'intergenic' and %intergenic = ();
679 }
680 last;
681 }
682 }
683 }
684
685
686 if (%exonic) {
687 if (%splicing and $end-$start+1<=$splicing_threshold) { #a big deletion spanning splicing site is not really a "splicing" mutation
688 print OUT "exonic;splicing\t", join(",", sort keys %exonic), ";", join (",", sort keys %splicing), "\t", $nextline, "\n";
689 } else {
690 print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n";
691 }
692 } elsif (%splicing) {
693 print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n";
694 } elsif (%ncrna) {
695 print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n";
696 } elsif (%utr5 or %utr3) {
697 if (%utr5 and %utr3) {
698 print OUT "UTR5;UTR3\t", join(",", sort keys %utr5), ";", join(",", sort keys %utr3), "\t", $nextline, "\n"; #use ";" to separate UTR5 and UTR3 genes
699 } elsif (%utr5) {
700 print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n";
701 } else {
702 print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n";
703 }
704 } elsif (%intronic) {
705 print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n";
706 } elsif (%upstream or %downstream) {
707 if (%upstream and %downstream) {
708 print OUT "upstream;downstream\t", join(",", sort keys %upstream), ";", join(",", sort keys %downstream), "\t", $nextline, "\n";
709 } elsif (%upstream) {
710 print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n";
711 } else {
712 print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n";
713 }
714 } elsif (%intergenic) {
715 $genel ||= "NONE";
716 $gener ||= "NONE";
717 $distl ||= "NONE";
718 $distr ||= "NONE";
719 print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n";
720 } else {
721 die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n";
722 }
723 }
724 }
725 %refseqvar and annotateExonicVariants (\%refseqvar, $geneidmap, $cdslen, $mrnalen);
726
727 return ($linecount, $invalidcount);
728 }
729
730 sub annotateExonicVariants {
731 my ($refseqvar, $geneidmap, $cdslen, $mrnalen) = @_;
732 my $refseqhash;
733 my $function = {};
734 my %varinfo; #variants information (same as input line)
735
736 $refseqhash = readSeqFromFASTADB ($refseqvar);
737
738 for my $seqid (keys %$refseqvar) {
739 for my $i (0 .. @{$refseqvar->{$seqid}}-1) {
740 my ($refcdsstart, $refvarstart, $refvarend, $refstrand, $index, $exonpos, $nextline) = @{$refseqvar->{$seqid}->[$i]};
741 my ($wtnt3, $wtnt3_after, @wtnt3, $varnt3, $wtaa, $wtaa_after, $varaa, $varpos); #wtaa_after is the aa after the wtaa
742 my ($chr, $start, $end, $ref, $obs);
743
744 my @nextline = split (/\s+/, $nextline);
745 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
746 ($ref, $obs) = (uc $ref, uc $obs);
747 $zerostart and $start++;
748 $chr =~ s/^chr//;
749
750 $varinfo{$index} = $nextline;
751
752 if (not $refseqhash->{$seqid}) { #this refseq do not have FASTA sequence so cannot be interrogated
753 $function->{$index}{unknown} = "UNKNOWN";
754 next;
755 }
756
757 my $fs = (($refvarstart-$refcdsstart) % 3);
758 if ($refvarstart-$fs-1 > length($refseqhash->{$seqid})) {
759 printerr "WARNING: Potential database annotation error seqid=$seqid, refvarstart=$refvarstart, fs=$fs, seqlength=", length($refseqhash->{$seqid}), " refcdsstart=$refcdsstart, with inputline=$nextline\n";
760 next;
761 }
762
763 $wtnt3 = substr ($refseqhash->{$seqid}, $refvarstart-$fs-1, 3);
764 if (length ($refseqhash->{$seqid}) >= $refvarstart-$fs+3) { #going into UTR
765 $wtnt3_after = substr ($refseqhash->{$seqid}, $refvarstart-$fs+2, 3);
766 } else {
767 $wtnt3_after = ''; #last amino acid in the sequence without UTR (extremely rare situation) (example: 17 53588444 53588444 - T 414 hetero)
768 }
769 @wtnt3 = split (//, $wtnt3);
770 if (@wtnt3 != 3 and $refvarstart-$fs-1>=0) { #some times there are database annotation errors (example: chr17:3,141,674-3,141,683), so the last coding frame is not complete and as a result, the cDNA sequence is not complete
771 $function->{$index}{unknown} = "UNKNOWN";
772 next;
773 }
774
775 if ($refstrand eq '-') { #change the observed nucleotide to the reverse strand
776 $obs = revcom ($obs);
777 }
778
779 if ($start == $end) {
780 if ($ref eq '-') { #insertion variant
781 #the insertion coordinate system in ANNOVAR always uses "position after the current site"
782 #in positive strand, this is okay
783 #in negative strand, the "after current site" becomes "before current site" during transcription
784 #therefore, appropriate handling is necessary to take this into account
785 #for example, for a trinucleotide GCC with frameshift of 1 and insertion of CCT
786 #in positive strand, it is G-CTT-CC
787 #but if the transcript is in negative strand, the genomic sequence should be GC-CCT-C, and transcript is G-AGG-GC
788 if ($refstrand eq '+') {
789 if ($fs == 1) {
790 $varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2];
791 } elsif ($fs == 2) {
792 $varnt3 = $wtnt3[0] . $wtnt3[1] . $wtnt3[2] . $obs;
793 } else {
794 $varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2];
795 }
796 } elsif ($refstrand eq '-') {
797 if ($fs == 1) {
798 $varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2];
799 } elsif ($fs == 2) {
800 $varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2];
801 } else {
802 $varnt3 = $obs . $wtnt3[0] . $wtnt3[1] . $wtnt3[2];
803 }
804 }
805 ($wtaa, $wtaa_after, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($wtnt3_after), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1);
806 $wtaa_after and $wtaa_after eq '*' and $wtaa_after = 'X'; #wtaa_after could be undefined, if the current aa is the stop codon (X) (example: 17 53588444 53588444 - T)
807
808 my $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarstart-$refcdsstart+2) . "ins$obs"; #cDNA level annotation
809 if (length ($obs) % 3 == 0) {
810 if ($wtaa eq '*') { #mutation on stop codon
811 if ($varaa =~ m/\*/) {
812 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before
813 $function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; #stop codon is stil present
814 } else {
815 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; #stop codon is lost
816 }
817 } else {
818 if ($varaa =~ m/\*/) {
819 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before
820 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,";
821 } else {
822 $function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,";
823 }
824 }
825 } else {
826 if ($wtaa eq '*') { #mutation on stop codon
827 if ($varaa =~ m/\*/) { #in reality, this cannot be differentiated from non-frameshift insertion, but we'll still call it frameshift
828 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before
829 $function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,";
830 } else {
831 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,";
832 }
833 } else {
834 if ($varaa =~ m/\*/) {
835 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before
836 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "_$wtaa_after" . ($varpos+1) . "delins$varaa,";
837 } else {
838 $function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,";
839 }
840 }
841 }
842 } elsif ($obs eq '-') { #single nucleotide deletion
843 my $deletent;
844 if ($fs == 1) {
845 $deletent = $wtnt3[1];
846 $varnt3 = $wtnt3[0].$wtnt3[2].$wtnt3_after;
847 } elsif ($fs == 2) {
848 $deletent = $wtnt3[2];
849 $varnt3 = $wtnt3[0].$wtnt3[1].$wtnt3_after;
850 } else {
851 $deletent = $wtnt3[0];
852 $varnt3 = $wtnt3[1].$wtnt3[2].$wtnt3_after;
853 }
854 ($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1);
855 my $canno = "c." . ($refvarstart-$refcdsstart+1) . "del$deletent";
856 if ($wtaa eq '*') { #mutation on stop codon
857 if ($varaa =~ m/\*/) { #stop codon is still stop codon
858 $function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "X,"; #changed fsdel to nfsdel on 2011feb19
859 } else { #stop codon is lost
860 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "$varaa,";
861 }
862 } else {
863 if ($varaa =~ m/\*/) { #new stop codon created
864 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "X,";
865 } else {
866 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,";
867 }
868 }
869 } elsif (length ($obs) > 1) { #block substitution (since start==end, this changed from 1nt to several nt)
870 if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) {
871 $function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,";
872 } else {
873 $function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,";
874 }
875 } else { #single nucleotide substitution variant
876 my $canno;
877 if ($fs == 1) {
878 $varnt3 = $wtnt3[0] . $obs . $wtnt3[2];
879 $canno = "c.$wtnt3[1]" . ($refvarstart-$refcdsstart+1) . $obs;
880 } elsif ($fs == 2) {
881 $varnt3 = $wtnt3[0] . $wtnt3[1]. $obs;
882 $canno = "c.$wtnt3[2]" . ($refvarstart-$refcdsstart+1) . $obs;
883 } else {
884 $varnt3 = $obs . $wtnt3[1] . $wtnt3[2];
885 $canno = "c.$wtnt3[0]" . ($refvarstart-$refcdsstart+1) . $obs;
886 }
887 ($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1);
888
889 if ($wtaa eq $varaa) {
890 $wtaa eq '*' and ($wtaa, $varaa) = qw/X X/; #change * to X in the output
891 $function->{$index}{ssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,";
892 } elsif ($varaa eq '*') {
893 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa${varpos}X,";
894 } elsif ($wtaa eq '*') {
895 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos$varaa,";
896 } else {
897 $function->{$index}{nssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,";
898 }
899 }
900 } elsif ($obs eq '-') { #deletion variant involving several nucleotides
901 ($wtaa, $varpos) = (translateDNA ($wtnt3), int(($refvarstart-$refcdsstart)/3)+1); #wildtype amino acid, position of amino acid
902 my ($varposend, $canno); #the position of the last amino acid in the deletion
903 if ($refvarstart<=$refcdsstart) { #since the first amino acid is deleted, the whole gene is considered deleted
904 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:wholegene,"; #it is exonic variant, so the varend has to hit the first exon
905 } elsif ($refvarend >= $cdslen->{$seqid}+$refcdsstart) { #3' portion of the gene is deleted
906 $varposend = int ($cdslen->{$seqid}/3); #cdslen should be multiples of 3, but just in case of database mis-annotation
907 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($cdslen->{$seqid}+$refcdsstart-1) . "del";
908 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
909 } elsif (($refvarend-$refvarstart+1) % 3 == 0) {
910 $varposend = int (($refvarend-$refcdsstart)/3) + 1;
911 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del";
912 $function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
913 } else {
914 $varposend = int (($refvarend-$refcdsstart)/3) + 1;
915 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del";
916 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
917 }
918 } else { #block substitution event
919 if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) {
920 $function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,";
921 } else {
922 $function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,";
923 }
924 }
925 }
926 }
927
928 for my $index (sort {$a<=>$b} keys %$function) {
929 if ($separate) { #print out each type of exonic mutations separately (one effect in one line), rather than printing out only the most important function
930 if ($function->{$index}{fsins}) {
931 print EXONIC "line$index\t", "frameshift insertion\t$function->{$index}{fsins}\t", $varinfo{$index}, "\n";
932 }
933 if ($function->{$index}{fsdel}) {
934 print EXONIC "line$index\t", "frameshift deletion\t$function->{$index}{fsdel}\t", $varinfo{$index}, "\n";
935 }
936 if ($function->{$index}{fssub}) {
937 print EXONIC "line$index\t", "frameshift substitution\t$function->{$index}{fssub}\t", $varinfo{$index}, "\n";
938 }
939 if ($function->{$index}{stopgain}) {
940 print EXONIC "line$index\t", "stopgain SNV\t$function->{$index}{stopgain}\t", $varinfo{$index}, "\n";
941 }
942 if ($function->{$index}{stoploss}) {
943 print EXONIC "line$index\t", "stoploss SNV\t$function->{$index}{stoploss}\t", $varinfo{$index}, "\n";
944 }
945 if ($function->{$index}{nfsins}) {
946 print EXONIC "line$index\t", "nonframeshift insertion\t$function->{$index}{nfsins}\t", $varinfo{$index}, "\n";
947 }
948 if ($function->{$index}{nfsdel}) {
949 print EXONIC "line$index\t", "nonframeshift deletion\t$function->{$index}{nfsdel}\t", $varinfo{$index}, "\n";
950 }
951 if ($function->{$index}{nfssub}) {
952 print EXONIC "line$index\t", "nonframeshift substitution\t$function->{$index}{nfssub}\t", $varinfo{$index}, "\n";
953 }
954 if ($function->{$index}{nssnv}) {
955 print EXONIC "line$index\t", "nonsynonymous SNV\t$function->{$index}{nssnv}\t", $varinfo{$index}, "\n";
956 }
957 if ($function->{$index}{ssnv}) {
958 print EXONIC "line$index\t", "synonymous SNV\t$function->{$index}{ssnv}\t", $varinfo{$index}, "\n";
959 }
960 if ($function->{$index}{unknown}) {
961 print EXONIC "line$index\t", "unknown\t$function->{$index}{unknown}\t", $varinfo{$index}, "\n";
962 }
963 } else { #print out only the most important functional changes (for example, chr3:9931279-9931279 G->A can be both non-synonymous and synonymous mutations based on UCSC gene model)
964 print EXONIC "line$index\t";
965 my $sortout;
966 if ($sortout = $function->{$index}{fsins}) {
967 $exonsort and $sortout = sortExonicAnnotation ($sortout);
968 print EXONIC "frameshift insertion\t$sortout\t";
969 } elsif ($sortout = $function->{$index}{fsdel}) {
970 $exonsort and $sortout = sortExonicAnnotation ($sortout);
971 print EXONIC "frameshift deletion\t$sortout\t";
972 } elsif ($sortout = $function->{$index}{fssub}) {
973 $exonsort and $sortout = sortExonicAnnotation ($sortout);
974 print EXONIC "frameshift substitution\t$sortout\t";
975 } elsif ($sortout = $function->{$index}{stopgain}) {
976 $exonsort and $sortout = sortExonicAnnotation ($sortout);
977 print EXONIC "stopgain SNV\t$sortout\t";
978 } elsif ($sortout = $function->{$index}{stoploss}) {
979 $exonsort and $sortout = sortExonicAnnotation ($sortout);
980 print EXONIC "stoploss SNV\t$sortout\t";
981 } elsif ($sortout = $function->{$index}{nfsins}) {
982 $exonsort and $sortout = sortExonicAnnotation ($sortout);
983 print EXONIC "nonframeshift insertion\t$sortout\t";
984 } elsif ($sortout = $function->{$index}{nfsdel}) {
985 $exonsort and $sortout = sortExonicAnnotation ($sortout);
986 print EXONIC "nonframeshift deletion\t$sortout\t";
987 } elsif ($sortout = $function->{$index}{nfssub}) {
988 $exonsort and $sortout = sortExonicAnnotation ($sortout);
989 print EXONIC "nonframeshift substitution\t$sortout\t";
990 } elsif ($sortout = $function->{$index}{nssnv}) {
991 $exonsort and $sortout = sortExonicAnnotation ($sortout);
992 print EXONIC "nonsynonymous SNV\t$sortout\t";
993 } elsif ($sortout = $function->{$index}{ssnv}) {
994 $exonsort and $sortout = sortExonicAnnotation ($sortout);
995 print EXONIC "synonymous SNV\t$sortout\t";
996 } elsif ($sortout = $function->{$index}{unknown}) {
997 $exonsort and $sortout = sortExonicAnnotation ($sortout);
998 print EXONIC "unknown\t$sortout\t";
999 }
1000 print EXONIC $varinfo{$index}, "\n";
1001 }
1002 }
1003 }
1004
1005 sub sortExonicAnnotation {
1006 my ($anno) = @_;
1007 my @anno1 = split (/,/, $anno);
1008 my @anno2;
1009 for my $i (0 .. @anno1-1) {
1010 my @temp = split (/:/, $anno1[$i]);
1011 $temp[2] =~ s/^exon//;
1012 push @anno2, [$anno1[$i], @temp];
1013 }
1014 @anno2 = sort {$a->[3] <=> $b->[3] or $a->[2] cmp $b->[2]} @anno2; #first sort by exon number, then by transcript name
1015 my @anno3 = map {$_->[0]} @anno2;
1016 return join (',', @anno3);
1017 }
1018
1019 sub filterQuery {
1020 open (FIL, ">$outfile.${buildver}_${dbtype1}_filtered") or die "Error: cannot write to output file $outfile.${buildver}_${dbtype1}_filtered: $!\n";
1021 open (DROPPED, ">$outfile.${buildver}_${dbtype1}_dropped") or die "Error: cannot write to output file $outfile.${buildver}_${dbtype1}_dropped: $!\n";
1022 open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n";
1023
1024 printerr "NOTICE: Variants matching filtering criteria are written to $outfile.${buildver}_${dbtype1}_dropped, other variants are written to $outfile.${buildver}_${dbtype1}_filtered\n";
1025
1026 open (QUERY, $queryfile) or die "Error: cannot read from query file $queryfile: $!\n";
1027
1028 my (%variant, $filedone, $batchdone);
1029 my ($linecount, $batchlinecount, $invalid, $invalidcount) = (0, 0);
1030 my ($chr, $start, $end, $ref, $obs, $info);
1031 while (1) {
1032 $_ = <QUERY>;
1033 if (not defined $_) {
1034 $filedone++;
1035 } else {
1036 s/[\r\n]+$//;
1037
1038 if (m/^#/ and $comment) { #comment line start with #, do not include this is $linecount
1039 print FIL "$_\n";
1040 print DROPPED "#comment\t#comment\t$_\n";
1041 next;
1042 }
1043
1044 $linecount++;
1045 $batchlinecount++;
1046 if ($batchlinecount == $batchsize) {
1047 $batchdone++;
1048 }
1049
1050 if ($memfree or $memtotal) { #if these arguments are specified
1051 if ($linecount =~ m/00000$/) { #about 40Mb memory per 10k lines for a typical input dataset
1052 my ($availmem, $allmem) = currentAvailMemory();
1053 $verbose and printerr "NOTICE: Current available system memory is $availmem kb (this program uses $allmem bytes memory), after reading $linecount query\n";
1054 if ($availmem and $availmem <= $memfree+50_000) { #some subsequent steps may take ~50Mb memory, so here we try to allocate some more memory
1055 $batchdone++;
1056 }
1057 if ($memtotal and $allmem >= $memtotal-50_000) { #when --memtotal is specified, ensure that program use less memory
1058 $batchdone++;
1059 }
1060 }
1061 }
1062
1063 $invalid = 0; #reset invalid status
1064
1065 my @nextline = split (/\s+/, $_);
1066 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
1067 if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
1068 $invalid++;
1069 } else {
1070 ($ref, $obs) = (uc $ref, uc $obs);
1071 $zerostart and $start++;
1072 $chr =~ s/^chr//;
1073 if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
1074 $invalid++;
1075 } elsif ($ref eq '-' and $obs eq '-' #both are empty allele
1076 or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code
1077 or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code
1078 or $start =~ m/[^\d]/ #start is not a number
1079 or $end =~ m/[^\d]/ #end is not a number
1080 or $start > $end #start is more than end
1081 or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref
1082 or $ref eq '-' and $start != $end #length mismatch for insertion
1083 ) {
1084 $invalid++;
1085 }
1086 }
1087
1088 if ($invalid) {
1089 print INVALID $_, "\n"; #invalid record found
1090 $invalidcount++;
1091 next;
1092 }
1093
1094 if ($start == $end and $ref eq '-') { #insertion
1095 $obs = "0$obs";
1096 } elsif ($obs eq '-') { #deletion
1097 $obs = $end-$start+1;
1098 } elsif ($end>$start or $start==$end and length($obs)>1) { #block substitution #fixed the bug here 2011feb19
1099 $obs = ($end-$start+1) . $obs;
1100 }
1101
1102 if (exists $variant{$chr, $start, $obs}) {
1103 $variant{$chr, $start, $obs} .= "\n$_";
1104 } else {
1105 $variant{$chr, $start, $obs} = "$ref\n$_";
1106 }
1107 }
1108
1109 if ($filedone or $batchdone) {
1110 printerr "NOTICE: Processing next batch with ${\(scalar keys %variant)} unique variants in $batchlinecount input lines\n";
1111 filterNextBatch (\%variant);
1112 %variant = ();
1113 $batchlinecount = 0; #reset the line count for this batch
1114 $batchdone = 0;
1115 }
1116 if ($filedone) {
1117 last;
1118 }
1119 }
1120 close (INVALID); close (DROPPED); close (FIL);
1121 if ($invalidcount) {
1122 printerr "NOTICE: Variants with invalid input format were written to $outfile.invalid_input\n";
1123 } else {
1124 unlink ("$outfile.invalid_input");
1125 }
1126 }
1127
1128 sub filterNextBatch {
1129 my ($variant) = @_;
1130 my $dbfile;
1131
1132 if ($dbtype1 eq 'generic') {
1133 $dbfile = File::Spec->catfile ($dbloc, $genericdbfile);
1134 } elsif ($dbtype1 eq 'vcf') {
1135 $dbfile = File::Spec->catfile ($dbloc, $vcfdbfile);
1136 } else {
1137 $dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt");
1138 }
1139
1140 open (DB, $dbfile) or die "Error: cannot read from input database file $dbfile: $!\n";
1141 printerr "NOTICE: Scanning filter database $dbfile...";
1142
1143 my (@record, $chr, $start, $end, $ref, $obs, $score, $qual, $fil, $info);
1144 my ($rsid, $strand, $ucscallele, $twoallele, $class, $af, $attribute);
1145 my $count_invalid_dbline;
1146 while (<DB>) {
1147 my (@obs2, @score2); #for 1000G2010 data set in VCF format, some tri-allelic SNPs are present; in the future, some quad-allelic SNPs may be also present in VCF files
1148 s/[\r\n]+$//;
1149 m/\S/ or next; #skip empty lines in the database file (sometimes this occurs)
1150 m/^#/ and next; #skip the comment line
1151 if ($dbtype eq 'avsift') {
1152 @record = split (/\t/, $_);
1153 @record == 8 or die "Error: invalid record found in DB file $dbfile (8 tab-delimited fields expected): <$_>\n";
1154 ($chr, $start, $end, $ref, $obs, $score) = @record;
1155 if ($chromosome) {
1156 $valichr{$chr} or next;
1157 }
1158 if ($score < $sift_threshold) { #this is a deleterious mutation, skip it (equal sign should not be used, otherwise the score=0 will be skipped)
1159 next;
1160 }
1161 } elsif ($dbtype =~ m/^ljb_/) {
1162 @record = split (/\t/, $_);
1163 @record >= 5 or die "Error: invalid record found in DB file $dbfile (at least 5 tab-delimited fields expected): <$_>\n";
1164 ($chr, $start, $end, $ref, $obs, $score) = @record;
1165 if ($chromosome) {
1166 $valichr{$chr} or next;
1167 }
1168 if (defined $score and defined $score_threshold and $score < $score_threshold) {
1169 next;
1170 }
1171 } elsif ($dbtype =~ m/^snp\d+/) {
1172 @record = split (/\t/, $_, -1); #-1 is required before some dbSNP records have many empty tab fields in the end
1173 @record == 18 or @record == 26 or die "Error: invalid record found in dbSNP database file $dbfile (18 or 26 fields expected but found ${\(scalar @record)}): <$_>\n" . join("\n",@record);
1174 $record[1] =~ s/^chr// or die "Error: invalid record found in DB file (2nd field should start with 'chr'): <$_>\n";
1175 ($chr, $start, $end, $rsid, $strand, $ucscallele, $twoallele, $class) = @record[1,2,3,4,6,8,9,11];
1176 $start++; #UCSC use zero-start system
1177 if ($chromosome) {
1178 $valichr{$chr} or next;
1179 }
1180 unless ($class eq 'single' or $class eq 'deletion' or $class eq 'in-del' or $class eq 'insertion') { #enum('unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion')
1181 next;
1182 }
1183
1184 my @allele = split (/\//, $twoallele);
1185
1186 #before Jan 2011, only di-allelic SNPs are handled in ANNOVAR
1187 #@allele == 2 or next; #many entries have no allele information (for example, rs71010435)
1188 #in Jan 2011 version, I decided to handle tri-allelic and quad-allelic SNP as well
1189
1190 @allele >= 2 or next; #Jan 2011 modification
1191 if ($strand eq '-') { #handle reverse strand annotation (the vast majority of records in dbSNP should be already in + strand)
1192 for my $i (0 .. @allele-1) {
1193 $allele[$i] = revcom ($allele[$i]);
1194 }
1195 #$ucscallele = revcom ($ucscallele); #added Jan 24, 2011 (per Kevin Ha) removed Feb 10, 2011 (per Eric Stawiski)
1196 #note that some SNPs (e.g., rs28434453) may have multiple location in diferent chromosome or strand; I may want to handle this by a special flag in the future
1197 #585 chr1 13301 13302 rs28434453 0 - C C C/T genomic single etc...
1198 #1367 chr15 102517867 102517868 rs28434453 0 + G G C/T genomic single etc...
1199 }
1200
1201 #in-del is usually annotated below, so they require special treatment
1202 #587 chr1 384538 384539 rs3971283 0 + T T -/ATT genomic in-del unknown 0 0 unknown exact 3
1203 if ($class eq 'in-del') { #indel are usually annotated as -/xxx, where xxx is the alternative allele
1204 $obs = length ($ucscallele) . $allele[1]; #prefix a number before the alleles, indicating block substitution
1205 defined $allele[1] or die "no allele 1 <$_>";
1206 } elsif ($class eq 'insertion') {
1207 $start--;
1208 $obs = "0$allele[1]";
1209 } elsif ($class eq 'deletion') {
1210 $obs = length ($ucscallele);
1211 } else {
1212 for my $i (0 .. @allele-1) {
1213 if ($ucscallele eq $allele[$i]) {
1214 @obs2 = @allele;
1215 splice (@obs2, $i, 1);
1216 for my $j (0 .. @obs2-1) {
1217 push @score2, $rsid;
1218 }
1219 }
1220 }
1221 if (@obs2) {
1222 $obs = shift @obs2;
1223 $score = shift @score2;
1224 } else {
1225 $verbose and printerr ("Database error: wildtype base $ucscallele is not part of the allele description in <$_>\n");
1226 next;
1227 }
1228 }
1229 $score = $rsid;
1230 } elsif ($dbtype =~ m/^1000g_(\w+)/ or $dbtype =~ m/^1000g2010_(\w+)/ or $dbtype =~ m/^1000g2010\w\w\w_(\w+)/) { #dbtype1 should NOT be used here
1231 @record = split (/\t/, $_);
1232 @record == 5 or @record == 6 or die "Error: invalid record found in 1000G database file $dbfile (5 or 6 fields expected): <$_>\n";
1233 ($chr, $start, $ref, $obs, $af) = @record; #there is no "END" in 1000G input file
1234 if ($chromosome) {
1235 $valichr{$chr} or next;
1236 }
1237 if ($maf_threshold) {
1238 if ($af > 0.5) { #the frequency is the non-reference allele frequency, which could exceed 0.5
1239 1-$af >= $maf_threshold or next;
1240 } else {
1241 $af >= $maf_threshold or next;
1242 }
1243 }
1244 $score = $af;
1245 } elsif ($dbtype eq 'generic') {
1246 ($chr, $start, $end, $ref, $obs, $score) = split (/\t/, uc $_); #make sure to use upper case, as query is always in upper case
1247 defined $obs or die "Error: the generic database file must contains at least five tab-delimited fields per line (but observed line: $_)\n";
1248 defined $score or $score = "NA";
1249 if ($chromosome) {
1250 $valichr{$chr} or next;
1251 }
1252 defined $obs or die "Error: invalid record found in DB file $dbfile (at least 5 fields expected for 'generic' dbtype): <$_>\n";
1253 if ($start == $end and $ref eq '-') { #insertion
1254 $obs = "0$obs";
1255 }
1256 if ($obs eq '-') { #deletion
1257 $obs = $end-$start+1;
1258 } elsif ($start != $end) { #block substitution
1259 $obs = ($end-$start+1) . $obs;
1260 }
1261 if (defined $score and defined $score_threshold and $score < $score_threshold) {
1262 next;
1263 }
1264 } elsif ($dbtype eq 'vcf') { #vcf file is adopted by 1000 Genomes Project; it can describe both SNPs and indels, and it may contain both summary level statistics and individual level genotype calls
1265 ($chr, $start, $rsid, $ref, $obs, $qual, $fil, $info) = split (/\t/, $_);
1266 if ($chromosome) {
1267 $valichr{$chr} or next;
1268 }
1269
1270 my ($ac, $an);
1271
1272 if ($info =~ m/AF=([^;]+)/) {
1273 $score = $1;
1274 if ($obs =~ m/(\w),(\w)/) { #1000G November; this format is not really valid because it does not handle tri-allelic SNP
1275 ($obs, @obs2) = ($1, $2);
1276 @score2 = ($score);
1277 }
1278 } elsif ($info =~ m/AC=(\S+?);AN=(\d+)/) {
1279 my ($alleles, $count) = ($1, $2);
1280 if ($alleles =~ m/^(\d+),(.+)/) {
1281 $score = sprintf ("%.3f", $1/$count);
1282 @score2 = split (/,/, $2);
1283 @score2 = map {sprintf("%.3f", $_/$count)} @score2;
1284 ($obs, @obs2) = split (/,/, $obs); #the obs is composed of two alleles
1285 } else {
1286 $af = sprintf ("%.3f", $alleles/$count);
1287 $score = $af;
1288 #this is an invalid record in 1000GJuly: 1 2266231 rs11589451 C T,A . PASS AA=c;AC=20;AN=120;DP=237
1289 if ($obs =~ m/(\w),/) {
1290 $count_invalid_dbline++;
1291 $verbose and printerr "WARNING: Invalid input line found in $dbfile (more than one alleles are observed, but only one is annotated with allelic counts): <$_>\n";
1292 next;
1293 }
1294 }
1295 } else {
1296 printerr "WARNING: the VCF file does not contain allele frequency information. ANNOVAR cannot process this file\n";
1297 exit;
1298 }
1299
1300 if (length ($ref) == 1 and length ($obs) == 1) {#single base substitution
1301 1; #the obs and obs2 is already handled
1302 } elsif ($obs =~ m/^\-((\w)(\w*))$/) { #deletion (1000G March)
1303 $2 eq $ref or $ref eq 'N' or die "Error: mismatch of deleted allele and reference allele: <$_>\n";
1304 $obs = length ($1);
1305 } elsif ($obs =~ m/^\+(\w+)$/) { #insertion (1000G March)
1306 $obs = "0$1";
1307 } elsif ($ref =~ m/^[ACGTN]+$/ and $obs =~ m/^[ACGTN]+$/) {
1308 if (length ($obs) == 1) { #deletion (1000G July)
1309 substr ($ref, 0, 1) eq $obs or die "Error: mismatch of deleted allele and reference allele: ref=$ref obs=$obs in <$_>\n";
1310 $start++;
1311 $obs = length ($ref)-1;
1312 } elsif (length ($ref) == 1) { #duplication (1000G July)
1313 substr ($obs, 0, 1) eq $ref or die "Error: mismatch of duplicated allele and reference allele: ref=$ref obs=$obs in <$_>\n";
1314 $start++;
1315 $obs = "0" . substr ($obs, 1);
1316 }
1317 } else {
1318 die "Error: invalid record found in VCF file: ref=$ref obs=$obs <$_>\n";
1319 }
1320 } else {
1321 die "invalid dbtype: $dbtype\n";
1322 }
1323
1324 if ($variant->{$chr, $start, $obs}) {
1325 my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs}); #most likely, only one piece of information
1326 for my $i (0 .. @info-1) {
1327 print DROPPED join ("\t", $dbtype, $score), "\t", $info[$i], "\n";
1328 }
1329 delete $variant->{$chr, $start, $obs};
1330 }
1331 if (@obs2) {
1332 for my $j (0 .. @obs2-1) {
1333 if ($variant->{$chr, $start, $obs2[$j]}) {
1334 my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs2[$j]}); #most likely, only one piece of information
1335 for my $i (0 .. @info-1) {
1336 print DROPPED join ("\t", $dbtype, $score2[$j]), "\t", $info[$i], "\n";
1337 }
1338 delete $variant->{$chr, $start, $obs2[$j]};
1339 }
1340 }
1341 }
1342 }
1343 for my $key (keys %$variant) {
1344 my ($chr, $start, $obs) = split ($;, $key); #hash key separator
1345 my ($ref, @info) = split (/\n/, $variant->{$key});
1346 my $len;
1347 if ($obs =~ m/^(\d+)(.*)/) {
1348 ($len, $obs) = ($1, $2);
1349 $obs ||= '-'; #deletion
1350 if ($len) {
1351 $end = $start+$len-1;
1352 } else {
1353 $end = $start;
1354 }
1355 } else {
1356 $end = $start;
1357 }
1358 for my $i (0 .. @info-1) {
1359 print FIL $info[$i], "\n";
1360 }
1361 }
1362 printerr "Done\n";
1363 $count_invalid_dbline and printerr "WARNING: $count_invalid_dbline lines in dbfile $dbfile were ignored due to invalid formats\n";
1364 }
1365
1366 sub annotateQueryByRegion {
1367 open (QUERY, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n";
1368 open (OUT, ">$outfile.${buildver}_$dbtype1") or die "Error: cannot write to output file $outfile.${buildver}_$dbtype1: $!\n";
1369 open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n";
1370
1371 my ($regiondb, $parent) = ({}, {});
1372
1373 if ($dbtype eq 'gff3') {
1374 ($regiondb, $parent) = readGFF3RegionAnnotation ();
1375 } elsif ($dbtype eq 'bed') {
1376 ($regiondb) = readBedRegionAnnotation ();
1377 } else {
1378 ($regiondb) = readUCSCRegionAnnotation ();
1379 }
1380
1381 my ($chr, $start, $end, $ref, $obs);
1382 my ($invalid);
1383 my ($linecount, $invalidcount) = qw/0 0/;
1384
1385 $time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n";
1386 while (<QUERY>) {
1387 s/[\r\n]+$//;
1388
1389 if (m/^#/ and $comment) { #comment line start with #, do not include this is $linecount
1390 print OUT "#comment\t#comment\t$_\n";
1391 next;
1392 }
1393
1394 $linecount++;
1395
1396 $invalid = 0; #reset invalid status
1397
1398 my @nextline = split (/\s+/, $_);
1399 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
1400 if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
1401 $invalid++;
1402 } else {
1403 ($ref, $obs) = (uc $ref, uc $obs);
1404 $zerostart and $start++;
1405 $chr =~ s/^chr//;
1406 if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
1407 $invalid++;
1408 } elsif ($ref eq '-' and $obs eq '-' #both are empty allele
1409 or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code
1410 or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code
1411 or $start =~ m/[^\d]/ #start is not a number
1412 or $end =~ m/[^\d]/ #end is not a number
1413 or $start > $end #start is more than end
1414 or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref
1415 or $ref eq '-' and $start != $end #length mismatch for insertion
1416 ) {
1417 $invalid++;
1418 }
1419 }
1420
1421
1422 if ($invalid) {
1423 print INVALID $_, "\n"; #invalid record found
1424 $invalidcount++;
1425 next;
1426 }
1427
1428 my $bin1 = int ($start/$genomebinsize); #start bin
1429 my $bin2 = int ($end/$genomebinsize); #end bin (usually same as start bin, unless the query is really big that spans multiple megabases)
1430 my ($foundhit, $score, $name);
1431 for my $bin ($bin1 .. $bin2) {
1432 for my $nextgene (@{$regiondb->{$chr, $bin}}) {
1433 my ($txstart, $txend, $txscore, $txname) = @$nextgene;
1434
1435 if ($end < $txstart) {
1436 #db: <------------------------->
1437 #query: <--->
1438 last; #if genomic region is too far away from end, end the search of the bins
1439 } elsif ($end <= $txend) { #query contained completely within db region
1440 if ($start >= $txstart) {
1441 #db: <-------------------------->
1442 #query: <------------------>
1443 } else { #query overlap but upstream of db region
1444 #db: <------------------------->
1445 #query: <---------------------->
1446 if ($minqueryfrac) {
1447 if (($end-$txstart+1)/($end-$start+1) < $minqueryfrac) {
1448 next;
1449 }
1450 }
1451 }
1452 $foundhit++;
1453 $score ||= $txscore; $name ||= $txname;
1454 if ($score < $txscore) {
1455 $score = $txscore;
1456 $name=$txname;
1457 }
1458 if ($score == $txscore and defined $name and $name ne $txname) {
1459 $name .= ",$txname";
1460 }
1461 if ($dbtype1 eq 'cytoBand') { #a new chromosome band is encountered
1462 $name ne $txname and $name .= ",$txname";
1463 }
1464 } elsif ($start <= $txend) {
1465 if ($start >= $txstart) { #query overlap but downstream of db region
1466 #db: <------------------------>
1467 #query: <----------------------->
1468 if ($minqueryfrac) {
1469 if (($txend-$start+1)/($end-$start+1) < $minqueryfrac) {
1470 next;
1471 }
1472 }
1473 } else {
1474 #db region completely contained within query
1475 #db: <------------------------->
1476 #query: <------------------------------>
1477 if ($minqueryfrac) {
1478 if (($txend-$txstart+1)/($end-$start+1) < $minqueryfrac) {
1479 next;
1480 }
1481 }
1482 }
1483 $foundhit++;
1484 $score ||= $txscore; $name ||= $txname;
1485 if ($score < $txscore) {
1486 $score = $txscore;
1487 $name=$txname;
1488 }
1489 if ($score == $txscore and defined $name and $name ne $txname) {
1490 $name .= ",$txname";
1491 }
1492 if ($dbtype1 eq 'cytoBand') { #a new chromosome band is encountered
1493 $name ne $txname and $name .= ",$txname";
1494 }
1495 } else {
1496 #query ---
1497 #gene <-*----*->
1498 }
1499 }
1500 }
1501 $linecount =~ m/000000$/ and printerr "NOTICE: Finished processing $linecount variants in queryfile\n";
1502 if ($foundhit) {
1503 $name ||= '';
1504 my @name = split (/,/, $name);
1505 my %name = map {$_, 1} @name;
1506 @name = keys %name;
1507
1508 if ($dbtype1 eq 'cytoBand') {
1509 map {s/^chr//} @name;
1510 if (@name >= 2) {
1511 $name[$#name] =~ s/^\d+//;
1512 $name = $name[0] . '-' . $name[$#name];
1513 } else {
1514 $name = $name[0];
1515 }
1516 print OUT "$dbtype\t$name\t$_", "\n";
1517 } else {
1518 $name = join (",", @name);
1519 print OUT "$dbtype\t", $score?"Score=$score;":"", $name?"Name=$name":"", "\t", $_, "\n";
1520 }
1521 }
1522 }
1523 close (QUERY);
1524 close (OUT);
1525 close (INVALID);
1526 $time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n";
1527
1528 printerr "NOTICE: Finished region-based annotation on $linecount genetic variants in $queryfile";
1529 if ($invalidcount) {
1530 printerr " (including $invalidcount with invalid format written to $outfile.invalid_input)";
1531 } else {
1532 unlink ("$outfile.invalid_input");
1533 }
1534 printerr "\n";
1535 printerr "NOTICE: Output files were written to $outfile.${buildver}_$dbtype1\n";
1536 }
1537
1538 sub readGFF3RegionAnnotation {
1539 my ($dbfile);
1540 my ($regioncount, $dbcount) = (0, 0);
1541 my (@record, %regiondb, %parent);
1542
1543 $dbfile = File::Spec->catfile ($dbloc, $gff3dbfile);
1544 -f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc -buildver $buildver' to download annotation database.\n";
1545
1546 open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
1547 printerr "NOTICE: Reading annotation database $dbfile ... ";
1548 $_ = <DB>;
1549 $_ =~ m/^##gff-version\s+3/ or die "Error: invalid header line found in the GFF3 database $dbfile (expect to see '##gff-version 3'): <$_>\n";
1550 while (<DB>) {
1551 m/^#/ and next; #skip comments line
1552 m/^##FASTA/ and last; #reached the FASTA sequence section of GFF3 file
1553 $dbcount++;
1554 s/[\r\n]+$//; #deleting the newline characters
1555 @record = split (/\t/, $_);
1556 @record == 9 or die "Error: invalid records found in the GFF3 database $dbfile (9 fields expected): <$_>\n";
1557 my ($chr, $start, $end, $score, $attribute) = @record[0,3,4,5,8];
1558 $chr=~s/^chr//; #sometimes the chr prefix is present and should be removed (query usually does not contain this chr prefix)
1559 my $name;
1560 defined $score_threshold and $score < $score_threshold and next; #if --score_threshold is set, the low scoring segment will be skipped
1561
1562 my @feature = split (/;/, $attribute);
1563 for my $i (0 .. @feature-1) {
1564 $feature[$i] =~ m/ID=(\S+)/ and $name = $1;
1565 }
1566 defined $name or die "Error: invalid record in GFF3 database $dbfile (ID field not found): <$_>\n";
1567 for my $i (0 .. @feature-1) {
1568 if ($feature[$i] =~ m/Parent=(.+)/) {
1569 my @parent = split (/,/, $1);
1570 for my $j (0 .. @parent-1) {
1571 $parent{$name} .= $parent[$j];
1572 }
1573 }
1574 }
1575
1576 my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
1577 for my $nextbin ($bin1 .. $bin2) {
1578 push @{$regiondb{$chr, $nextbin}}, [$start, $end, $score, $name];
1579 }
1580 $regioncount++;
1581 if ($verbose and $dbcount =~ m/000000$/) {
1582 my ($availmem, $allmem) = currentAvailMemory();
1583 printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
1584 }
1585 }
1586 close (DB);
1587 for my $key (keys %regiondb) { #pre-sort gene DB by txstart to faciliate future use
1588 @{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
1589 }
1590 printerr "Done with $regioncount regions\n";
1591 return (\%regiondb, \%parent);
1592 }
1593
1594 sub readBedRegionAnnotation {
1595 my ($dbfile);
1596 my ($regioncount, $dbcount) = (0, 0);
1597 my (@record, %regiondb);
1598 my ($chr, $start, $end);
1599
1600 $dbfile = File::Spec->catfile ($dbloc, $bedfile);
1601
1602 -f $dbfile or die "Error: required bedfile $dbfile does not exists.\n";
1603
1604 open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
1605 printerr "NOTICE: Reading annotation database $dbfile ... ";
1606
1607 while (<DB>) {
1608 $dbcount++;
1609 s/[\r\n]+$//; #deleting the newline characters
1610 @record = split (/\t/, $_);
1611
1612 ($chr, $start, $end) = @record;
1613
1614
1615 $chr =~ s/^chr//;
1616 $start++; #due to the zero-opening coordinate system in UCSC
1617
1618 my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
1619 for my $nextbin ($bin1 .. $bin2) {
1620 push @{$regiondb{$chr, $nextbin}}, [$start, $end, 0, 'NA'];
1621 }
1622 $regioncount++;
1623 if ($verbose and $dbcount =~ m/000000$/) {
1624 my ($availmem, $allmem) = currentAvailMemory();
1625 printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
1626 }
1627 }
1628 close (DB);
1629
1630 for my $key (keys %regiondb) { #pre-sort gene DB by txstart to faciliate future use
1631 @{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
1632 }
1633 printerr "Done with $regioncount regions\n";
1634 return (\%regiondb);
1635 }
1636
1637 sub readUCSCRegionAnnotation {
1638 my ($dbfile);
1639 my ($regioncount, $dbcount) = (0, 0);
1640 my (@record, %regiondb);
1641 my ($chr, $start, $end, $score, $normscore, $name);
1642 my ($expectedLength, @positionCols, @scoreCols, @colsToOutput);
1643
1644 if ($dbtype1 =~ m/^mce(\d+way)$/) {
1645 $dbfile = File::Spec->catfile ($dbloc, "${buildver}_phastConsElements$1.txt");
1646 } else {
1647 $dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt");
1648 }
1649 -f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc' to download annotation database.\n";
1650
1651 #################$$$
1652 ### The following SWITCH structure is modified Jan 2011 to faciliate future expansion
1653 ### $expectedLength is the number of cols expected in each line
1654 ### @postionCols => location of ($chr,$start,$end) columns
1655 ### @scoreCols => location of ($score, $normscore) columns leave empty is set not present (then set to zero below) ; WARNING must be empty or of length 2
1656 ### @colsToOutPut => location of ($name) columns to put into $name concatinated with ":" below
1657
1658 if ($dbtype1 =~ m/^phastConsElements\d+way/) {
1659 $expectedLength=6;
1660 @positionCols=(1,2,3);
1661 @scoreCols=(4,5); #normalized score
1662 @colsToOutput=(4); #lod=xxx is the Name output
1663 } elsif ($dbtype1 eq 'evofold') {
1664 $expectedLength=10;
1665 @positionCols=(1,2,3);
1666 @scoreCols=(5,5);
1667 @colsToOutput=(4);
1668 } elsif ($dbtype1 eq 'tfbsConsSites') {
1669 $expectedLength=8;
1670 @positionCols=(1,2,3);
1671 @scoreCols=(7,5);
1672 @colsToOutput=(4);
1673 } elsif ($dbtype1 eq 'wgRna') {
1674 $expectedLength=10;
1675 @positionCols=(1,2,3);
1676 @scoreCols=(5,5);
1677 @colsToOutput=(4);
1678 } elsif ($dbtype1 eq 'targetScanS') {
1679 $expectedLength=7;
1680 @positionCols=(1,2,3);
1681 @scoreCols=(5,5);
1682 @colsToOutput=(4);
1683 } elsif ($dbtype1 eq 'genomicSuperDups') {
1684 $expectedLength=30;
1685 @positionCols=(1,2,3);
1686 @scoreCols=(27,27);
1687 @colsToOutput=(4);
1688 } elsif ($dbtype1 eq 'omimGene') {
1689 $expectedLength=5;
1690 @positionCols=(1,2,3);
1691 @scoreCols=();
1692 @colsToOutput=(4);
1693 } elsif ($dbtype1 eq 'gwasCatalog') {
1694 $expectedLength=23;
1695 @positionCols=(1,2,3);
1696 @scoreCols=();
1697 @colsToOutput=(10);
1698 } elsif ($dbtype1 eq 'dgv') {
1699 $expectedLength=16;
1700 @positionCols=(1,2,3);
1701 @scoreCols=();
1702 @colsToOutput=(4);
1703 } elsif ($dbtype1 eq 'cytoBand') { #special handling required
1704 $expectedLength=5;
1705 @positionCols=(0,1,2);
1706 @scoreCols=();
1707 @colsToOutput=(0,3);
1708 } elsif ($dbtype1 =~ m/^chr\w+_chainSelf$/) { #example: chr1_selfChain
1709 $expectedLength=13;
1710 @positionCols=(2,4,5);
1711 @scoreCols=(12,12);
1712 @colsToOutput=(11);
1713 } elsif ($dbtype1 =~ m/^chr\w+_chain\w+$/) { #example: chr1_chainPanTro2
1714 $expectedLength=12;
1715 @positionCols=(2,4,5);
1716 @scoreCols=();
1717 @colsToOutput=(11);
1718 } elsif ($dbtype1 eq 'snp130' or $dbtype1 eq 'snp131') {
1719 $expectedLength=18;
1720 @positionCols=(1,2,3);
1721 @scoreCols=();
1722 @colsToOutput=(4);
1723 } else {
1724 #other UCSC format if file is not defined above
1725 $expectedLength='';
1726 @positionCols=(1,2,3);
1727 @scoreCols=();
1728 @colsToOutput=(4);
1729 }
1730
1731 if ($scorecolumn) {
1732 @scoreCols = ($scorecolumn, $scorecolumn);
1733 }
1734
1735 open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
1736 printerr "NOTICE: Reading annotation database $dbfile ... ";
1737
1738 if ($expectedLength eq '') { # if DB is unknown "generic format" use first line to get $expectedLength : file rewound afterwards
1739 my $line = <DB>;
1740 @record = split (/\t/, $line);
1741 $expectedLength=@record;
1742 seek (DB, 0, 0);
1743 };
1744
1745 ########$$ Check to see if user has defined columns to output (intergers or all allowed)
1746 if (defined $colsWanted) {
1747 if ($colsWanted[0] eq 'all') {
1748 @colsToOutput= 0 .. ($expectedLength-1);
1749 } elsif ($colsWanted[0] eq 'none') {
1750 @colsToOutput = ();
1751 } else{
1752 @colsToOutput = @colsWanted;
1753 }
1754 };
1755
1756 ########$$ check that the columns requested exist in the current DB
1757 for my $i (0 .. @colsToOutput-1) {
1758 if ($colsToOutput[$i] > $expectedLength) {
1759 die "Error: The DB file $dbfile has only $expectedLength columns but output column $colsToOutput[$i] is requested by --colsWanted!\n";
1760 }
1761 }
1762
1763 while (<DB>) {
1764 $dbcount++;
1765 s/[\r\n]+$//; #deleting the newline characters
1766 @record = split (/\t/, $_);
1767
1768 @record == $expectedLength or die "Error: invalid record in dbfile $dbfile ($expectedLength fields expected): <$_>\n";
1769 ($chr, $start, $end) = @record[@positionCols];
1770 if (@colsToOutput) { #I think there should always be a Name in the output column
1771 $name = join (':', @record[@colsToOutput]);
1772 }
1773
1774 if(@scoreCols){
1775 ($score, $normscore)=(@record[@scoreCols])
1776 } else{
1777 ($score, $normscore) = qw/0 0/;
1778 }
1779
1780 #########$$ Unusual exceptions for phastCons
1781 if ($dbtype1 =~ m/^phastConsElements\d+way/) {
1782 $score =~ s/^lod=// or die "Error: invalid lod score designation (no 'lod=' found) in dbfile $dbfile: <$_>\n";
1783 } ##lod= in the score for conservation tracks
1784
1785 #########$$ Unusual exceptions for cytoBand
1786 if ($dbtype1 eq 'cytoBand' and not defined $colsWanted) { #the name for chromosome band is concatenated as single word
1787 $name =~ s/://;
1788 }
1789
1790 defined $score_threshold and $score < $score_threshold and next; #if --score_threshold is set, the low scoring segment will be skipped
1791 defined $normscore_threshold and $normscore < $normscore_threshold and next; #if --normscore_threshold is set, the low scoring segment will be skipped
1792
1793 $chr =~ s/^chr//;
1794 $start++; #due to the zero-opening coordinate system in UCSC
1795
1796 my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
1797 for my $nextbin ($bin1 .. $bin2) {
1798 if ($rawscore) { #print out rawscore, rather than normalized score (default)
1799 $normscore = $score;
1800 }
1801 if (defined $name) {
1802 push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore, $name];
1803 } else { #name is not requested in the output
1804 push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore];
1805 }
1806 }
1807 $regioncount++;
1808 if ($verbose and $dbcount =~ m/000000$/) {
1809 my ($availmem, $allmem) = currentAvailMemory();
1810 printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
1811 }
1812 }
1813 close (DB);
1814
1815 for my $key (keys %regiondb) { #pre-sort gene DB by txstart to faciliate future use
1816 @{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
1817 }
1818 printerr "Done with $regioncount regions";
1819 if (defined $score_threshold or $normscore_threshold) {
1820 printerr " (that passed --score_threhsold or --normscore_threshold from a total of $dbcount regions)\n";
1821 } else {
1822 printerr "\n";
1823 }
1824 return (\%regiondb);
1825 }
1826
1827
1828 sub translateDNA {
1829 my ($seq) = @_;
1830 my ($nt3, $protein);
1831 $seq = uc $seq;
1832 #length ($seq) % 3 == 0 or printerr "WARNING: length of DNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n";
1833 while ($seq =~ m/(...)/g) {
1834 defined $codon1{$1} or printerr "WARNING: invalid triplets found in DNA sequence to be translated: <$1>\n";
1835 $protein .= $codon1{$1};
1836 }
1837 return $protein;
1838 }
1839
1840 sub translateRNA {
1841 my ($seq) = @_;
1842 my ($nt3, $protein);
1843 $seq = uc $seq;
1844 #length ($seq) % 3 == 0 or printerr "WARNING: length of RNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n";
1845 while ($seq =~ m/(...)/g) {
1846 defined $codonr1{$1} or printerr "WARNING: invalid triplets found in RNA sequence to be translated: <$1>\n";
1847 $protein .= $codonr1{$1};
1848 }
1849 return $protein;
1850 }
1851
1852 sub revcom {
1853 my ($seq) = @_;
1854 $seq = reverse $seq;
1855 $seq =~ tr/acgtACGT/tgcaTGCA/;
1856 return ($seq);
1857 }
1858
1859 sub readSeqFromFASTADB {
1860 my ($refseqvar) = @_;
1861 my (%seqhash);
1862 my $seqdbfile;
1863
1864 #the four statements below should be condensed in the future (they are identical)
1865 $seqdbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1" . "Mrna.fa");
1866
1867 my ($seqid, $curseq) = ('', '');
1868
1869 -f $seqdbfile or die "Error: FASTA sequence file $seqdbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc' download the database.\n";
1870 open (SEQ, $seqdbfile) or die "Error: cannot read from seqdbfile $seqdbfile: $!\n";
1871 printerr "NOTICE: Reading FASTA sequences from $seqdbfile ... ";
1872 while (<SEQ>) {
1873 if (m/^>(\S+)/) {
1874 if ($refseqvar->{$seqid}) {
1875 not defined $seqhash{$seqid} and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it (unless the sequence is already read from the file)
1876 }
1877 $seqid = $1;
1878 $curseq = '';
1879 } else {
1880 if ($refseqvar->{$seqid}) {
1881 s/[\r\n]+$//;
1882 $curseq .= uc $_; #only use upper case characters
1883 }
1884 }
1885 }
1886 if ($refseqvar->{$seqid}) { #finish the last sequence in the file
1887 not defined $seqhash{$seqid} and $seqhash{$seqid} = $curseq;
1888 }
1889 close (SEQ);
1890 printerr "Done with ", scalar keys %seqhash, " sequences\n";
1891 if (keys %seqhash < keys %$refseqvar) {
1892 my (@seqnotfound, @seqnotfound_example);
1893 for $seqid (keys %$refseqvar) {
1894 exists $seqhash{$seqid} or push @seqnotfound, $seqid;
1895 }
1896 printerr "WARNING: A total of ${\(scalar @seqnotfound)} sequences cannot be found in $seqdbfile";
1897 @seqnotfound_example = splice (@seqnotfound, 0, 3);
1898 printerr " (example: @seqnotfound_example)\n";
1899 }
1900 return (\%seqhash);
1901 }
1902
1903 sub readKgXref {
1904 my ($inputfile) = @_;
1905 my (%gene_xref);
1906 open (XREF, $inputfile) or die "Error: cannot read from kgxref file $inputfile: $!\n";
1907 while (<XREF>) {
1908 m/^#/ and next;
1909 s/[\r\n]+$//;
1910 my @record = split (/\t/, $_);
1911 @record == 8 or die "Error: invalid record found in knownGene cross-reference file (6 fields expected): <$_>\n";
1912 #some genes were given names that are prefixed with "Em:" which should be removed due to the presence of ":" in exonic variant annotation
1913 #Em:AC006547.7 Em:AC005003.4 Em:U62317.15 Em:AC008101.5 Em:AC004997.11 Em:U51561.2
1914 $record[4] =~ s/^Em:/Em./;
1915 if ($gene_xref{$record[0]}) { #BC003168 occur twice in kgxref file (OSBPL10, BC003168)
1916 if ($gene_xref{$record[0]} =~ m/^(BC|AK)\d+$/) {
1917 $gene_xref{$record[0]} = $record[4];
1918 }
1919 } else {
1920 $gene_xref{$record[0]} = $record[4];
1921 }
1922 }
1923 close (XREF);
1924 return (\%gene_xref);
1925 }
1926
1927 sub readUCSCGeneAnnotation { #read RefGene annotation database from the UCSC Genome Browser, convert 0-based coordinates to 1-based coordinates
1928 my ($dbloc) = @_;
1929 my ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes);
1930 my (%genedb, %geneidmap, %name2count, %cdslen, %mrnalen);
1931 my ($genecount, $ncgenecount) = (0, 0);
1932
1933 my $dbfile;
1934 my $kgxref;
1935
1936 if ($dbtype1 eq 'refGene') {
1937 $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt");
1938 } elsif ($dbtype1 eq 'knownGene') {
1939 $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt");
1940 my $kgxreffile = File::Spec->catfile($dbloc, $buildver . "_kgXref.txt");
1941 -f $kgxreffile or die "Error: the knownGene cross-reference file $kgxreffile does not exist. Please use 'annotate_variation.pl --downdb knownGene $dbloc' to download the database.\n";
1942 $kgxref = readKgXref ($kgxreffile);
1943 } elsif ($dbtype1 eq 'ensGene') {
1944 $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt");
1945 } else {
1946 $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); #added 2011feb18
1947 #die "FATAL ERROR: the dbype $dbtype1 is not supported in the readUCSCGeneAnnotation() subroutine.\n"; #commented 2011feb18
1948 }
1949 -f $dbfile or die "Error: The gene annotation database $dbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc -build $buildver' to download the database.\n";
1950
1951 open (GENEDB, $dbfile) or die "Error: cannot read from gene annotaion database $dbfile: $!\n";
1952 printerr "NOTICE: Reading gene annotation from $dbfile ... ";
1953 while (<GENEDB>) {
1954 s/[\r\n]+$//; #deleting the newline characters
1955 my @record = split (/\t/, $_);
1956
1957 if ($dbtype1 eq 'refGene') {
1958 @record == 16 or die "Error: invalid record in $dbfile (expecting 16 tab-delimited fields in refGene file): <$_>\n";
1959 ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15]; #human hg18, mouse
1960 } elsif ($dbtype1 eq 'knownGene') {
1961 @record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in knownGene file): <$_>\n"; #mm8=11, hg18=hg19=12
1962 ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend) = @record[0..9];
1963 $name2 = $kgxref->{$name} || $name;
1964 } elsif ($dbtype1 eq 'ensGene') {
1965 @record == 16 or die "Error: invalid record in $dbfile (expecting 16 fields in ensGene file): <$_>\n";
1966 ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15];
1967 } else {
1968 @record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in $dbtype1 gene definition file): <$_>\n";
1969 ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15];
1970 defined $name2 or $name2=$name;
1971 #die "FATAL ERROR: the --dbtype $dbtype is not supported in readUCSCGeneAnnotation() subroutine.\n"; #commented 2011feb18
1972 }
1973
1974 #handle situations where the same transcript is mapped to several chromosomes or regions (for example, NM_019105 is mapped to chr6, chr6_cox_hap1, chr6_qbl_hap2; NM_002538 is mapped to chr5 positive and negative strand and also in chr5_h2_hap1)
1975 if ($chr =~ m/hap\d+$/) {
1976 next; #this is a temporary solution on 2011feb19, to ignore alternative haplotype chromosomes
1977 }
1978
1979 $chr =~ s/^chr// or die "Error: invalid record found in $dbfile (chrom field not found): <$_>\n"; #UCSC always prefix "chr" to the chromosome identifier, so this is a good check to make sure that the file is the correct file
1980 $dbstrand eq '+' or $dbstrand eq '-' or die "Error: invalid dbstrand information found in $dbfile (dbstrand has to be + or -): <$_>\n"; #dbstrand is important to know and cannot be optional
1981 my @exonstart = split (/,/, $exonstart); #remove trailing comma
1982 my @exonend = split (/,/, $exonend); #remove trailing comma
1983 $exoncount == @exonstart or die "Error: invalid record found in $dbfile (exoncount discordance): <$exoncount vs ${\(scalar @exonstart)}>\n";
1984 @exonstart == @exonend or die "Error: invalid record found in $dbfile (exonstart and exonend count discordance): <${\(scalar @exonstart)} vs ${\(scalar @exonend)}>\n";
1985 $txstart++; $cdsstart++; map {$_++} @exonstart; #convert 0-based coordinate to 1-based coordinate
1986
1987 #LOGIC here:
1988 #first calcluate mRNA length, and if the transcript maps to multiple locations with discordant mRNA length, only consider the leftmost chromosome and leftmost coordinate (because the FASTA file is sorted in this manner)
1989
1990 my $cdslength = 0;
1991 my $mrnalength = 0;
1992 for my $i (0 .. @exonstart-1) { #this calculation is valid regardless of strand
1993 $mrnalength += $exonend[$i]-$exonstart[$i]+1;
1994 if ($cdsstart >= $exonstart[$i] and $cdsstart <= $exonend[$i]) {
1995 if ($cdsend <= $exonend[$i]) {
1996 $cdslength = $cdsend-$cdsstart+1;
1997 last;
1998 } else {
1999 $cdslength += $exonend[$i]-$cdsstart+1;
2000 next;
2001 }
2002 }
2003 if ($cdslength and $cdsend < $exonstart[$i]) {
2004 die "FATAL ERROR: impossible scenario for $name in $dbfile (cdsend is less than exon start)";
2005 } elsif ($cdslength and $cdsend <= $exonend[$i]) {
2006 $cdslength += $cdsend-$exonstart[$i]+1;
2007 last;
2008 } elsif ($cdslength and $cdsend > $exonend[$i]) {
2009 $cdslength += $exonend[$i]-$exonstart[$i]+1;
2010 }
2011
2012 }
2013
2014 if ($cdsstart != $cdsend+1) { #coding gene
2015 if (defined $mrnalen{$name} and $mrnalen{$name} != $mrnalength) {
2016 $verbose and printerr "WARNING: $name occurs more than once in $dbfile with different mRNA length. The first occurences with identical mRNA length will be uesd in analysis.\n";
2017 next;
2018 }
2019
2020
2021 if (defined $cdslen{$name} and $cdslen{$name} != $cdslength) {
2022 $verbose and printerr "WARNING: $name occurs more than once in $dbfile with different CDS length. The first occurences with identical mRNA length will be uesd in analysis.\n";
2023 next;
2024 }
2025 }
2026
2027 $cdslen{$name} = $cdslength;
2028 $mrnalen{$name} = $mrnalength;
2029
2030 my ($bin1, $bin2) = (int(($txstart - $neargene)/$genomebinsize), int(($txend + $neargene)/$genomebinsize));
2031 for my $nextbin ($bin1 .. $bin2) {
2032 push @{$genedb{$chr, $nextbin}}, [$name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, [@exonstart], [@exonend], $name2];
2033 }
2034 $geneidmap{$name} = $name2;
2035 $genecount++;
2036 $name2count{$name2}++;
2037 $cdsstart == $cdsend+1 and $ncgenecount++; #non-coding gene has the same start and end site
2038 }
2039 close (GENEDB);
2040 for my $key (keys %genedb) { #pre-sort gene DB by txstart to faciliate future use
2041 @{$genedb{$key}} = sort {$a->[2] <=> $b->[2]} @{$genedb{$key}};
2042 }
2043 printerr "Done with $genecount transcripts (including $ncgenecount without coding sequence annotation) for ", scalar (keys %name2count), " unique genes\n";
2044 return (\%genedb, \%geneidmap, \%cdslen, \%mrnalen);
2045 }
2046
2047 sub downloadDB {
2048 my ($cwd, $msg, $sc);
2049
2050 $cwd = Cwd::cwd();
2051
2052 -w $dbloc or die "Error: the directory $dbloc is not writable by the current user\n";
2053 chdir ($dbloc) or die "Error: the directory $dbloc cannot be accessed\n";
2054
2055 my (@urlin, @filein, @fileout, %fail); #the fail hash contains index of files that fail to be downloaded
2056 my $count_success;
2057 if ($dbtype1 eq 'refGene') {
2058 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refGene.txt.gz";
2059 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refLink.txt.gz";
2060 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_refGeneMrna.fa.gz";
2061 } elsif ($dbtype1 eq 'knownGene') {
2062 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/knownGene.txt.gz";
2063 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/kgXref.txt.gz";
2064 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_knownGeneMrna.fa.gz";
2065 } elsif ($dbtype1 eq 'ensGene') {
2066 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/ensGene.txt.gz";
2067 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_ensGeneMrna.fa.gz";
2068 } elsif ($dbtype1 eq 'seq') {
2069 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.zip"; #example: hg18, hg19
2070 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.tar.gz"; #example: panTro2
2071 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/$buildver.fa.gz"; #example: bosTau4
2072 } elsif ($dbtype1 =~ m/^mce(\d+way)$/) { #it could be 17 way, 28 way, 30 way, 44 way, etc, depending on genome and on build
2073 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/phastConsElements$1.txt.gz";
2074 } elsif ($dbtype1 eq 'avsift') {
2075 $buildver eq 'hg18' or $buildver eq 'hg19' or die "Error: currently the --dbtype of avsift only support --buildver of 'hg18' or 'hg19'\n";
2076 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_avsift.txt.gz";
2077 } elsif ($dbtype1 eq '1000g') { #dbtype1 is same as queryfile, when --downdb is used
2078 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g' only support --buildver of 'hg18'\n";
2079 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/CEU.sites.2009_04.gz";
2080 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/YRI.sites.2009_04.gz";
2081 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/JPTCHB.sites.2009_04.gz";
2082 } elsif ($dbtype1 eq '1000g2010') { #dbtype1 is same as queryfile, when --downdb is used
2083 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010' only support --buildver of 'hg18'\n";
2084 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_03.txt.gz";
2085 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_03.txt.gz";
2086 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_03.txt.gz";
2087 } elsif ($dbtype1 eq '1000g2010jul') { #dbtype1 is same as queryfile, when --downdb is used
2088 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010jul' only support --buildver of 'hg18'\n";
2089 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_07.txt.gz";
2090 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_07.txt.gz";
2091 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_07.txt.gz";
2092 } elsif ($dbtype1 eq '1000g2010nov') {
2093 $buildver eq 'hg19' or die "Error: currently the --dbtype of '1000g2010nov' only support --buildver of 'hg19'\n";
2094 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg19_ALL.sites.2010_11.txt.gz";
2095 } elsif ($dbtype1 eq 'null') {
2096 1;
2097 } else {
2098 if ($webfrom) {
2099 if ($webfrom eq 'annovar') {
2100 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_$dbtype1.txt.gz";
2101 } elsif ($webfrom eq 'ucsc') {
2102 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz";
2103 } else {
2104 push @urlin, "$webfrom/$dbtype1.txt.gz";
2105 }
2106 } else {
2107 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz"; #default goes to UCSC
2108 }
2109 }
2110
2111 @filein = @urlin;
2112 map {s/.+\///} @filein;
2113 @fileout = @filein;
2114 map {s/\.gz$//; s/\.zip$//} @fileout;
2115
2116 if ($wget) {
2117 $msg = qx/wget --help 2>&1/ || ''; #collect the output of the system command
2118 } else {
2119 $msg = ''; #when --nowget is specified, do not use wget to retrieve files from Internet
2120 }
2121 if ($msg =~ m/Usage/) {
2122 checkProgramUpdate ("wget");
2123 for my $i (0 .. @urlin-1) {
2124 printerr "NOTICE: Downloading annotation database $urlin[$i] ... ";
2125 if ($verbose) {
2126 $sc = "wget -t 1 -T 10 -O $filein[$i] $urlin[$i]";
2127 } else {
2128 $sc = "wget -t 1 -T 10 -q -O $filein[$i] $urlin[$i]";
2129 }
2130 if (system ($sc)) { #time-out is 10 seconds, with 1 retry attempt
2131 printerr "Failed\n";
2132 $verbose and print "WARNING: unable to execute system command: <$sc>\n";
2133 unlink ($filein[$i]); #delete the temporary files generated by wget
2134 $fail{$i}++;
2135 } else {
2136 printerr "OK\n";
2137 $count_success++;
2138 }
2139 }
2140 } else {
2141 eval {
2142 require Net::FTP;
2143 require LWP::UserAgent;
2144 };
2145 if ($@) {
2146 printerr "WARNING: cannot retrieve remote files automatically (by 'wget' command or by standard Net::FTP/LWP::UserAgent Perl module).\n";
2147 printerr "Please manually download the following file, uncompress the files to $dbloc directory, then add a ${buildver}_ prefix to the file names.\n";
2148 printerr join ("\n", @urlin), "\n";
2149 exit (100);
2150 }
2151
2152 checkProgramUpdate ("lwp");
2153 my ($http, $ftp);
2154 for my $i (0 .. @urlin-1) {
2155 printerr "NOTICE: Downloading annotation database $urlin[$i] ... ";
2156 if ($urlin[$i] =~ m/^http/) {
2157 $http = LWP::UserAgent->new (timeout=>10, show_progress=>$verbose);
2158 $http->env_proxy;
2159
2160 my $response = $http->get ($urlin[$i], ':content_file'=>$filein[$i]);
2161 if ($response->is_success) {
2162 printerr "Done\n";
2163 $count_success++;
2164 } else {
2165 printerr "Failed\n";
2166 $verbose and printerr "WARNING: cannot retrieve remote files ($urlin[$i]) via LWP::UserAgent Perl module: ", $response->status_line, "\n";
2167 $fail{$i}++;
2168 }
2169 } elsif ($urlin[$i] =~ m#^ftp://([^\\\/]+)#) { #for hgdownload.cse.ucsc.edu, ftp-trace.ncbi.nih.gov, ftp.ensembl.org, etc
2170 my $urlroot = $1;
2171 if ($ftp = Net::FTP->new($urlroot, Timeout=>10, Debug=>$verbose)) {
2172 $ftp->login("anonymous", 'anonymous@');
2173 $ftp->binary();
2174 my $url = $urlin[$i];
2175 $url =~ s#ftp://[\w\.\-]+/##; #remove the URL root
2176 if (not $ftp->get($url)) {
2177 printerr "Failed\n";
2178 $verbose and printerr "WARNING: cannot retrieve remote file ($url) in FTP server $urlroot\n";
2179 $fail{$i}++;
2180 } else {
2181 printerr "Done\n";
2182 $count_success++;
2183 }
2184 } else {
2185 printerr "Failed\n";
2186 $verbose and printerr "WARNING: cannot retrieve remote file ($urlin[$i]) via Net::FTP Perl module\n";
2187 $fail{$i}++;
2188 }
2189
2190 } else {
2191 die "Error: The URL $urlin[$i] uses an unsupported protocol. Download cannot continue\n";
2192 }
2193 }
2194 }
2195
2196 $count_success and printerr "NOTICE: Uncompressing downloaded files\n";
2197 for my $i (0 .. @filein-1) {
2198 $fail{$i} and next;
2199 if ($filein[$i] =~ m/\.zip$/) {
2200 $msg = qx/unzip --help 2>&1/ || ''; #collect the output of the system command
2201 if ($msg =~ m/Usage/i) {
2202 if ($verbose) {
2203 system ("unzip -o $filein[$i]");
2204 } else {
2205 system ("unzip -o -q $filein[$i]");
2206 }
2207 } else {
2208 printerr "ERROR: unzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
2209 exit (101);
2210 }
2211 } elsif ($filein[$i] =~ m/\.tar\.gz$/) { #panTro2 FASTA sequence is stored as tar.gz rather than zip
2212 $msg = qx/tar --help 2>&1/ || ''; #collect the output of the system command
2213 if ($msg =~ m/Usage/i) {
2214 system ("tar -x -z -f $filein[$i]");
2215 } else {
2216 printerr "ERROR: tar/gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
2217 exit (102);
2218 }
2219 } elsif ($filein[$i] =~ m/\.gz$/) {
2220 $msg = qx/gunzip --help 2>&1/ || ''; #collect the output of the system command
2221 if ($msg =~ m/Usage/i) {
2222 system ("gunzip -f $filein[$i]");
2223 } else {
2224 printerr "ERROR: gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
2225 exit (103);
2226 }
2227 }
2228 }
2229
2230 for my $i (0 .. @fileout-1) {
2231 $fail{$i} and next; #skip the file that failed to be downloaded
2232 my $fileout = $fileout[$i];
2233 $dbtype1 eq 'seq' and next; #the zip file contains dozens of FASTA files so cannot rename them automatically
2234 if (not $fileout =~ m/^${buildver}_/) { #if the buildver is not the prefix of the files
2235 rename ($fileout, "${buildver}_$fileout") or die "Error: cannot rename $fileout to ${buildver}_$fileout\n";
2236 $fileout = "${buildver}_$fileout";
2237 }
2238 if (not $fileout =~ m/\.txt$/ and not $fileout =~ m/\.fa$/) {
2239 rename ($fileout, "$fileout.txt");
2240 }
2241 }
2242
2243 $count_success and printerr "NOTICE: Finished downloading annotation files for $buildver build version, with files saved at the '$dbloc' directory\n";
2244 $cwd and chdir ($cwd);
2245 if (%fail) {
2246 my @failindex = keys %fail;
2247 if ($dbtype1 eq 'seq' and @failindex == 1) { #not really a fail, because for seq, ANNOVAR attempts on tar.gz and zip file
2248 1;
2249 } else {
2250 printerr "WARNING: Some files cannot be downloaded, including ", join (', ', @urlin[@failindex]), "\n";
2251 }
2252
2253 for my $index (@failindex) {
2254 if ($urlin[$index] =~ m#^http://www\.openbioinformatics\.org.+Mrna.fa.gz$#) {
2255 printerr "---------------------------ADDITIONAL PROCEDURE---------------------------\n";
2256 printerr "--------------------------------------------------------------------------\n";
2257 printerr "NOTICE: the FASTA file $urlin[$index] is not available to download but can be generated by the ANNOVAR software. ";
2258 printerr "PLEASE RUN THE FOLLOWING TWO COMMANDS CONSECUTIVELY TO GENERATE THE FASTA FILES:\n\n";
2259 printerr "\tannotate_variation.pl --buildver $buildver --downdb seq $dbloc/${buildver}_seq\n";
2260 printerr "\tretrieve_seq_from_fasta.pl $dbloc/${buildver}_$dbtype1.txt -seqdir $dbloc/${buildver}_seq -format $dbtype1 -outfile $dbloc/${buildver}_${dbtype1}Mrna.fa\n";
2261 printerr "--------------------------------------------------------------------------\n";
2262 printerr "--------------------------------------------------------------------------\n";
2263 }
2264 }
2265 }
2266 }
2267
2268 sub currentAvailMemory {
2269 my ($availmem, $allmem) = (0, 0);
2270 if ($^O eq "MSWin32") { #no easy solution to get available memory from Windows.
2271 ($availmem, $allmem) = (0, 0);
2272 } elsif ($^O eq 'linux' or $^O eq 'aix' or $^O eq 'solaris') {
2273 if (open (TOP, "top -b -n 1 2>&1 |")) {
2274 my $index;
2275 while (<TOP>) {
2276 if (m/^Mem:.+\s(\d+)k free/) {
2277 $availmem = $1;
2278 }
2279 s/^\s+//;
2280 my @field = split (/\s+/, $_);
2281 @field >= 10 or next; #make sure that the PID lines are reached
2282 if ($field[0] eq 'PID') {
2283 for my $i (0 .. @field-1) {
2284 $field[$i] eq 'RES' and $index = $i;
2285 }
2286 }
2287 if ($field[0] eq $$) {
2288 defined $index or die "Error: invalid output from top command: the line with PID and RES is not found\n";
2289 $allmem = $field[$index];
2290 if ($allmem =~ m/^([\d\.]+)(\w)$/) {
2291 if ($2 eq 'g') {
2292 $allmem = $1 * 1_000_000;
2293 } elsif ($2 eq 'm') {
2294 $allmem = $1 * 1_000;
2295 } elsif ($2 eq 'k') {
2296 $allmem = $1;
2297 } else {
2298 printerr "WARNING: unrecognizable output from top command: <$_>\n";
2299 }
2300 }
2301 last;
2302 }
2303 }
2304 }
2305 } else {
2306 ($availmem, $allmem) = (0, 0);
2307 }
2308 return ($availmem, $allmem);
2309 }
2310
2311 sub printerr {
2312 print STDERR @_;
2313 print LOG @_;
2314 }
2315
2316 sub checkProgramUpdate {
2317 my ($method) = @_;
2318 my $sc;
2319 my ($curdate, $webdate, $webdate1) = $LAST_CHANGED_DATE;
2320 my (@webcontent);
2321 $method eq 'wget' or $method eq 'lwp' or die "Error: update checking method can be only 'wget' or 'lwp'";
2322 printerr "NOTICE: Web-based checking to see whether ANNOVAR new version is available ... ";
2323 $LAST_CHANGED_DATE =~ m/LastChangedDate: (\d+)\-(\d+)-(\d+)/ or printerr "Failed\n" and return;
2324 $curdate = $1.$2.$3;
2325 if ($method eq 'wget') {
2326 $sc = "wget -t 1 -T 10 -q -O .annovar_date http://www.openbioinformatics.org/annovar/download/annovar_date";
2327 if (system ($sc)) {
2328 printerr "Failed\n";
2329 return;
2330 } else {
2331 if (not open (AVDATE, ".annovar_date")) {
2332 printerr "Cannot access version information\n";
2333 } else {
2334 printerr "Done\n";
2335 @webcontent = <AVDATE>; #$LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';
2336 close (AVDATE);
2337 unlink (".annovar_date");
2338 }
2339 }
2340 } elsif ($method eq 'lwp') {
2341 my $http = LWP::UserAgent->new (timeout=>10);
2342 $http->env_proxy;
2343 my $response = $http->get("http://www.openbioinformatics.org/annovar/download/annovar_date");
2344 if ($response->is_success) {
2345 printerr "Done\n";
2346 $_ = $response->decoded_content;
2347 @webcontent = split (/\n/, $_);
2348 } else {
2349 printerr "Failed\n";
2350 return;
2351 }
2352 }
2353
2354 $webdate = $webcontent[0];
2355 $webdate =~ s/[\r\n]+$//;
2356 $webdate1 = $webdate;
2357 $webdate1 =~ s/\-//g; #remove the - sign in webdate
2358 if ($curdate < $webdate1) {
2359 printerr "----------------------------UPDATE AVAILABLE------------------------------\n";
2360 printerr "--------------------------------------------------------------------------\n";
2361 printerr "WARNING: A new version of ANNOVAR (dated $webdate) is available!\n";
2362 printerr " Download from http://www.openbioinformatics.org/annovar/\n";
2363
2364 if (@webcontent >= 2) {
2365 printerr "Changes made in the $webdate version:\n";
2366 for my $i (1 .. @webcontent-1) {
2367 if ($webcontent[$i] =~ m/^(\d{4})\-(\d{2})\-(\d{2})[\r\n]+$/) {
2368 $webdate = "$1-$2-$3";
2369 $webdate1 = "$1$2$3";
2370 if ($curdate >= $webdate1) { #the current version is more recent than this date
2371 last;
2372 } else {
2373 printerr "Changes made in the $webdate version:\n";
2374 }
2375 } else {
2376 printerr " * $webcontent[$i]";
2377 }
2378 }
2379 }
2380 printerr "--------------------------------------------------------------------------\n";
2381 printerr "--------------------------------------------------------------------------\n";
2382 }
2383 }
2384
2385 =head1 SYNOPSIS
2386
2387 annotate_variation.pl [arguments] <query-file|table-name> <database-location>
2388
2389 Optional arguments:
2390 -h, --help print help message
2391 -m, --man print complete documentation
2392 -v, --verbose use verbose output
2393
2394 Arguments to download databases or perform annotations
2395 --downdb download UCSC Genome Browser annotation database
2396 --geneanno annotate variants by functional consequences on genes
2397 --regionanno annotate variants by targetting specific genomics regions
2398 --filter filter variants based on a position list
2399 --webfrom <string> specify the source of database (default usually works fine)
2400
2401 Arguments to control input and output files
2402 --outfile <file> output file prefix
2403 --zerostart input query file uses half-open zero-start coordinate
2404 --dbtype <string> database type
2405 --buildver <string> genome build version (default: hg18 for human)
2406 --gff3dbfile <file> specify the GFF3 DB file used in region-based annotation
2407 --genericdbfile <file> specify the generic DB file used in filter-based annotation
2408 --vcfdbfile <file> specify the DB file in VCF format in filter-based annotation
2409 --bedfile <file> specify a BED file in region-based annotation
2410 --time print out local time during program run
2411 --separate separately print out all function of a variant (default: one line per variant)
2412 --colsWanted <string> specify which columns to output in -regionanno by comma-delimited numbers
2413 --comment print out comment line (those starting with #) in output files
2414 --scorecolumn <int> the column with scores in database file (for region-based annotation)
2415 --exonsort sort the exon number in output line (for gene-based annotation)
2416 --transcript_function use transcript name rather than gene name in gene-based annotation output
2417
2418 Arguments to fine-tune the annotation procedure
2419 --batchsize <int> batch size for processing variants per batch (default: 5m)
2420 --genomebinsize <int> bin size to speed up search (default: 100k for -geneanno, 10k for -regionanno)
2421 --expandbin <int> check nearby bin to find neighboring genes (default: 2m/genomebinsize)
2422 --neargene <int> distance threshold to define upstream/downstream of a gene
2423 --score_threshold <float> minimum score of DB regions to use in annotation
2424 --normscore_threshold <float> minimum normalized score of DB regions to use in annotation
2425 --rawscore output includes the raw score (not normalized score) in UCSC Browser Track
2426 --minqueryfrac <float> minimum percentage of query overlap to define match to DB (default: 0)
2427 --splicing_threshold <int> distance between splicing variants and exon/intron boundary (default: 2)
2428 --maf_threshold <float> filter 1000G variants with MAF above this threshold (default: 0)
2429 --sift_threshold <float> SIFT threshold for deleterious prediction (default: 0.05)
2430 --precedence <string> comma-delimited to specify precedence of variant function (default: exonic>intronic...)
2431
2432 Arguments to control memory usage
2433 --memfree <int> ensure minimum amount of free system memory (default: 100000, in the order of kb)
2434 --memtotal <int> limit total amount of memory used by ANNOVAR (default: 0, unlimited, in the order of kb)
2435 --chromosome <string> examine these specific chromosomes in database file
2436
2437
2438 Function: annotate a list of genetic variants against genome annotation
2439 databases saved at local disk.
2440
2441 Example: #download gene annotation database (for hg18 build) and save to humandb/ directory
2442 annotate_variation.pl -downdb gene humandb/
2443 annotate_variation.pl -buildver mm9 -downdb mce30way mousedb/
2444 annotate_variation.pl -downdb snp130 humandb/
2445
2446 #gene-based annotation of variants in the varlist file (by default --geneanno is ON)
2447 annotate_variation.pl ex1.human humandb/
2448
2449 #region-based annotate variants
2450 annotate_variation.pl -regionanno -dbtype mce44way ex1.human humandb/
2451 annotate_variation.pl -regionanno -dbtype gff3 -gff3dbfile tfbs.gff3 ex1.human humandb/
2452
2453 #filter rare or unreported variants (in 1000G/dbSNP) or predicted deleterious variants
2454 annotate_variation.pl -filter -dbtype 1000g_ceu -maf 0.01 ex1.human humandb/
2455 annotate_variation.pl -filter -dbtype snp130 ex1.human humandb/
2456 annotate_variation.pl -filter -dbtype avsift ex1.human humandb/
2457
2458 Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $
2459
2460 =head1 OPTIONS
2461
2462 =over 8
2463
2464 =item B<--help>
2465
2466 print a brief usage message and detailed explanation of options.
2467
2468 =item B<--man>
2469
2470 print the complete manual of the program.
2471
2472 =item B<--verbose>
2473
2474 use verbose output.
2475
2476 =item B<--downdb>
2477
2478 download annotation databases from UCSC Genome Browser, Ensembl, 1000 Genomes
2479 Project or other resources. The annotation files in this database are required
2480 for the functional annotation of variants.
2481
2482 =item B<--geneanno>
2483
2484 perform gene-based annotation. For each variant, examine whether it hit exon,
2485 intron, intergenic region, or close to a transcript, or hit a non-coding RNA
2486 gene, or is located in a untranslated region. In addition, for an exonic variant,
2487 determine whether it causes splicing change, non-synonymous amino acid change,
2488 synonymous amino acid change or frameshift changes.
2489
2490 =item B<--regionanno>
2491
2492 perform region-based annotation. For each variant, examine whether it overlaps
2493 with a specific genomic region, such as the most conserved elements, the
2494 predicted transcription factor binding sites, the specific cytogeneic bands, the
2495 evolutionarily conserved RNA secondary structures and so on.
2496
2497 =item B<--filter>
2498
2499 perform filter-based annotation. For each variants, filter it against a
2500 variation database, such as the 1000 Genomes Project database and the dbSNP
2501 database, and identify a subset that have not been reported in these databases
2502 as novel variants.
2503
2504 =item B<--outfile>
2505
2506 specify the output file prefix. Several output files will be generated using
2507 this prefix and different suffixes. A directory name can also be specified as
2508 part of the argument, so that the output files can be written to a different
2509 directory than the current directory.
2510
2511 =item B<--zerostart>
2512
2513 utilize the half-open zero-start coordinate system that is used by many UCSC
2514 Genome Browser annotation tables. By default, the 1-based coordinate system will
2515 be used.
2516
2517 =item B<--dbtype>
2518
2519 specify the database type to be used in gene-based, region-based or filter-based
2520 annotations. For gene-based annotation, by default refGene annotations from the
2521 UCSC Genome Browser will be used for annotating variants. However, users can
2522 switch to utilize Ensembl annotations instead, or use the UCSC Gene annotations.
2523 In general, RefSeq gene annotations are more conservative, and UCSC Gene
2524 annotations are most liberal with many predicted genes and non-coding RNAs. For
2525 region-based annotations, users can select any UCSC annotation databases (by
2526 providing the database name), or alternatively select a Generic Feature Format
2527 version 3 (GFF3) formatted file for annotation (by providing 'gff3' as the --
2528 dbtype and providing the --gff3dbfile argument). For filter-based annotations,
2529 users can select a dbSNP file, a 1000G file, a generic format file (with simple
2530 columns including chr, start, end, reference, observed, score), a VCF format
2531 (which is the current popular format for variants exchange), or a avsift format
2532 (which is identital to the generic format but is provided for convenience).
2533
2534 =item B<--buildver>
2535
2536 genome build version to use. By default, the hg18 build for human genome is
2537 used. The build version will be used by ANNOVAR to identify corresponding database files
2538 automatically, for example, when gene-based annotation is used for hg18 build,
2539 ANNOVAR will search for the hg18_refGene.txt file, but if the hg19 is used as --
2540 buildver, ANNOVAR will examine hg19_refGene.txt instead.
2541
2542 =item B<--gff3dbfile>
2543
2544 specify the GFF3-formatted database file used in the region-based annotation.
2545
2546 =item B<--genericdbfile>
2547
2548 specify the generic format database file used in the filter-based annotation.
2549
2550 =item B<--vcfdbfile>
2551
2552 specify the database file in VCF format in the filter-based annotation. VCF has
2553 been a popular format for summarizing SNP and indel calls in a population of
2554 samples, and has been adopted by 1000 Genomes Project in their most recent data
2555 release.
2556
2557 =item B<--time>
2558
2559 print out the local time during execution of the program
2560
2561 =item B<--separate>
2562
2563 for gene-based annotation, separate the effects of each variant, so that each
2564 effect (intronic, exonic, splicing) is printed in one output line. By default,
2565 all effects are printed in the same line, in the comma-separated form of
2566 'UTR3,UTR5' or 'exonic,splicing'.
2567
2568 =item B<--colsWanted>
2569
2570 specify which columns are desired in the output for -regionanno. By default,
2571 ANNOVAR inteligently selects the columns based on the DB type. However, users
2572 can use a list of comma-delimited numbers, or use 'all', or use 'none', to
2573 request custom output columns.
2574
2575 =item B<--comment>
2576
2577 specify that the program should include comment lines in the output files.
2578 Comment lines are defined as any line starting with #. By default, these lines
2579 are not recognized as valid ANNOVAR input and are therefore written to the
2580 INVALID_INPUT file. This argument can be very useful to keep columns headers in
2581 the output file, if the input file use comment line to flag the column headers
2582 (usually the first line in the input file).
2583
2584 =item B<--scorecolumn>
2585
2586 specify the the column with desired output scores in UCSC database file (for
2587 region-based annotation). The default usually works okay.
2588
2589 =item B<--exonsort>
2590
2591 sort the exon number in output line in the exonic_variant_function file during
2592 gene-based annotation. If a mutation affects multiple transcripts, the ones with
2593 the smaller exon number will be printed before the transcript with larger exon
2594 number in the output.
2595
2596 =item B<--batchsize>
2597
2598 this argument specifies the batch size for processing variants by gene-based
2599 annotation. Normally 5 million variants (usually one human genome will have
2600 about 3-5 million variants depending on ethnicity) are annotated as a batch, to
2601 reduce the amounts of memory. The users can adjust the parameters: larger values
2602 make the program slightly faster, at the expense of slightly larger memory
2603 requirements. In a 64bit computer, the default settings usually take 1GB memory
2604 for gene-based annotation for human genome for a typical query file, but this
2605 depends on the complexity of the query (note that the query has a few required
2606 fields, but may have many optional fields and those fields need to be read and
2607 kept in memory).
2608
2609 =item B<--genomebinsize>
2610
2611 the bin size of genome to speed up search. By default 100kb is used for gene-
2612 based annotation, so that variant annotation focused on specific bins only
2613 (based on the start-end site of a given variant), rather than searching the
2614 entire chromosomes for each variant. By default 10kb is used for region-based
2615 annotation. The filter-based annotations look for variants directly so no bin is
2616 used.
2617
2618 =item B<--expandbin>
2619
2620 expand bin to both sides to find neighboring genes/regions. For gene-based
2621 annotation, ANNOVAR tries to find nearby genes for any intergenic variant, with
2622 a maximum number of nearby bins to search. By default, ANNOVAR will
2623 automatically set this argument to search 2 megabases to the left and right of
2624 the variant in genome.
2625
2626 =item B<--neargene>
2627
2628 the distance threshold to define whether a variant is in the upstream or
2629 downstream region of a gene. By default 1 kilobase from the start or end site of
2630 a transcript is defined as upstream or downstream, respectively. This is useful,
2631 for example, when one wants to identify variants that are located in the
2632 promoter regions of genes across the genome.
2633
2634 =item B<--score_threshold>
2635
2636 the minimum score to consider when examining region-based annotations on UCSC
2637 Genome Browser tables. Some tables do not have such scores and this argument
2638 will not be effective.
2639
2640 =item B<--normscore_threshold>
2641
2642 the minimum normalized score to consider when examining region-based annotations
2643 on UCSC Genome Browser tables. The normalized score is calculated by UCSC,
2644 ranging from 0 to 1000, to make visualization easier. Some tables do not have
2645 such scores and this argument will not be effective.
2646
2647 =item B<--rawscore>
2648
2649 for region-based annotation, print out raw scores from UCSC Genome Browser
2650 tables, rather than normalized scores. By default, normalized scores are printed
2651 in the output files. Normalized scores are compiled by UCSC Genome Browser for
2652 each track, and they usually range from 0 to 1000, but there are some
2653 exceptions.
2654
2655 =item B<--minqueryfrac>
2656
2657 The minimum fraction of overlap between a query and a database record to decide
2658 on their match. By default, any overlap is regarded as a match, but this may not
2659 work best when query consist of large copy number variants.
2660
2661 =item B<--splicing_threshold>
2662
2663 distance between splicing variants and exon/intron boundary, to claim that a
2664 variant is a splicing variant. By default, 2bp is used. ANNOVAR is relatively
2665 more stringent than some other software to claim variant as regulating splicing.
2666 In addition, if a variant is an exonic variant, it will not be reported as
2667 splicing variant even if it is within 2bp to an exon/intron boundary.
2668
2669 =item B<--maf_threshold>
2670
2671 the minor allele frequency (MAF) threshold to be used in the filter-based
2672 annotation for the 1000 Genomes Project databases. By default, any variant
2673 annotated in the 1000G will be used in filtering.
2674
2675 =item B<--memfree>
2676
2677 the minimum amount of free system memory that ANNOVAR should ensure to have. By
2678 default, if ANNOVAR takes too much memory such that only 100Mb system memory is
2679 available, ANNOVAR will stop reading annotation database into memory, and will
2680 start annotation procedure, and then clear the memory, and then read the next
2681 block of annotation database again. This argument ensures that ANNOVAR will not
2682 attempt to use virtual memory in the system (which makes ANNOVAR extremely
2683 slow).
2684
2685 =item B<--memtotal>
2686
2687 the total amount of memory that ANNOVAR should use at most. By default, this
2688 value is zero, meaning that there is no limit on that. Decreasing this threshold
2689 reduce the memory requirement by ANNOVAR, but may increase the execution time.
2690
2691 =item B<--chromosome>
2692
2693 examine these specific chromosomes in database file. The argument takes comma-
2694 delimited values, and the dash can be correctly recognized. For example, 5-10,X
2695 represent chromosome 5 through chromosome 10 plus chromosome X.
2696
2697 =back
2698
2699 =head1 DESCRIPTION
2700
2701 ANNOVAR is a software tool that can be used to functionally annotate a list of genetic variants,
2702 possibly generated from next-generation sequencing experiments. For example,
2703 given a whole-genome resequencing data set for a human with specific diseases,
2704 typically around 3 million SNPs and around half million insertions/deletions
2705 will be identified. Given this massive amounts of data (and candidate disease-
2706 causing variants), it is necessary to have a fast algorithm that scans the data
2707 and identify a prioritized subset of variants that are most likely functional
2708 for follow-up Sanger sequencing studies and functional assays.
2709
2710 Currently, these various types of functional annotations produced by ANNOVAR can
2711 be (1) gene-based annotations (the default behavior), such as exonic variants,
2712 intronic variants, intergenic variants, downstream variants, UTR variants,
2713 splicing site variants, stc. For exonic variants, ANNOVAR will try to predict
2714 whether each of the variants is non-synonymous SNV, synonymous SNV,
2715 frameshifting change, nonframeshifting change. (2) region-based annotation, to
2716 identify whether a given variant overlaps with a specific type of genomic
2717 region, for example, predicted transcription factor binding site or predicted
2718 microRNAs.(3) filter-based annotation, to filter a list of variants so that only
2719 those not observed in variation databases (such as 1000 Genomes Project and
2720 dbSNP) are printed out.
2721
2722 Currently, I am expanding the functionality of ANNOVAR on (1) Fusion gene
2723 detection from large deletions, where a deletion joins the reading frame of two
2724 genes (same orientation of transcription) together to create a new gene. (2)
2725 Assignment of functional importance score to each observed mutation in the
2726 genome. This will be extremely important for the development of association
2727 tests for rare variants, and for prioritization of variants in downstream
2728 functional studies after a successful genome-wide association studies (GWAS).
2729
2730 =over 8
2731
2732 =item * B<variant file format>
2733
2734 A sample variant file contains one variant per line, with the fields being chr,
2735 start, end, reference allele, observed allele, other information. The other
2736 information can be anything (for example, it may contain sample identifiers for
2737 the corresponding variant.) An example is shown below:
2738
2739 16 49303427 49303427 C T rs2066844 R702W (NOD2)
2740 16 49314041 49314041 G C rs2066845 G908R (NOD2)
2741 16 49321279 49321279 - C rs2066847 c.3016_3017insC (NOD2)
2742 16 49290897 49290897 C T rs9999999 intronic (NOD2)
2743 16 49288500 49288500 A T rs8888888 intergenic (NOD2)
2744 16 49288552 49288552 T - rs7777777 UTR5 (NOD2)
2745 18 56190256 56190256 C T rs2229616 V103I (MC4R)
2746
2747 =item * B<database file format: UCSC Genome Browser annotation database>
2748
2749 Most but not all of the gene annotation databases are directly downloaded from
2750 UCSC Genome Browser, so the file format is identical to what was used by the
2751 genome browser. The users can check Table Browser (for example, human hg18 table
2752 browser is at http://www.genome.ucsc.edu/cgi-bin/hgTables?org=Human&db=hg18) to
2753 see what fields are available in the annotation file. Note that even for the
2754 same species (such as humans), the file format might be different between
2755 different genome builds (such as between hg16, hg17 and hg18). ANNOVAR will try
2756 to be smart about guessing file format, based on the combination of the --
2757 buildver argument and the number of columns in the input file. In general, the
2758 database file format should not be something that users need to worry about.
2759
2760 =item * B<database file format: GFF3 format for gene-based annotations)>
2761
2762 As of June 2010, ANNOVAR cannot perform gene-based annotations using GFF3 input
2763 files, and any annotations on GFF3 is region-based. However, this is expected to
2764 be changed in the future.
2765
2766 =item * B<database file format: GFF3 format for region-based annotations)>
2767
2768 Currently, region-based annotations can support the Generic Feature Format
2769 version 3 (GFF3) formatted files. The GFF3 has become the de facto golden
2770 standards for many model organism databases, such that many users may want to
2771 take a custom annotation database and run ANNOVAR on them, and it would be the
2772 most convenient if the custom file is made with GFF3 format.
2773
2774 =item * B<database file format: generic format for filter-based annotations)>
2775
2776 The 'generic' format is designed for filter-based annotation that looks for
2777 exact variants. The format is almost identical to the ANNOVAR input format, with
2778 chr, start, end, reference allele, observed allele and scores (higher scores are
2779 regarded as better).
2780
2781 =item * B<database file format: VCF format for filter-based annotations)>
2782
2783 The 1000 Genomes Project now provide their variant annotations in VCF format, so
2784 I implemented the functionality to directly interrogate VCF files. A VCF file
2785 may contain summary information for variants (for example, this variant has MAF
2786 of 5% in this population), or it may contain the actual variant calls for each
2787 individual in a specific population. As of March 2010, the files from 1000G website
2788 only contains the first type of information (that is, alleles and their
2789 frequencies in population). For the purpose of simplicity, ANNOVAR only
2790 interrogates the first type of information.
2791
2792 =item * B<database file format: avsift for filter-based annotations)>
2793
2794 avsift refers to a file that ANNOVAR developers compiled for fast annotation of
2795 SIFT scores for non-synonymous variants in the human genome. It conforms to the
2796 generic format described above. However, users can directly specify '--dbtype
2797 avsift' in command line to perform avsift annotations, making it more convenient
2798 for users. Alternatively, users can use '--dbtype generic -genericdbfile
2799 hg18_avsift.txt' for the annotation, and the effects are usually the same.
2800
2801 =item * B<sequence file format>
2802
2803 ANNOVAR can directly examine FASTA-formatted sequence files. For mRNA sequences,
2804 the name of the sequences are the mRNA identifier. For genomic sequences, the
2805 name of the sequences in the files are usually chr1, chr2, chr3, etc, so that
2806 ANNOVAR knows which sequence corresponds to which chromosome. Unfortunately,
2807 UCSC uses things like chr6_random to annotate un-assembled sequences, as opposed
2808 to using the actual contig identifiers. This causes some issues (depending on
2809 how reads alignment algorithms works), but in general should not be something
2810 that user need to worry about. If the users absolutely care about the exact
2811 contigs rather than chr*_random, then they will need to re-align the short reads
2812 at chr*_random to a different FASTA file that contains the contigs, and then
2813 execute ANNOVAR on the newly identified variants.
2814
2815 =item * B<invalid input>
2816
2817 If the query file contains input lines with invalid format, ANNOVAR will skip
2818 such line and continue with the annotation on next lines. These invalid input
2819 lines will be written to a file with suffix invalid_input. Users should manually
2820 examine this file and identify sources of error.
2821
2822 =back
2823
2824 ANNOVAR is freely available to the academic community for non-commercial use. For
2825 questions or comments, please contact kai@openbioinformatics.org.
2826
2827 =cut