view gfapts/inc/annovar/annotate_variation.pl @ 1:028f435b6cfb draft default tip

Uploaded
author rdaveau
date Fri, 03 Aug 2012 05:50:41 -0400
parents f753b30013e6
children
line wrap: on
line source

#!/usr/bin/perl
use warnings;
use strict;
use Pod::Usage;
use Getopt::Long;
use File::Spec;
use Cwd;

our $VERSION = 			'$Revision: 466 $';
our $LAST_CHANGED_DATE =	'$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';

our ($verbose, $help, $man);
our ($queryfile, $dbloc);
our ($outfile, $separate, $batchsize, $dbtype, $neargene, $genomebinsize, $geneanno, $regionanno, $filter, $downdb, $buildver, $score_threshold, $normscore_threshold, $minqueryfrac, $expandbin, $splicing_threshold,
	$maf_threshold, $chromosome, $zerostart, $rawscore, $memfree, $memtotal, $sift_threshold, $gff3dbfile, $genericdbfile, $vcfdbfile, $time, $wget, $precedence,
	$webfrom, $colsWanted, $comment, $scorecolumn, $transfun, $exonsort, $avcolumn, $bedfile);
our (%valichr, $dbtype1);
our (@precedence, @colsWanted, @avcolumn);
sub printerr;			#declare a subroutine

our %codon1 = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
our %codon3 = (TTT=>"Phe", TTC=>"Phe", TCT=>"Ser", TCC=>"Ser", TAT=>"Tyr", TAC=>"Tyr", TGT=>"Cys", TGC=>"Cys", TTA=>"Leu", TCA=>"Ser", TAA=>"*", TGA=>"*", TTG=>"Leu", TCG=>"Ser", TAG=>"*", TGG=>"Trp", CTT=>"Leu", CTC=>"Leu", CCT=>"Pro", CCC=>"Pro", CAT=>"His", CAC=>"His", CGT=>"Arg", CGC=>"Arg", CTA=>"Leu", CTG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", ATT=>"Ile", ATC=>"Ile", ACT=>"Thr", ACC=>"Thr", AAT=>"Asn", AAC=>"Asn", AGT=>"Ser", AGC=>"Ser", ATA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", ATG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GTT=>"Val", GTC=>"Val", GCT=>"Ala", GCC=>"Ala", GAT=>"Asp", GAC=>"Asp", GGT=>"Gly", GGC=>"Gly", GTA=>"Val", GTG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly");
our %codonfull = (TTT=>"Phenylalanine", TTC=>"Phenylalanine", TCT=>"Serine", TCC=>"Serine", TAT=>"Tyrosine", TAC=>"Tyrosine", TGT=>"Cysteine", TGC=>"Cysteine", TTA=>"Leucine", TCA=>"Serine", TAA=>"Stop", TGA=>"Stop", TTG=>"Leucine", TCG=>"Serine", TAG=>"Stop", TGG=>"Tryptophan", CTT=>"Leucine", CTC=>"Leucine", CCT=>"Proline", CCC=>"Proline", CAT=>"Histidine", CAC=>"Histidine", CGT=>"Arginine", CGC=>"Arginine", CTA=>"Leucine", CTG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", ATT=>"Isoleucine", ATC=>"Isoleucine", ACT=>"Threonine", ACC=>"Threonine", AAT=>"Asparagine", AAC=>"Asparagine", AGT=>"Serine", AGC=>"Serine", ATA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", ATG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GTT=>"Valine", GTC=>"Valine", GCT=>"Alanine", GCC=>"Alanine", GAT=>"Aspartic acid", GAC=>"Aspartic acid", GGT=>"Glycine", GGC=>"Glycine", GTA=>"Valine", GTG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine");
our %codonr1 = (UUU=>"F", UUC=>"F", UCU=>"S", UCC=>"S", UAU=>"Y", UAC=>"Y", UGU=>"C", UGC=>"C", UUA=>"L", UCA=>"S", UAA=>"*", UGA=>"*", UUG=>"L", UCG=>"S", UAG=>"*", UGG=>"W", CUU=>"L", CUC=>"L", CCU=>"P", CCC=>"P", CAU=>"H", CAC=>"H", CGU=>"R", CGC=>"R", CUA=>"L", CUG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", AUU=>"I", AUC=>"I", ACU=>"T", ACC=>"T", AAU=>"N", AAC=>"N", AGU=>"S", AGC=>"S", AUA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", AUG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GUU=>"V", GUC=>"V", GCU=>"A", GCC=>"A", GAU=>"D", GAC=>"D", GGU=>"G", GGC=>"G", GUA=>"V", GUG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
our %codonr3 = (UUU=>"Phe", UUC=>"Phe", UCU=>"Ser", UCC=>"Ser", UAU=>"Tyr", UAC=>"Tyr", UGU=>"Cys", UGC=>"Cys", UUA=>"Leu", UCA=>"Ser", UAA=>"*", UGA=>"*", UUG=>"Leu", UCG=>"Ser", UAG=>"*", UGG=>"Trp", CUU=>"Leu", CUC=>"Leu", CCU=>"Pro", CCC=>"Pro", CAU=>"His", CAC=>"His", CGU=>"Arg", CGC=>"Arg", CUA=>"Leu", CUG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", AUU=>"Ile", AUC=>"Ile", ACU=>"Thr", ACC=>"Thr", AAU=>"Asn", AAC=>"Asn", AGU=>"Ser", AGC=>"Ser", AUA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", AUG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GUU=>"Val", GUC=>"Val", GCU=>"Ala", GCC=>"Ala", GAU=>"Asp", GAC=>"Asp", GGU=>"Gly", GGC=>"Gly", GUA=>"Val", GUG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly");
our %codonrfull = (UUU=>"Phenylalanine", UUC=>"Phenylalanine", UCU=>"Serine", UCC=>"Serine", UAU=>"Tyrosine", UAC=>"Tyrosine", UGU=>"Cysteine", UGC=>"Cysteine", UUA=>"Leucine", UCA=>"Serine", UAA=>"Stop", UGA=>"Stop", UUG=>"Leucine", UCG=>"Serine", UAG=>"Stop", UGG=>"Tryptophan", CUU=>"Leucine", CUC=>"Leucine", CCU=>"Proline", CCC=>"Proline", CAU=>"Histidine", CAC=>"Histidine", CGU=>"Arginine", CGC=>"Arginine", CUA=>"Leucine", CUG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", AUU=>"Isoleucine", AUC=>"Isoleucine", ACU=>"Threonine", ACC=>"Threonine", AAU=>"Asparagine", AAC=>"Asparagine", AGU=>"Serine", AGC=>"Serine", AUA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", AUG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GUU=>"Valine", GUC=>"Valine", GCU=>"Alanine", GCC=>"Alanine", GAU=>"Aspartic acid", GAC=>"Aspartic acid", GGU=>"Glycine", GGC=>"Glycine", GUA=>"Valine", GUG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine");
our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-');

processArguments ();			#process program arguments, set up default values, check for errors, check for existence of db files
if ($geneanno) {
	annotateQueryByGene ();		#generate gene-based annoations (classify variants into intergenic, introgenic, non-synonymous, synonymous, UTR, frameshift, etc)
} elsif ($regionanno) {
	annotateQueryByRegion ();	#generate region-based annotations (most conserved elements, transcription factor binding sites, etc)
} elsif ($filter) {
	filterQuery ();			#generate filter-based annotations (identify variants not reported in variation databases)
} elsif ($downdb) {
	downloadDB ();			#download annotation databases from Internet
}

sub processArguments {
	my @command_line = @ARGV;		#command line argument
	GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'outfile=s'=>\$outfile, 'separate'=>\$separate,
	'batchsize=s'=>\$batchsize, 'dbtype=s'=>\$dbtype, 'neargene=i'=>\$neargene, 'genomebinsize=s'=>\$genomebinsize,
	'geneanno'=>\$geneanno, 'regionanno'=>\$regionanno, , 'filter'=>\$filter, 'downdb'=>\$downdb, 'buildver=s'=>\$buildver, 'score_threshold=f'=>\$score_threshold, 
	'normscore_threshold=i'=>\$normscore_threshold,	'minqueryfrac=f'=>\$minqueryfrac, 'expandbin=i'=>\$expandbin, 'splicing_threshold=i'=>\$splicing_threshold,
	'maf_threshold=f'=>\$maf_threshold, 'chromosome=s'=>\$chromosome, 'zerostart'=>\$zerostart, 'rawscore'=>\$rawscore, 'memfree=i'=>\$memfree, 
	'memtotal=i'=>\$memtotal, 'sift_threshold=f'=>\$sift_threshold, 'gff3dbfile=s'=>\$gff3dbfile, 'genericdbfile=s'=>\$genericdbfile, 'vcfdbfile=s'=>\$vcfdbfile,
	'time'=>\$time, 'wget!'=>\$wget, 'precedence=s'=>\$precedence, 'webfrom=s'=>\$webfrom, 'colsWanted=s'=>\$colsWanted, 'comment'=>\$comment,
	'scorecolumn=i'=>\$scorecolumn, 'transcript_function'=>\$transfun, 'exonsort'=>\$exonsort, 'avcolumn=s'=>\$avcolumn, 'bedfile=s'=>\$bedfile) or pod2usage ();
	
	$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
	$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
	@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
	@ARGV == 2 or pod2usage ("Syntax error");

	($queryfile, $dbloc) = @ARGV;

	$dbloc =~ s/[\\\/]$//;			#delete the trailing / or \ sign as part of the directory name
	if (defined $batchsize) {
		$batchsize =~ s/k$/000/;
		$batchsize =~ s/m$/000000/;
		$batchsize =~ m/^\d+$/ or pod2usage ("Error: the --batchsize argument must be a positive integer (suffix of k or m is okay)");
	} else {
		$batchsize = 5_000_000;
	}
	if (defined $genomebinsize) {
		$genomebinsize =~ s/k$/000/;
		$genomebinsize =~ s/m$/000000/;
		$genomebinsize =~ m/^\d+$/ or pod2usage ("Error: the --genomebinsize argument must be a positive integer (suffix of k or m is okay)");
		$genomebinsize > 1000 or pod2suage ("Error: the --genomebinsize argument must be larger than 1000");
	} else {
		if ($geneanno) {
			$genomebinsize = 100_000;		#gene usually span large genomic regions
		} else {
			$genomebinsize = 10_000;		#MCE, TFBS, miRNA, etc are small genomic regions
		}
	}

	$verbose ||= 0;			#when it is not specified, it is zero
	$neargene ||= 1_000;		#for upstream/downstream annotation of variants, specify the distance threshold between variants and genes
	$expandbin ||= int(2_000_000/$genomebinsize);		#for gene-based annotations, when intergenic variants are found, expand to specified number of nearby bins to find closest genes
	$outfile ||= $queryfile;	#specify the prefix of output file names

	#set up log file
	if ($downdb) {
		if (not -d $dbloc) {
			mkdir ($dbloc) or die "Error: the directory $dbloc does not exist and cannot be created\n";
		}
		my $errfile = File::Spec->catfile ($dbloc, "annovar_downdb.log");
		open (LOG, ">$errfile") or die "Error: cannot write LOG information to log file $errfile: $!\n";
	} else {
		open (LOG, ">$outfile.log") or die "Error: cannot write LOG information to log file $outfile.log: $!\n";
	}
	print LOG "ANNOVAR Version:\n\t", q/$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $/, "\n";
	print LOG "ANNOVAR Information:\n\tFor questions, comments, documentation, bug reports and program update, please visit http://www.openbioinformatics.org/annovar/\n";
	print LOG "ANNOVAR Command:\n\t$0 @command_line\n";
	print LOG "ANNOVAR Started:\n\t", scalar (localtime), "\n";
	
	my $num = 0;
	$geneanno and $num++;
	$downdb and $num++;
	$filter and $num++;
	$regionanno and $num++;
	$num <= 1 or pod2usage ("Error in argument: please specify only one of --geneanno, -regionanno, --downdb, --filter");
	if (not $num) {
		$geneanno++;
		printerr "NOTICE: The --geneanno operation is set to ON by default\n";
	}
	
	my %dbtype1 = ('gene'=>'refGene', 'refgene'=>'refGene', 'knowngene'=>'knownGene', 'ensgene'=>'ensGene', 'band'=>'cytoBand', 'cytoband'=>'cytoBand', 'tfbs'=>'tfbsConsSites', 'mirna'=>'wgRna',
			'mirnatarget'=>'targetScanS', 'segdup'=>'genomicSuperDups', 'omimgene'=>'omimGene', 'gwascatalog'=>'gwasCatalog', 
			'1000g_ceu'=>'CEU.sites.2009_04', '1000g_yri'=>'YRI.sites.2009_04', '1000g_jptchb'=>'JPTCHB.sites.2009_04', 
			'1000g2010_ceu'=>'CEU.sites.2010_03', '1000g2010_yri'=>'YRI.sites.2010_03', '1000g2010_jptchb'=>'JPTCHB.sites.2010_03',
			'1000g2010jul_ceu'=>'CEU.sites.2010_07', '1000g2010jul_yri'=>'YRI.sites.2010_07', '1000g2010jul_jptchb'=>'JPTCHB.sites.2010_07',
			'1000g2010nov_all'=>'ALL.sites.2010_11',
			);
	
	if ($geneanno) {
		$dbtype ||= 'refGene';
		$dbtype1 = $dbtype1{$dbtype} || $dbtype;
		#$dbtype1 =~ m/^(refGene|knownGene|ensGene)$/ or pod2usage ("Error: the gene-based annotation procedure currently only support -dbtype of refGene, knownGene and ensGene");	#commented 2011feb18
	} elsif ($regionanno) {
		defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --regionanno operation)");
		$dbtype1 = $dbtype1{$dbtype} || $dbtype;
		if ($dbtype =~ m/^mce(\d+)way/) {			#added 2010Feb16
			$dbtype1 = "phastConsElements$1way";
		}
		if ($dbtype1 eq 'gff3') {
			defined $gff3dbfile or pod2usage ("Error in argument: please specify --gff3dbfile for the --dbtype of 'gff3'");
		}
	} elsif ($filter) {
		defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --filter operation)");
		$dbtype =~ m/^avsift|generic|1000g_(ceu|yri|jptchb)|1000g2010_(ceu|yri|jptchb)|1000g20\d\d[a-z]{3}_[a-z]+|snp\d+|vcf|(ljb_\w+)$/ or pod2usage ("Error in argument: the specified --dbtype $dbtype is not valid for --filter operation (valid ones are '1000g_ceu', '1000g2010_yri', 'snp129', 'avsift', 'vcf', 'generic', etc)");
		$dbtype1 = $dbtype1{$dbtype} || $dbtype;
		if ($dbtype1 eq 'generic') {
			defined $genericdbfile or pod2usage ("Error in argument: please specify --genericdbfile for the --dbtype of 'generic'");
		}
		if ($dbtype eq 'vcf') {
			defined $vcfdbfile or pod2usage ("Error in argument: please specify --vcfdbfile for the --dbtype of 'vcf'");
		}
	} elsif ($downdb) {
		defined $dbtype and pod2usage ("Error in argument: please do not specify --dbtype for the --downdb operation");
		$dbtype1 = $dbtype1{$queryfile} || $queryfile;
	}
	
	if (not $buildver) {
		$buildver = 'hg18';
		printerr "NOTICE: The --buildver is set as 'hg18' by default\n";
	}
	
	if ($score_threshold) {
		$score_threshold > 0 or pod2usage ("Error in argument: the --score_threshold must be a positive number (you specified $score_threshold)");
		$geneanno || $downdb and pod2usage ("Error in argument: the --score_threshold is not useful for --geneanno or --downdb operations");
	}
	if ($normscore_threshold) {
		$normscore_threshold <= 1000 or pod2usage ("Error in argument: the --normscore_threshold must be between 0 and 1000 (you specified $normscore_threshold)");
		$regionanno or pod2usage ("Error in argument: the --score_threshold is supported only for the --regionanno operation");
	}
	
	
	if (defined $sift_threshold) {
		$filter or pod2usage ("Error in argument: the --sift_threshold is supported only for the --filter operation");
		$dbtype1 eq 'avsift' or pod2usage ("Error in argument: the --sift_threshold argument can be used only if '--dbtype avsift' is used");
		$sift_threshold >= 0 and $sift_threshold <= 1 or pod2usage ("Error in argument: the --sift_threshold must be between 0 and 1 inclusive");
	} else {
		$sift_threshold = 0.05;
	}
	
	#operation-specific argument
	if (defined $splicing_threshold) {
		$geneanno or pod2usage ("Error in argument: the --splicing_threshold is supported only for the --geneanno operation");
	} else {
		$splicing_threshold = 2;	#for splicing annotation, specify the distance threshold between variants and exon/intron boundaries
	}
	if (defined $maf_threshold) {
		$filter or pod2usage ("Error in argument: the --maf_threshold is supported only for the --filter operation");
	} else {
		$maf_threshold = 0;		#for filter-based annotations on 1000 Genomes Project data, specify the MAF threshold to be used in filtering
	}
	if (defined $minqueryfrac) {
		$regionanno or pod2usage ("Error in argument: the --minqueryfrac is supported only for the --regionanno operation");
	} else {
		$minqueryfrac = 0;		#minimum query overlap to declare a "match" with database records
	}
	if (defined $gff3dbfile) {
		$dbtype eq 'gff3' or pod2usage ("Error in argument: the --gff3dbfile argument can be used only if '--dbtype gff3' is used");
		$geneanno or $regionanno or pod2usage ("Error in argument: the --gff3dbfile argument is supported only for the --geneanno or --regionanno operation");
	}
	if (defined $bedfile) {
		$dbtype eq 'bed' or pod2usage ("Error in argument: the --bedfile argument can be used only if '--dbtype bed' is used");
		$regionanno or pod2usage ("Error in argument: the --bedfile argument is supported only for the --regionanno operation");
	}
	if (defined $genericdbfile) {
		$filter or pod2usage ("Error in argument: the --genericdbfile argument is supported only for the --filter operation");
	}
	if (defined $wget) {
		$downdb or pod2usage ("Error in argument: the --wget argument is supported only for the --downdb operation");
	} else {
		$wget = 1;			#by default, use wget for downloading files from Internet
	}
	if (defined $precedence) {
		$geneanno or pod2usage ("Error in argument: the --precedence argument is supported only for the --geneanno operation");
		@precedence = split (/,/, $precedence);
		@precedence >= 2 or pod2usage ("Error in argument: the --precedence argument should be comma delimited");
		for my $i (0 .. @precedence-1) {
			$precedence[$i] =~ m/^(exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing|ncrna)$/ or pod2usage ("Error in argument: the --precedence argument contains invalid keywords (valid ones are exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing)");
		}
	}
	
	if (defined $colsWanted) {
		$regionanno or pod2usage ("Error in argument: the --colWanted argument is supported only for the --geneanno operation");
		if (lc $colsWanted eq 'all') {
			@colsWanted = ('all');
		} elsif (lc $colsWanted eq 'none') {
			@colsWanted = ('none');
		} else {
			@colsWanted = split (/,/, $colsWanted);
			for my $i (0 .. @colsWanted-1) {
				$colsWanted[$i]=~m/^\d+$/ or pod2usage ("Error in argument: the --colsWanted argument ($colsWanted) must be a list of comma delimited numbers or be 'all' or be 'none'");
			}
		}
	}
	
	if (defined $scorecolumn) {
		$regionanno or pod2usage ("Error in argument: the --scorecolumn argument is supported only for the --regionanno operation");
	}
	
	if ($exonsort) {
		$geneanno or pod2usage ("Error in argument: the --exonsort argument is supported only for the --geneanno operation");
	}
	
	if (defined $avcolumn) {
		$avcolumn =~ m/^\d+,\d+,\d+,\d+,\d+$/ or pod2usage ("Error in argument: the --avcolumn argument must be five integer numbers separated by comma");
		@avcolumn = split (/,/, $avcolumn);
		@avcolumn = map {$_-1} @avcolumn;
	} else {
		@avcolumn = (0..4);		#by default, the first five columns are the required AVINPUT information
	}
	
	if (defined $webfrom) {
		if ($webfrom ne 'ucsc' and $webfrom ne 'annovar') {
			$webfrom =~ m#^(http://|ftp://)# or pod2usage ("Error: the --webfrom argument needs to be 'ucsc', 'annovar', or a URL");
		}
	}
	
	$maf_threshold >= 0 and $maf_threshold <= 0.5 or pod2usage ("Error in argument: the --maf_threshold must be between 0 and 0.5 (you specified $maf_threshold)");
	$minqueryfrac >= 0 and $minqueryfrac <= 1 or pod2usage ("Error in argument: the --minqueryfrac must be between 0 and 1 (you specified $minqueryfrac)");
	$memfree and $memfree >= 100_000 || pod2usage ("Error in argument: the --memfree argument must be at least 100000 (in the order of kilobytes)");
	$memtotal and $memtotal >= 100_000 || pod2usage ("Error in argument: the --memtotal argument must be at least 100000 (in the order of kilobytes)");
	
	if ($chromosome) {
		my @chr = split (/,/, $chromosome);
		for my $i (0 .. @chr-1) {
			if ($chr[$i] =~ m/^(\d+)-(\d+)$/) {
				for my $j ($1 .. $2) {
					$valichr{$j}++;
				}
			} else {
				$valichr{$chr[$i]}++;
			}
		}
		printerr "NOTICE: These chromosomes in database will be examined: ", join (",", sort keys %valichr), "\n";
	}
}


sub annotateQueryByGene {
	my ($queryfh);							#query file handle
	my ($totalquerycount, $totalinvalidcount, $batchcount) = qw/0 0 1/;
	open ($queryfh, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n";
	
	open (OUT, ">$outfile.variant_function") or die "Error: cannot write to output file $outfile.variant_function: $!\n";
	open (EXONIC, ">$outfile.exonic_variant_function") or die "Error: cannot write to output file $outfile.exonic_variant_function: $!\n";
	open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n";

	my ($genedb, $geneidmap, $cdslen, $mrnalen) = readUCSCGeneAnnotation ($dbloc);

	$time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n";
	while (1) {
		my ($linecount, $invalidcount) = newprocessNextQueryBatchByGene ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen);
		$totalquerycount += $linecount;
		$totalinvalidcount += $invalidcount;
		$linecount == $batchsize or last;
		$batchcount++;
		printerr "NOTICE: Begin processing batch $batchcount (each batch contains $batchsize variants)\n";
	}
	close (INVALID);
	close (EXONIC);
	close (OUT);
	close ($queryfh);
	$time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n";

	$totalinvalidcount or unlink ("$outfile.invalid_input");	#delete the file as it is empty
	printerr "NOTICE: Finished gene-based annotation on $totalquerycount genetic variants in $queryfile";
	$totalinvalidcount and printerr " (including $totalinvalidcount with invalid format written to $outfile.invalid_input)";
	printerr "\n";
	printerr "NOTICE: Output files were written to $outfile.variant_function, $outfile.exonic_variant_function\n";
}

sub newprocessNextQueryBatchByGene {
	my ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen) = @_;
	my (%refseqvar);
	
	my ($chr, $start, $end, $ref, $obs);
	my ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2);
	my ($invalid);
	my ($linecount, $invalidcount) = qw/0 0/;
	
	for my $i (1 .. $batchsize) {					#process up to batchsize variants
		my $nextline = <$queryfh>;				#read the next line in variant file
		defined $nextline or last;
		$nextline =~ s/[\r\n]+$//;
		
		if ($nextline =~ m/^#/ and $comment) {			#comment line start with #, do not include this is $linecount
			print OUT "#comment\t$nextline\n";
			next;
		}
		
		$linecount++;						#linecount does not include the comment line
		$invalid = 0;
		
		my @nextline = split (/\s+/, $nextline);
		($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
		if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
			$invalid++;
		} else {
			($ref, $obs) = (uc $ref, uc $obs);
			$zerostart and $start++;
			$chr =~ s/^chr//;
			if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
				$invalid++;
			} elsif ($ref eq '-' and $obs eq '-' 		#both are empty allele
				or $ref =~ m/[^ACTG0\-]/ 		#non-standard nucleotide code
				or $obs =~ m/[^ACGT0\-]/ 		#non-standard nucleotide code
				or $start =~ m/[^\d]/ 			#start is not a number
				or $end =~ m/[^\d]/ 			#end is not a number
				or $start > $end			#start is more than end
				or $ref ne '0' and $end-$start+1 != length ($ref) 	#length mismatch with ref
				or $ref eq '-' and $start != $end	#length mismatch for insertion
				) {		
				$invalid++;
			}
		}



		if ($invalid) {
			print INVALID $nextline, "\n";			#invalid record found
			$invalidcount++;
			next;
		}
		
		my (%intronic, %utr5, %utr3, %exonic, %upstream, %downstream, %ncrna, %intergenic, %splicing);
		my $foundgenic;						#variant found in genic region (between start and end position of a gene in genome)
		my ($distl, $distr, $genel, $gener);			#for intergenic variant, the distance and gene name to the left and right side of gene
		my $bin1 = int ($start/$genomebinsize)-1;		#start bin
		$bin1 < 0 and $bin1=0;
		my $bin2 = int ($end/$genomebinsize)+1;			#end bin (usually same as start bin, unless the query is really big that spans multiple megabases)
		
		while (not exists $genedb->{$chr, $bin1} and $bin1 > int ($start/$genomebinsize)-$expandbin) {		#examine at least 5 bins (by default 5Mb) to the left to make sure that a gene is found in the bin
			$bin1 > 0 or last;
			$bin1--;
		}
		
		while (not exists $genedb->{$chr, $bin2} and $bin2 < int ($end/$genomebinsize)+$expandbin) {		#examine at least 5 bins (by default 5Mb) to the right to make sure that a gene is found in the bin
			$bin2++;
		}

		my (%seen);
		for my $nextbin ($bin1 .. $bin2) {
			exists $genedb->{$chr, $nextbin} or next;		#this genome bin has no annotated gene (a complete intergenic region)
			for my $nextgene (@{$genedb->{$chr, $nextbin}}) {	#when $genedb->{$chr, $nextbin} is undefined, this automatically create an array!!!
				($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2) = @$nextgene;
				defined $name2 or printerr "WARNING: name2 field is not provided for transcript $name (start=$txstart end=$txend)\n" and $name2='';
				$seen{$name, $txstart} and next;		#name and txstart uniquely identify a transcript and chromosome position (sometimes same transcript may map to two nearby positions, such as nearby segmental duplications)
				$seen{$name, $txstart}++;			#a transcript may be in two adjacent bins, so if one is already scanned, there is no need to work on it again
				
				if ($transfun) {	#variant_function output contains transcript name, rather than gene name
					$name2 = $name;
				}
				
				if (not $foundgenic) {				#this variant has not hit a genic region yet
					if ($start > $txend) {
						defined $distl or $distl = $start-$txend and $genel=$name2;
						$distl > $start-$txend and $distl = $start-$txend and $genel=$name2;	#identify left closest gene
					}
		
					if ($end < $txstart) {
						defined $distr or $distr = $txstart-$end and $gener=$name2;
						$distr > $txstart-$end and $distr = $txstart-$end and $gener=$name2;	#identify right closest gene
					}
				}
				
				if ($end < $txstart) {
					#query ---
					#gene		<-*----*->
					$foundgenic and last;					#if found a genic annotation already, end the search of the bins
					if ($end > $txstart - $neargene) {
						if ($dbstrand eq '+') {
							$upstream{$name2}++;
						} else {
							$downstream{$name2}++;
						}
					} else {
						last;						#if transcript is too far away from end, end the search of the bins
					}
				} elsif ($start > $txend) {
					#query            ---
					#gene  <-*----*->
					if (not $foundgenic and $start < $txend + $neargene) {
						if ($dbstrand eq '+') {
							$downstream{$name2}++;
						} else {
							$upstream{$name2}++;
						}
					}
				} elsif ($cdsstart == $cdsend+1) {				#non-coding RNA (could be microRNA, or could be due to lack of CDS annotation for mRNA such as NR_026730 or BC039000). Previously we already did cdsstart++ so here the cdsstart is more than cdsend
					if ($start >= $txstart and $start <= $txend or $end >= $txstart and $end <= $txend or $start <= $txstart and $end >= $txend) {
						$ncrna{$name2}++;
						$foundgenic++;
					}
				} else {							#query overlaps with coding region of gene
					my ($lenintron) = (0);					#cumulative intron length at a given exon
					my ($rcdsstart, $rvarstart, $rvarend);			#start of coding and variant in reference mRNA sequence
					my @exonstart = @$exonstart;
					my @exonend = @$exonend;
					my $foundexonic;
					if ($dbstrand eq '+') {					#forward strand, search from left to right (first exon to last exon)
						for my $k (0 .. @exonstart-1) {
							$k and $lenintron += ($exonstart[$k]-$exonend[$k-1]-1);		#calculate cumulative intron length 
							if ($cdsstart >= $exonstart[$k]) {				#calculate CDS start accurately by considering intron length
								$rcdsstart = $cdsstart-$txstart-$lenintron+1;
							}
							
							#splicing calculation
							if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
								$splicing{$name2}++;		#when query start site is close to exon start or exon end
							}
							if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) {
								$splicing{$name2}++;		#when query end site is close to exon start or exon end
							}
							if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) {
								$splicing{$name2}++;		#when query encompass the exon/intron boundary
							}
							
							if ($start < $exonstart[$k]) {
								if ($end >= $exonstart[$k]) {	#exonic 
									$rvarstart = $exonstart[$k]-$txstart-$lenintron+1;
									
									for my $m ($k .. @exonstart-1) {
										$m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1);
										if ($end < $exonstart[$m]) {
											#query           --------
											#gene     <--**---******---****---->
											$rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1);
											last;
										} elsif ($end <= $exonend[$m]) {
											#query           -----------
											#gene     <--**---******---****---->
											$rvarend = $end-$txstart-$lenintron+1;
											last;
										}
									}
									if (not defined $rvarend) {
										$rvarend = $txend-$txstart-$lenintron+1;		#if this value is longer than transcript length, it suggest whole gene deletion
									}
									
									#here the trick begins to differentiate UTR versus coding exonic
									if ($end < $cdsstart) {					#usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
										#query  ----
										#gene     <--*---*->
										$utr5{$name2}++;		#positive strand for UTR5
									} elsif ($start > $cdsend) {
										#query             ----
										#gene     <--*---*->
										$utr3{$name2}++;		#positive strand for UTR3
									} else {									
										$exonic{$name2}++;
										$obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline];	#refseq CDS start, refseq variant start
									}
									$foundgenic++;
									last;
								} elsif ($k and $start > $exonend[$k-1]) {	#intronic
									$intronic{$name2}++;
									$foundgenic++;
									last;
								}
							} elsif ($start <= $exonend[$k]) {	#exonic
								$rvarstart = $start-$txstart-$lenintron+1;
								
								for my $m ($k .. @exonstart-1) {
									$m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1);
									if ($end < $exonstart[$m]) {
										#query              ------
										#gene     <--**---******---****---->
										$rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1);
										last;
									} elsif ($end <= $exonend[$m]) {
										#query           -----------
										#gene     <--**---******---****---->
										$rvarend = $end-$txstart-$lenintron+1;
										last;
									}
								}
								if (not defined $rvarend) {
									$rvarend = $txend-$txstart-$lenintron+1;		#if this value is longer than transcript length, it suggest whole gene deletion
								}
								
								#here is the trick begins to differentiate UTR versus coding exonic
								if ($end < $cdsstart) {					#usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
									#query  ----
									#gene     <--*---*->
									$utr5{$name2}++;		#positive strand for UTR5
								} elsif ($start > $cdsend) {
									#query             ----
									#gene     <--*---*->
									$utr3{$name2}++;		#positive strand for UTR3
								} else {
									$exonic{$name2}++;
									$obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline];		#queryindex, refseq CDS start, refseq variant start
								}
								$foundgenic++;
								last;
							}
						}
					} elsif ($dbstrand eq '-') {		#process negative strand (in the future, this should be fused to the paragraph above for positive strands; for now, I keep them separate for easier debugging)
						for (my $k = @exonstart-1; $k>=0; $k--) {
							$k < @exonstart-1 and $lenintron += ($exonstart[$k+1]-$exonend[$k]-1);
							if ($cdsend <= $exonend[$k]) {		#calculate CDS start accurately by considering intron length
								$rcdsstart = $txend-$cdsend-$lenintron+1;
							}
							
							#splicing calculation
							if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
								$splicing{$name2}++;
							}
							if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) {
								$splicing{$name2}++;
							}
							if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) {
								$splicing{$name2}++;
							}
							
							if ($end > $exonend[$k]) {
								if ($start <= $exonend[$k]) {
									$rvarstart = $txend-$exonend[$k]-$lenintron+1;
									
									for (my $m = $k; $m >= 0; $m--) {
										$m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1);
										if ($start > $exonend[$m]) {
											#query           --------
											#gene     <--**---******---****---->
											#$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1);	#commented out 2011feb18
											$rvarend = $txend-$exonstart[$m+1]+1-$lenintron + ($exonstart[$m+1]-$exonend[$m]-1);	#fixed this 2011feb18
											last;		#finsih the cycle!!!!!!!!!!!!!!!!!!!
										} elsif ($start >= $exonstart[$m]) {		#start within exons
											#query               ----
											#gene     <--**---******---****---->
											$rvarend = $txend-$start-$lenintron+1;
											last;
										}
									}
									if (not defined $rvarend) {				#if rvarend is not found, then the whole tail of gene is covered
										$rvarend = $txend-$txstart-$lenintron+1;
									}
									
									#here is the trick begins to differentiate UTR versus coding exonic
									if ($end < $cdsstart) {					#usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
										#query  ----
										#gene     <--*---*->
										$utr3{$name2}++;		#negative strand for UTR5
									} elsif ($start > $cdsend) {
										#query             ----
										#gene     <--*---*->
										$utr5{$name2}++;		#negative strand for UTR3
									} else {
										$exonic{$name2}++;
										$obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline];
									}
									$foundgenic++;
									last;
								} elsif ($k < @exonstart-1 and $end < $exonstart[$k+1]) {
									$intronic{$name2}++;
									$foundgenic++;
									last;
								}
							} elsif ($end >= $exonstart[$k]) {
								$rvarstart = $txend-$end-$lenintron+1;		#all the rvarstart, rvarend are with respect to the cDNA sequence (so rvarstart corresponds to end of variants)
								
								for (my $m = $k; $m >= 0; $m--) {
									$m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1);
									if ($start > $exonend[$m]) {
										#query           ----
										#gene     <--**---******---****---->
										#$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1);		#commented out 2011feb18 due to bug (10 42244567 42244600 CACCTTTGCTTGATATGATAATATAGTGCCAAGG - hetero)
										$rvarend = $txend-$exonstart[$m+1]+1 - $lenintron + ($exonstart[$m+1]-$exonend[$m]-1);		#fixed this 2011feb18
										last;			#finish the circle of counting exons!!!!!
									} elsif ($start >= $exonstart[$m]) {			#the start is right located within exon
										#query        -------
										#gene     <--**---******---****---->
										$rvarend = $txend-$start-$lenintron+1;
										last;						#finish the cycle
									}
								}
								if (not defined $rvarend) {					#if rvarend is not found, then the whole tail of gene is covered
									$rvarend = $txend-$txstart-$lenintron+1;
								}
								
								#here the trick begins to differentiate UTR versus coding exonic
								if ($end < $cdsstart) {			#usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
									#query  ----
									#gene     <--*---*->
									$utr3{$name2}++;		#negative strand for UTR5
								} elsif ($start > $cdsend) {
									#query             ----
									#gene     <--*---*->
									$utr5{$name2}++;		#negative strand for UTR3
								} else {
									$exonic{$name2}++;
									$obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline];
								}
								$foundgenic++;
								last;
							}
						}
					}
				}
			}
		}
		$foundgenic or $intergenic{$name2}++;
		$i =~ m/000000$/ and printerr "NOTICE: Finished analyzing $i query variants\n";

	
		my (@txname, %genename);
		
		if ($separate) {		#separately print out each effect on one line
			if (%exonic or %splicing or %intronic or %utr5 or %utr3 or %ncrna or %upstream or %downstream) {
				%exonic and print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n";
				%splicing and $end-$start+1<=$splicing_threshold and print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n";
				%intronic and print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n";
				%utr5 and print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n";
				%utr3 and print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n";
				%ncrna and print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n";
				%upstream and print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n";
				%downstream and print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n";
			} elsif (%intergenic) {
				$genel ||= "NONE";
				$gener ||= "NONE";
				$distl ||= "NONE";
				$distr ||= "NONE";
				print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n";
			} else {
				die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n";
			}
		} else {			
			if (@precedence) {
				my $foundmatch;
				for my $i (0 .. @precedence-2) {
					$precedence[$i] eq 'exonic' and %exonic and $foundmatch++;
					$precedence[$i] eq 'splicing' and %splicing and $foundmatch++;
					$precedence[$i] eq 'intronic' and %intronic and $foundmatch++;
					$precedence[$i] eq 'utr5' and %utr5 and $foundmatch++;
					$precedence[$i] eq 'utr3' and %utr3 and $foundmatch++;
					$precedence[$i] eq 'ncrna' and %ncrna and $foundmatch++;
					$precedence[$i] eq 'upstream' and %upstream and $foundmatch++;
					$precedence[$i] eq 'downstream' and %downstream and $foundmatch++;
					$precedence[$i] eq 'intergenic' and %intergenic and $foundmatch++;
					if ($foundmatch) {
						for my $j ($i+1 .. @precedence-1) {
							$precedence[$j] eq 'exonic' and %exonic = ();
							$precedence[$j] eq 'splicing' and %splicing = ();
							$precedence[$j] eq 'intronic' and %intronic = ();
							$precedence[$j] eq 'utr5' and %utr5 = ();
							$precedence[$j] eq 'utr3' and %utr3 = ();
							$precedence[$j] eq 'ncrna' and %ncrna = ();
							$precedence[$j] eq 'upstream' and %upstream = ();
							$precedence[$j] eq 'downstream' and %downstream = ();
							$precedence[$j] eq 'intergenic' and %intergenic = ();
						}
						last;
					}
				}
			}
			
			
			if (%exonic) {
				if (%splicing and $end-$start+1<=$splicing_threshold) {		#a big deletion spanning splicing site is not really a "splicing" mutation
					print OUT "exonic;splicing\t", join(",", sort keys %exonic), ";", join (",", sort keys %splicing), "\t", $nextline, "\n";
				} else {
					print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n";
				}
			} elsif (%splicing) {
				print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n";
			} elsif (%ncrna) {
				print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n";
			} elsif (%utr5 or %utr3) {
				if (%utr5 and %utr3) {
					print OUT "UTR5;UTR3\t", join(",", sort keys %utr5), ";", join(",", sort keys %utr3), "\t", $nextline, "\n";		#use ";" to separate UTR5 and UTR3 genes
				} elsif (%utr5) {
					print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n";
				} else {
					print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n";
				}
			} elsif (%intronic) {
				print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n";
			} elsif (%upstream or %downstream) {
				if (%upstream and %downstream) {
					print OUT "upstream;downstream\t", join(",", sort keys %upstream), ";", join(",", sort keys %downstream), "\t", $nextline, "\n";
				} elsif (%upstream) {
					print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n";
				} else {
					print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n";
				}
			} elsif (%intergenic) {
				$genel ||= "NONE";
				$gener ||= "NONE";
				$distl ||= "NONE";
				$distr ||= "NONE";
				print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n";
			} else {
				die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n";
			}
		}
	}
	%refseqvar and annotateExonicVariants (\%refseqvar, $geneidmap, $cdslen, $mrnalen);

	return ($linecount, $invalidcount);
}

sub annotateExonicVariants {
	my ($refseqvar, $geneidmap, $cdslen, $mrnalen) = @_;
	my $refseqhash;
	my $function = {};
	my %varinfo;								#variants information (same as input line)

	$refseqhash = readSeqFromFASTADB ($refseqvar);

	for my $seqid (keys %$refseqvar) {
		for my $i (0 .. @{$refseqvar->{$seqid}}-1) {
			my ($refcdsstart, $refvarstart, $refvarend, $refstrand, $index, $exonpos, $nextline) = @{$refseqvar->{$seqid}->[$i]};
			my ($wtnt3, $wtnt3_after, @wtnt3, $varnt3, $wtaa, $wtaa_after, $varaa, $varpos);		#wtaa_after is the aa after the wtaa
			my ($chr, $start, $end, $ref, $obs);
			
			my @nextline = split (/\s+/, $nextline);
			($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
			($ref, $obs) = (uc $ref, uc $obs);
			$zerostart and $start++;
			$chr =~ s/^chr//;
			
			$varinfo{$index} = $nextline;			
			
			if (not $refseqhash->{$seqid}) {					#this refseq do not have FASTA sequence so cannot be interrogated
				$function->{$index}{unknown} = "UNKNOWN";
				next;
			}
						
			my $fs = (($refvarstart-$refcdsstart) % 3);
			if ($refvarstart-$fs-1 > length($refseqhash->{$seqid})) {
				printerr "WARNING: Potential database annotation error seqid=$seqid, refvarstart=$refvarstart, fs=$fs, seqlength=", length($refseqhash->{$seqid}), " refcdsstart=$refcdsstart, with inputline=$nextline\n";
				next;
			}

			$wtnt3 = substr ($refseqhash->{$seqid}, $refvarstart-$fs-1, 3);
			if (length ($refseqhash->{$seqid}) >= $refvarstart-$fs+3) {	#going into UTR
				$wtnt3_after = substr ($refseqhash->{$seqid}, $refvarstart-$fs+2, 3);
			} else {
				$wtnt3_after = '';					#last amino acid in the sequence without UTR (extremely rare situation) (example: 17        53588444        53588444        -       T       414     hetero)
			}
			@wtnt3 = split (//, $wtnt3);
			if (@wtnt3 != 3 and $refvarstart-$fs-1>=0) {			#some times there are database annotation errors (example: chr17:3,141,674-3,141,683), so the last coding frame is not complete and as a result, the cDNA sequence is not complete
				$function->{$index}{unknown} = "UNKNOWN";
				next;
			}
			
			if ($refstrand eq '-') {					#change the observed nucleotide to the reverse strand
				$obs = revcom ($obs);
			}
			
			if ($start == $end) {
				if ($ref eq '-') {					#insertion variant
					#the insertion coordinate system in ANNOVAR always uses "position after the current site"
					#in positive strand, this is okay
					#in negative strand, the "after current site" becomes "before current site" during transcription
					#therefore, appropriate handling is necessary to take this into account
					#for example, for a trinucleotide GCC with frameshift of 1 and insertion of CCT
					#in positive strand, it is G-CTT-CC
					#but if the transcript is in negative strand, the genomic sequence should be GC-CCT-C, and transcript is G-AGG-GC
					if ($refstrand eq '+') {
						if ($fs == 1) {
							$varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2];
						} elsif ($fs == 2) {
							$varnt3 = $wtnt3[0] . $wtnt3[1] . $wtnt3[2] . $obs;
						} else {
							$varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2];
						}
					} elsif ($refstrand eq '-') {
						if ($fs == 1) {
							$varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2];
						} elsif ($fs == 2) {
							$varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2];
						} else {
							$varnt3 = $obs . $wtnt3[0] . $wtnt3[1] . $wtnt3[2];
						}
					}
					($wtaa, $wtaa_after, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($wtnt3_after), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1);
					$wtaa_after and $wtaa_after eq '*' and $wtaa_after = 'X';		#wtaa_after could be undefined, if the current aa is the stop codon (X) (example: 17        53588444        53588444        -       T)

					my $canno = "c." . ($refvarstart-$refcdsstart+1) .  "_" . ($refvarstart-$refcdsstart+2) . "ins$obs";		#cDNA level annotation
					if (length ($obs) % 3 == 0) {
						if ($wtaa eq '*') {			#mutation on stop codon
							if ($varaa =~ m/\*/) {
								$varaa =~ s/\*.*/X/;	#delete all aa after stop codon, but keep the aa before
								$function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,";		#stop codon is stil present
							} else {
								$function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,";	#stop codon is lost
							}
						} else {
							if ($varaa =~ m/\*/) {
								$varaa =~ s/\*.*/X/;	#delete all aa after stop codon, but keep the aa before
								$function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,";
							} else {
								$function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,";
							}
						}
					} else {
						if ($wtaa eq '*') {			#mutation on stop codon
							if ($varaa =~ m/\*/) {		#in reality, this cannot be differentiated from non-frameshift insertion, but we'll still call it frameshift
								$varaa =~ s/\*.*/X/;	#delete all aa after stop codon, but keep the aa before
								$function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,";
							} else {
								$function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,";
							}
						} else {
							if ($varaa =~ m/\*/) {
								$varaa =~ s/\*.*/X/;	#delete all aa after stop codon, but keep the aa before
								$function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "_$wtaa_after" . ($varpos+1) . "delins$varaa,";
							} else {
								$function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,";
							}
						}
					}
				} elsif ($obs eq '-') {					#single nucleotide deletion
					my $deletent;
					if ($fs == 1) {
						$deletent = $wtnt3[1];
						$varnt3 = $wtnt3[0].$wtnt3[2].$wtnt3_after;
					} elsif ($fs == 2) {
						$deletent = $wtnt3[2];
						$varnt3 = $wtnt3[0].$wtnt3[1].$wtnt3_after;
					} else {
						$deletent = $wtnt3[0];
						$varnt3 = $wtnt3[1].$wtnt3[2].$wtnt3_after;
					}
					($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3),  int(($refvarstart-$refcdsstart)/3)+1);
					my $canno = "c." . ($refvarstart-$refcdsstart+1) . "del$deletent";
					if ($wtaa eq '*') {				#mutation on stop codon
						if ($varaa =~ m/\*/) {			#stop codon is still stop codon
							$function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "X,";	#changed fsdel to nfsdel on 2011feb19
						} else {				#stop codon is lost
							$function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "$varaa,";
						}
					} else {
						if ($varaa =~ m/\*/) {			#new stop codon created
							$function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "X,";
						} else {
							$function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,";
						}
					}
				} elsif (length ($obs) > 1) {				#block substitution (since start==end, this changed from 1nt to several nt)
					if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) {
						$function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,";
					} else {
						$function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,";
					}
				} else {						#single nucleotide substitution variant
					my $canno;
					if ($fs == 1) {
						$varnt3 = $wtnt3[0] . $obs . $wtnt3[2];
						$canno = "c.$wtnt3[1]" . ($refvarstart-$refcdsstart+1) . $obs;
					} elsif ($fs == 2) {
						$varnt3 = $wtnt3[0] . $wtnt3[1]. $obs;
						$canno = "c.$wtnt3[2]" . ($refvarstart-$refcdsstart+1) . $obs;
					} else {
						$varnt3 = $obs . $wtnt3[1] . $wtnt3[2];
						$canno = "c.$wtnt3[0]" . ($refvarstart-$refcdsstart+1) . $obs;
					}
					($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1);
					
					if ($wtaa eq $varaa) {
						$wtaa eq '*' and ($wtaa, $varaa) = qw/X X/;		#change * to X in the output
						$function->{$index}{ssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,";
					} elsif ($varaa eq '*') {
						$function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa${varpos}X,";
					} elsif ($wtaa eq '*') {
						$function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos$varaa,";
					} else {
						$function->{$index}{nssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,";
					}
				}
			} elsif ($obs eq '-') {				#deletion variant involving several nucleotides
				($wtaa, $varpos) = (translateDNA ($wtnt3), int(($refvarstart-$refcdsstart)/3)+1);		#wildtype amino acid, position of amino acid
				my ($varposend, $canno);		#the position of the last amino acid in the deletion
				if ($refvarstart<=$refcdsstart) {	#since the first amino acid is deleted, the whole gene is considered deleted
					$function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:wholegene,";	#it is exonic variant, so the varend has to hit the first exon
				} elsif ($refvarend >= $cdslen->{$seqid}+$refcdsstart) {	#3' portion of the gene is deleted
					$varposend = int ($cdslen->{$seqid}/3);		#cdslen should be multiples of 3, but just in case of database mis-annotation
					$canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($cdslen->{$seqid}+$refcdsstart-1) . "del";
					$function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
				} elsif (($refvarend-$refvarstart+1) % 3 == 0) {
					$varposend = int (($refvarend-$refcdsstart)/3) + 1;
					$canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del";
					$function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
				} else {
					$varposend = int (($refvarend-$refcdsstart)/3) + 1;
					$canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del";
					$function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
				}
			} else {							#block substitution event
				if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) {
					$function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,";
				} else {
					$function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,";
				}
			}
		}
	}
	
	for my $index (sort {$a<=>$b} keys %$function) {
		if ($separate) {			#print out each type of exonic mutations separately (one effect in one line), rather than printing out only the most important function
			if ($function->{$index}{fsins}) {
				print EXONIC "line$index\t", "frameshift insertion\t$function->{$index}{fsins}\t", $varinfo{$index}, "\n";
			}
			if ($function->{$index}{fsdel}) {
				print EXONIC "line$index\t", "frameshift deletion\t$function->{$index}{fsdel}\t", $varinfo{$index}, "\n";
			}
			if ($function->{$index}{fssub}) {
				print EXONIC "line$index\t", "frameshift substitution\t$function->{$index}{fssub}\t", $varinfo{$index}, "\n";
			} 
			if ($function->{$index}{stopgain}) {
				print EXONIC "line$index\t", "stopgain SNV\t$function->{$index}{stopgain}\t", $varinfo{$index}, "\n";
			} 
			if ($function->{$index}{stoploss}) {
				print EXONIC "line$index\t", "stoploss SNV\t$function->{$index}{stoploss}\t", $varinfo{$index}, "\n";
			} 
			if ($function->{$index}{nfsins}) {
				print EXONIC "line$index\t", "nonframeshift insertion\t$function->{$index}{nfsins}\t", $varinfo{$index}, "\n";
			} 
			if ($function->{$index}{nfsdel}) {
				print EXONIC "line$index\t", "nonframeshift deletion\t$function->{$index}{nfsdel}\t", $varinfo{$index}, "\n";
			} 
			if ($function->{$index}{nfssub}) {
				print EXONIC "line$index\t", "nonframeshift substitution\t$function->{$index}{nfssub}\t", $varinfo{$index}, "\n";
			} 
			if ($function->{$index}{nssnv}) {
				print EXONIC "line$index\t", "nonsynonymous SNV\t$function->{$index}{nssnv}\t", $varinfo{$index}, "\n";
			} 
			if ($function->{$index}{ssnv}) {
				print EXONIC "line$index\t", "synonymous SNV\t$function->{$index}{ssnv}\t", $varinfo{$index}, "\n";
			} 
			if ($function->{$index}{unknown}) {
				print EXONIC "line$index\t", "unknown\t$function->{$index}{unknown}\t", $varinfo{$index}, "\n";
			}
		} else {				#print out only the most important functional changes (for example, chr3:9931279-9931279 G->A can be both non-synonymous and synonymous mutations based on UCSC gene model)
			print EXONIC "line$index\t";
			my $sortout;
			if ($sortout = $function->{$index}{fsins}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "frameshift insertion\t$sortout\t";
			} elsif ($sortout = $function->{$index}{fsdel}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "frameshift deletion\t$sortout\t";
			} elsif ($sortout = $function->{$index}{fssub}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "frameshift substitution\t$sortout\t";
			} elsif ($sortout = $function->{$index}{stopgain}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "stopgain SNV\t$sortout\t";
			} elsif ($sortout = $function->{$index}{stoploss}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "stoploss SNV\t$sortout\t";
			} elsif ($sortout = $function->{$index}{nfsins}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "nonframeshift insertion\t$sortout\t";
			} elsif ($sortout = $function->{$index}{nfsdel}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "nonframeshift deletion\t$sortout\t";
			} elsif ($sortout = $function->{$index}{nfssub}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "nonframeshift substitution\t$sortout\t";
			} elsif ($sortout = $function->{$index}{nssnv}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "nonsynonymous SNV\t$sortout\t";
			} elsif ($sortout = $function->{$index}{ssnv}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "synonymous SNV\t$sortout\t";
			} elsif ($sortout = $function->{$index}{unknown}) {
				$exonsort and $sortout = sortExonicAnnotation ($sortout);
				print EXONIC "unknown\t$sortout\t";
			}
			print EXONIC $varinfo{$index}, "\n";
		}
	}
}

sub sortExonicAnnotation {
	my ($anno) = @_;
	my @anno1 = split (/,/, $anno);
	my @anno2;
	for my $i (0 .. @anno1-1) {
		my @temp = split (/:/, $anno1[$i]);
		$temp[2] =~ s/^exon//;
		push @anno2, [$anno1[$i], @temp];
	}
	@anno2 = sort {$a->[3] <=> $b->[3] or $a->[2] cmp $b->[2]} @anno2;		#first sort by exon number, then by transcript name
	my @anno3 = map {$_->[0]} @anno2;
	return join (',', @anno3);
}

sub filterQuery {
	open (FIL, ">$outfile.${buildver}_${dbtype1}_filtered") or die "Error: cannot write to output file $outfile.${buildver}_${dbtype1}_filtered: $!\n"; 
	open (DROPPED, ">$outfile.${buildver}_${dbtype1}_dropped") or die "Error: cannot write to output file $outfile.${buildver}_${dbtype1}_dropped: $!\n";
	open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n";
	
	printerr "NOTICE: Variants matching filtering criteria are written to $outfile.${buildver}_${dbtype1}_dropped, other variants are written to $outfile.${buildver}_${dbtype1}_filtered\n";
	
	open (QUERY, $queryfile) or die "Error: cannot read from query file $queryfile: $!\n";
	
	my (%variant, $filedone, $batchdone);
	my ($linecount, $batchlinecount, $invalid, $invalidcount) = (0, 0);
	my ($chr, $start, $end, $ref, $obs, $info);
	while (1) {
		$_ = <QUERY>;
		if (not defined $_) {
			$filedone++;
		} else {
			s/[\r\n]+$//;
			
			if (m/^#/ and $comment) {				#comment line start with #, do not include this is $linecount
				print FIL "$_\n";
				print DROPPED "#comment\t#comment\t$_\n";
				next;
			}
			
			$linecount++;
			$batchlinecount++;
			if ($batchlinecount == $batchsize) {
				$batchdone++;
			}
			
			if ($memfree or $memtotal) {		#if these arguments are specified
				if ($linecount =~ m/00000$/) {						#about 40Mb memory per 10k lines for a typical input dataset
					my ($availmem, $allmem) = currentAvailMemory();
					$verbose and printerr "NOTICE: Current available system memory is $availmem kb (this program uses $allmem bytes memory), after reading $linecount query\n";
					if ($availmem and $availmem <= $memfree+50_000) {		#some subsequent steps may take ~50Mb memory, so here we try to allocate some more memory
						$batchdone++;
					}
					if ($memtotal and $allmem >= $memtotal-50_000) {	#when --memtotal is specified, ensure that program use less memory
						$batchdone++;
					}
				}
			}
	
			$invalid = 0;						#reset invalid status
			
			my @nextline = split (/\s+/, $_);
			($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
			if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
				$invalid++;
			} else {
				($ref, $obs) = (uc $ref, uc $obs);
				$zerostart and $start++;
				$chr =~ s/^chr//;
				if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
					$invalid++;
				} elsif ($ref eq '-' and $obs eq '-' 		#both are empty allele
					or $ref =~ m/[^ACTG0\-]/ 		#non-standard nucleotide code
					or $obs =~ m/[^ACGT0\-]/ 		#non-standard nucleotide code
					or $start =~ m/[^\d]/ 			#start is not a number
					or $end =~ m/[^\d]/ 			#end is not a number
					or $start > $end			#start is more than end
					or $ref ne '0' and $end-$start+1 != length ($ref) 	#length mismatch with ref
					or $ref eq '-' and $start != $end	#length mismatch for insertion
					) {		
					$invalid++;
				}
			}				
			
			if ($invalid) {
				print INVALID $_, "\n";	#invalid record found
				$invalidcount++;
				next;
			}
			
			if ($start == $end and $ref eq '-') {	#insertion
				$obs = "0$obs";
			} elsif ($obs eq '-') {			#deletion
				$obs = $end-$start+1;
			} elsif ($end>$start or $start==$end and length($obs)>1) {	#block substitution	#fixed the bug here 2011feb19
				$obs = ($end-$start+1) . $obs;
			}
			
			if (exists $variant{$chr, $start, $obs}) {
				$variant{$chr, $start, $obs} .= "\n$_";
			} else {
				$variant{$chr, $start, $obs} = "$ref\n$_";
			}
		}
		
		if ($filedone or $batchdone) {
			printerr "NOTICE: Processing next batch with ${\(scalar keys %variant)} unique variants in $batchlinecount input lines\n";
			filterNextBatch (\%variant);
			%variant = ();
			$batchlinecount = 0;				#reset the line count for this batch
			$batchdone = 0;
		}
		if ($filedone) {
			last;
		}
	}
	close (INVALID); close (DROPPED); close (FIL);
	if ($invalidcount) {
		printerr "NOTICE: Variants with invalid input format were written to $outfile.invalid_input\n";
	} else {
		unlink ("$outfile.invalid_input");
	}
}

sub filterNextBatch {
	my ($variant) = @_;
	my $dbfile;
	
	if ($dbtype1 eq 'generic') {
		$dbfile = File::Spec->catfile ($dbloc, $genericdbfile);
	} elsif ($dbtype1 eq 'vcf') {
		$dbfile = File::Spec->catfile ($dbloc, $vcfdbfile);
	} else {
		$dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt");
	}

	open (DB, $dbfile) or die "Error: cannot read from input database file $dbfile: $!\n";
	printerr "NOTICE: Scanning filter database $dbfile...";
	
	my (@record, $chr, $start, $end, $ref, $obs, $score, $qual, $fil, $info);
	my ($rsid, $strand, $ucscallele, $twoallele, $class, $af, $attribute);
	my $count_invalid_dbline;
	while (<DB>) {
		my (@obs2, @score2);				#for 1000G2010 data set in VCF format, some tri-allelic SNPs are present; in the future, some quad-allelic SNPs may be also present in VCF files
		s/[\r\n]+$//;
		m/\S/ or next;					#skip empty lines in the database file (sometimes this occurs)
		m/^#/ and next;					#skip the comment line
		if ($dbtype eq 'avsift') {
			@record = split (/\t/, $_);
			@record == 8 or die "Error: invalid record found in DB file $dbfile (8 tab-delimited fields expected): <$_>\n";
			($chr, $start, $end, $ref, $obs, $score) = @record;
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			if ($score < $sift_threshold) {		#this is a deleterious mutation, skip it (equal sign should not be used, otherwise the score=0 will be skipped)
				next;
			}
		} elsif ($dbtype =~ m/^ljb_/) {
			@record = split (/\t/, $_);
			@record >= 5 or die "Error: invalid record found in DB file $dbfile (at least 5 tab-delimited fields expected): <$_>\n";
			($chr, $start, $end, $ref, $obs, $score) = @record;
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			if (defined $score and defined $score_threshold and $score < $score_threshold) {
				next;
			}
		} elsif ($dbtype =~ m/^snp\d+/) {
			@record = split (/\t/, $_, -1);		#-1 is required before some dbSNP records have many empty tab fields in the end
			@record == 18 or @record == 26 or die "Error: invalid record found in dbSNP database file $dbfile (18 or 26 fields expected but found ${\(scalar @record)}): <$_>\n" . join("\n",@record);
			$record[1] =~ s/^chr// or die "Error: invalid record found in DB file (2nd field should start with 'chr'): <$_>\n";
			($chr, $start, $end, $rsid, $strand, $ucscallele, $twoallele, $class) = @record[1,2,3,4,6,8,9,11];
			$start++;			#UCSC use zero-start system
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			unless ($class eq 'single' or $class eq 'deletion' or $class eq 'in-del' or $class eq 'insertion') {	#enum('unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion')
				next;
			}
	
			my @allele = split (/\//, $twoallele);
			
			#before Jan 2011, only di-allelic SNPs are handled in ANNOVAR
			#@allele == 2 or next;		#many entries have no allele information (for example, rs71010435)
			#in Jan 2011 version, I decided to handle tri-allelic and quad-allelic SNP as well
			
			@allele >= 2 or next;		#Jan 2011 modification
			if ($strand eq '-') {					#handle reverse strand annotation (the vast majority of records in dbSNP should be already in + strand)
				for my $i (0 .. @allele-1) {
					$allele[$i] = revcom ($allele[$i]);
				}
				#$ucscallele = revcom ($ucscallele);		#added Jan 24, 2011 (per Kevin Ha) removed Feb 10, 2011 (per Eric Stawiski)
				#note that some SNPs (e.g., rs28434453) may have multiple location in diferent chromosome or strand; I may want to handle this by a special flag in the future
				#585        chr1    13301   13302   rs28434453      0       -       C       C       C/T     genomic single etc...
				#1367    chr15   102517867       102517868       rs28434453      0       +       G       G       C/T     genomic single etc...
			}
			
			#in-del is usually annotated below, so they require special treatment
			#587     chr1    384538  384539  rs3971283       0       +       T       T       -/ATT   genomic in-del  unknown 0       0       unknown exact   3
			if ($class eq 'in-del') {					#indel are usually annotated as -/xxx, where xxx is the alternative allele
				$obs = length ($ucscallele) . $allele[1];		#prefix a number before the alleles, indicating block substitution
				defined $allele[1] or die "no allele 1 <$_>";
			} elsif ($class eq 'insertion') {
				$start--;
				$obs = "0$allele[1]";
			} elsif ($class eq 'deletion') {
				$obs = length ($ucscallele);
			} else {
				for my $i (0 .. @allele-1) {
					if ($ucscallele eq $allele[$i]) {
						@obs2 = @allele;
						splice (@obs2, $i, 1);
						for my $j (0 .. @obs2-1) {
							push @score2, $rsid;
						}
					}
				}
				if (@obs2) {
					$obs = shift @obs2;
					$score = shift @score2;
				} else {
					$verbose and printerr ("Database error: wildtype base $ucscallele is not part of the allele description in <$_>\n");
					next;
				}
			}
			$score = $rsid;
		} elsif ($dbtype =~ m/^1000g_(\w+)/ or $dbtype =~ m/^1000g2010_(\w+)/ or $dbtype =~ m/^1000g2010\w\w\w_(\w+)/) {	#dbtype1 should NOT be used here
			@record = split (/\t/, $_);
			@record == 5 or @record == 6 or die "Error: invalid record found in 1000G database file $dbfile (5 or 6 fields expected): <$_>\n";
			($chr, $start, $ref, $obs, $af) = @record;			#there is no "END" in 1000G input file
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			if ($maf_threshold) {
				if ($af > 0.5) {					#the frequency is the non-reference allele frequency, which could exceed 0.5
					1-$af >= $maf_threshold or next;
				} else {
					$af >= $maf_threshold or next;
				}
			}
			$score = $af;
		} elsif ($dbtype eq 'generic') {
			($chr, $start, $end, $ref, $obs, $score) = split (/\t/, uc $_);		#make sure to use upper case, as query is always in upper case
			defined $obs or die "Error: the generic database file must contains at least five tab-delimited fields per line (but observed line: $_)\n";
			defined $score or $score = "NA";
			if ($chromosome) {
				$valichr{$chr} or next;
			}
			defined $obs or die "Error: invalid record found in DB file $dbfile (at least 5 fields expected for 'generic' dbtype): <$_>\n";
			if ($start == $end and $ref eq '-') {	#insertion
				$obs = "0$obs";
			}
			if ($obs eq '-') {			#deletion
				$obs = $end-$start+1;
			} elsif ($start != $end) {		#block substitution
				$obs = ($end-$start+1) . $obs;
			}
			if (defined $score and defined $score_threshold and $score < $score_threshold) {
				next;
			}
		} elsif ($dbtype eq 'vcf') {			#vcf file is adopted by 1000 Genomes Project; it can describe both SNPs and indels, and it may contain both summary level statistics and individual level genotype calls
			($chr, $start, $rsid, $ref, $obs, $qual, $fil, $info) = split (/\t/, $_);
			if ($chromosome) {
				$valichr{$chr} or next;
			}

			my ($ac, $an);
			
			if ($info =~ m/AF=([^;]+)/) {
				$score = $1;
				if ($obs =~ m/(\w),(\w)/) {		#1000G November; this format is not really valid because it does not handle tri-allelic SNP
					($obs, @obs2) = ($1, $2);
					@score2 = ($score);
				}
			} elsif ($info =~ m/AC=(\S+?);AN=(\d+)/) {
				my ($alleles, $count) = ($1, $2);
				if ($alleles =~ m/^(\d+),(.+)/) {
					$score = sprintf ("%.3f", $1/$count);
					@score2 = split (/,/, $2);
					@score2 = map {sprintf("%.3f", $_/$count)} @score2;
					($obs, @obs2) = split (/,/, $obs);				#the obs is composed of two alleles
				} else {
					$af = sprintf ("%.3f", $alleles/$count);
					$score = $af;
					#this is an invalid record in 1000GJuly: 1       2266231 rs11589451      C       T,A     .       PASS    AA=c;AC=20;AN=120;DP=237
					if ($obs =~ m/(\w),/) {
						$count_invalid_dbline++;
						$verbose and printerr "WARNING: Invalid input line found in $dbfile (more than one alleles are observed, but only one is annotated with allelic counts): <$_>\n";
						next;
					}
				}
			} else {
				printerr "WARNING: the VCF file does not contain allele frequency information. ANNOVAR cannot process this file\n";
				exit;
			}
			
			if (length ($ref) == 1 and length ($obs) == 1) {#single base substitution
				1;					#the obs and obs2 is already handled
			} elsif ($obs =~ m/^\-((\w)(\w*))$/) {		#deletion (1000G March)
				$2 eq $ref or $ref eq 'N' or die "Error: mismatch of deleted allele and reference allele: <$_>\n";
				$obs = length ($1);
			} elsif ($obs =~ m/^\+(\w+)$/) {		#insertion (1000G March)
				$obs = "0$1";
			} elsif ($ref =~ m/^[ACGTN]+$/ and $obs =~ m/^[ACGTN]+$/) {
				if (length ($obs) == 1) {		#deletion (1000G July)
					substr ($ref, 0, 1) eq $obs or die "Error: mismatch of deleted allele and reference allele: ref=$ref obs=$obs in <$_>\n";
					$start++;
					$obs = length ($ref)-1;
				} elsif (length ($ref) == 1) {		#duplication (1000G July)
					substr ($obs, 0, 1) eq $ref or die "Error: mismatch of duplicated allele and reference allele: ref=$ref obs=$obs in <$_>\n";
					$start++;
					$obs = "0" . substr ($obs, 1);
				}
			} else {
				die "Error: invalid record found in VCF file: ref=$ref obs=$obs <$_>\n";
			}
		} else {
			die "invalid dbtype: $dbtype\n";
		}
		
		if ($variant->{$chr, $start, $obs}) {
			my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs});	#most likely, only one piece of information
			for my $i (0 .. @info-1) {
				print DROPPED join ("\t", $dbtype, $score), "\t", $info[$i], "\n";
			}
			delete $variant->{$chr, $start, $obs};
		}
		if (@obs2) {
			for my $j (0 .. @obs2-1) {
				if ($variant->{$chr, $start, $obs2[$j]}) {
					my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs2[$j]});	#most likely, only one piece of information
					for my $i (0 .. @info-1) {
						print DROPPED join ("\t", $dbtype, $score2[$j]), "\t", $info[$i], "\n";
					}
					delete $variant->{$chr, $start, $obs2[$j]};
				}
			}
		}
	}
	for my $key (keys %$variant) {
		my ($chr, $start, $obs) = split ($;, $key);		#hash key separator
		my ($ref, @info) = split (/\n/, $variant->{$key});
		my $len;
		if ($obs =~ m/^(\d+)(.*)/) {
			($len, $obs) = ($1, $2);
			$obs ||= '-';			#deletion
			if ($len) {
				$end = $start+$len-1;
			} else {
				$end = $start;
			}
		} else {
			$end = $start;
		}
		for my $i (0 .. @info-1) {
			print FIL $info[$i], "\n";
		}
	}
	printerr "Done\n";
	$count_invalid_dbline and printerr "WARNING: $count_invalid_dbline lines in dbfile $dbfile were ignored due to invalid formats\n";
}
	
sub annotateQueryByRegion {
	open (QUERY, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n";
	open (OUT, ">$outfile.${buildver}_$dbtype1") or die "Error: cannot write to output file $outfile.${buildver}_$dbtype1: $!\n";
	open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n";

	my ($regiondb, $parent) = ({}, {});
	
	if ($dbtype eq 'gff3') {
		($regiondb, $parent) = readGFF3RegionAnnotation ();
	} elsif ($dbtype eq 'bed') {
		($regiondb) = readBedRegionAnnotation ();
	} else {
		($regiondb) = readUCSCRegionAnnotation ();
	}
	
	my ($chr, $start, $end, $ref, $obs);
	my ($invalid);
	my ($linecount, $invalidcount) = qw/0 0/;
	
	$time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n";
	while (<QUERY>) {
		s/[\r\n]+$//;
		
		if (m/^#/ and $comment) {				#comment line start with #, do not include this is $linecount
			print OUT "#comment\t#comment\t$_\n";
			next;
		}
		
		$linecount++;

		$invalid = 0;						#reset invalid status

		my @nextline = split (/\s+/, $_);
		($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
		if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
			$invalid++;
		} else {
			($ref, $obs) = (uc $ref, uc $obs);
			$zerostart and $start++;
			$chr =~ s/^chr//;
			if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
				$invalid++;
			} elsif ($ref eq '-' and $obs eq '-' 		#both are empty allele
				or $ref =~ m/[^ACTG0\-]/ 		#non-standard nucleotide code
				or $obs =~ m/[^ACGT0\-]/ 		#non-standard nucleotide code
				or $start =~ m/[^\d]/ 			#start is not a number
				or $end =~ m/[^\d]/ 			#end is not a number
				or $start > $end			#start is more than end
				or $ref ne '0' and $end-$start+1 != length ($ref) 	#length mismatch with ref
				or $ref eq '-' and $start != $end	#length mismatch for insertion
				) {		
				$invalid++;
			}
		}


		if ($invalid) {
			print INVALID $_, "\n";	#invalid record found
			$invalidcount++;
			next;
		}

		my $bin1 = int ($start/$genomebinsize);		#start bin
		my $bin2 = int ($end/$genomebinsize);		#end bin (usually same as start bin, unless the query is really big that spans multiple megabases)
		my ($foundhit, $score, $name);
		for my $bin ($bin1 .. $bin2) {
			for my $nextgene (@{$regiondb->{$chr, $bin}}) {
				my ($txstart, $txend, $txscore, $txname) = @$nextgene;
				
				if ($end < $txstart) {
					#db:            <------------------------->
					#query: <--->
					last;						#if genomic region is too far away from end, end the search of the bins
				} elsif ($end <= $txend) {				#query contained completely within db region
					if ($start >= $txstart) {
						#db:      <-------------------------->
						#query:       <------------------>
					} else {					#query overlap but upstream of db region
						#db:       <------------------------->
						#query: <---------------------->
						if ($minqueryfrac) {
							if (($end-$txstart+1)/($end-$start+1) < $minqueryfrac) {
								next;
							}
						}
					}
					$foundhit++;
					$score ||= $txscore; $name ||= $txname;
					if ($score < $txscore) {
						$score = $txscore;
						$name=$txname;
					}
					if ($score == $txscore and defined $name and $name ne $txname) {
						$name .= ",$txname";
					}
					if ($dbtype1 eq 'cytoBand') {			#a new chromosome band is encountered
						$name ne $txname and $name .= ",$txname";
					}
				} elsif ($start <= $txend) {
					if ($start >= $txstart) {			#query overlap but downstream of db region
						#db:      <------------------------>
						#query:        <----------------------->
						if ($minqueryfrac) {
							if (($txend-$start+1)/($end-$start+1) < $minqueryfrac) {
								next;
							}
						}
					} else {
						#db region completely contained within query
						#db:      <------------------------->
						#query: <------------------------------>
						if ($minqueryfrac) {
							if (($txend-$txstart+1)/($end-$start+1) < $minqueryfrac) {
								next;
							}
						}
					}
					$foundhit++;
					$score ||= $txscore; $name ||= $txname;
					if ($score < $txscore) {
						$score = $txscore;
						$name=$txname;
					}
					if ($score == $txscore and defined $name and $name ne $txname) {
						$name .= ",$txname";
					}
					if ($dbtype1 eq 'cytoBand') {			#a new chromosome band is encountered
						$name ne $txname and $name .= ",$txname";
					}
				} else {
					#query            ---
					#gene  <-*----*->
				}
			}
		}
		$linecount =~ m/000000$/ and printerr "NOTICE: Finished processing $linecount variants in queryfile\n";
		if ($foundhit) {
			$name ||= '';
			my @name = split (/,/, $name);
			my %name = map {$_, 1} @name;
			@name = keys %name; 
			
			if ($dbtype1 eq 'cytoBand') {
				map {s/^chr//} @name;
				if (@name >= 2) {
					$name[$#name] =~ s/^\d+//;
					$name = $name[0] . '-' . $name[$#name];
				} else {
					$name = $name[0];
				}
				print OUT "$dbtype\t$name\t$_", "\n";
			} else {
				$name = join (",", @name);
				print OUT "$dbtype\t", $score?"Score=$score;":"", $name?"Name=$name":"", "\t", $_, "\n";
			}
		}
	}
	close (QUERY);
	close (OUT);
	close (INVALID);
	$time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n";
	
	printerr "NOTICE: Finished region-based annotation on $linecount genetic variants in $queryfile";
	if ($invalidcount) {
		printerr " (including $invalidcount with invalid format written to $outfile.invalid_input)";
	} else {
		unlink ("$outfile.invalid_input");
	}
	printerr "\n";
	printerr "NOTICE: Output files were written to $outfile.${buildver}_$dbtype1\n";
}

sub readGFF3RegionAnnotation {
	my ($dbfile);
	my ($regioncount, $dbcount) = (0, 0);
	my (@record, %regiondb, %parent);
	
	$dbfile = File::Spec->catfile ($dbloc, $gff3dbfile);
	-f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc -buildver $buildver' to download annotation database.\n";
	
	open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
	printerr "NOTICE: Reading annotation database $dbfile ... ";
	$_ = <DB>;
	$_ =~ m/^##gff-version\s+3/ or die "Error: invalid header line found in the GFF3 database $dbfile (expect to see '##gff-version 3'): <$_>\n";
	while (<DB>) {
		m/^#/ and next;			#skip comments line
		m/^##FASTA/ and last;		#reached the FASTA sequence section of GFF3 file
		$dbcount++;
		s/[\r\n]+$//;							#deleting the newline characters
		@record = split (/\t/, $_);
		@record == 9 or die "Error: invalid records found in the GFF3 database $dbfile (9 fields expected): <$_>\n";
		my ($chr, $start, $end, $score, $attribute) = @record[0,3,4,5,8]; 
		$chr=~s/^chr//;			#sometimes the chr prefix is present and should be removed (query usually does not contain this chr prefix)
		my $name;
		defined $score_threshold and $score < $score_threshold and next;			#if --score_threshold is set, the low scoring segment will be skipped

		my @feature = split (/;/, $attribute);
		for my $i (0 .. @feature-1) {
			$feature[$i] =~ m/ID=(\S+)/ and $name = $1;
		}
		defined $name or die "Error: invalid record in GFF3 database $dbfile (ID field not found): <$_>\n";
		for my $i (0 .. @feature-1) {
			if ($feature[$i] =~ m/Parent=(.+)/) {
				my @parent = split (/,/, $1);
				for my $j (0 .. @parent-1) {
					$parent{$name} .= $parent[$j];
				}
			}
		}
		
		my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			push @{$regiondb{$chr, $nextbin}}, [$start, $end, $score, $name];
		}
		$regioncount++;
		if ($verbose and $dbcount =~ m/000000$/) {
			my ($availmem, $allmem) = currentAvailMemory();
			printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
		}
	}
	close (DB);
	for my $key (keys %regiondb) {						#pre-sort gene DB by txstart to faciliate future use
		@{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
	}
	printerr "Done with $regioncount regions\n";
	return (\%regiondb, \%parent);
}

sub readBedRegionAnnotation {
	my ($dbfile);
	my ($regioncount, $dbcount) = (0, 0);
	my (@record, %regiondb);
	my ($chr, $start, $end);
	
	$dbfile = File::Spec->catfile ($dbloc, $bedfile);

	-f $dbfile or die "Error: required bedfile $dbfile does not exists.\n";
	
	open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
	printerr "NOTICE: Reading annotation database $dbfile ... ";

	while (<DB>) {
		$dbcount++;
		s/[\r\n]+$//;							#deleting the newline characters
		@record = split (/\t/, $_);
		
		($chr, $start, $end) = @record;


		$chr =~ s/^chr//;
		$start++;										#due to the zero-opening coordinate system in UCSC
		
		my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			push @{$regiondb{$chr, $nextbin}}, [$start, $end, 0, 'NA'];
		}
		$regioncount++;
		if ($verbose and $dbcount =~ m/000000$/) {
			my ($availmem, $allmem) = currentAvailMemory();
			printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
		}
	}
	close (DB);
	
	for my $key (keys %regiondb) {						#pre-sort gene DB by txstart to faciliate future use
		@{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
	}
	printerr "Done with $regioncount regions\n";
	return (\%regiondb);
}

sub readUCSCRegionAnnotation {
	my ($dbfile);
	my ($regioncount, $dbcount) = (0, 0);
	my (@record, %regiondb);
	my ($chr, $start, $end, $score, $normscore, $name);
	my ($expectedLength, @positionCols, @scoreCols, @colsToOutput);
	
	if ($dbtype1 =~ m/^mce(\d+way)$/) {
		$dbfile = File::Spec->catfile ($dbloc, "${buildver}_phastConsElements$1.txt");
	} else {
		$dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt");
	}
	-f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc' to download annotation database.\n";

	#################$$$
	### The following SWITCH structure is modified Jan 2011 to faciliate future expansion
	### $expectedLength is the number of cols expected in each line
	### @postionCols => location of ($chr,$start,$end) columns
	### @scoreCols => location of ($score, $normscore) columns leave empty is set not present (then set to zero below) ; WARNING must be empty or of length 2
	### @colsToOutPut => location of ($name) columns to put into $name concatinated with ":" below

	if ($dbtype1 =~ m/^phastConsElements\d+way/) {
		$expectedLength=6;
		@positionCols=(1,2,3);
		@scoreCols=(4,5);		#normalized score
		@colsToOutput=(4);		#lod=xxx is the Name output
	} elsif ($dbtype1 eq 'evofold') {
		$expectedLength=10;
		@positionCols=(1,2,3);
		@scoreCols=(5,5);
		@colsToOutput=(4);
	} elsif ($dbtype1 eq 'tfbsConsSites') {
		$expectedLength=8;
		@positionCols=(1,2,3);
		@scoreCols=(7,5);
		@colsToOutput=(4);
	} elsif ($dbtype1 eq 'wgRna') {
		$expectedLength=10;
		@positionCols=(1,2,3);
		@scoreCols=(5,5);
		@colsToOutput=(4);
	} elsif ($dbtype1 eq 'targetScanS') {
		$expectedLength=7;
		@positionCols=(1,2,3);
		@scoreCols=(5,5);
		@colsToOutput=(4);
	} elsif ($dbtype1 eq 'genomicSuperDups') {
		$expectedLength=30;
		@positionCols=(1,2,3);
		@scoreCols=(27,27);
		@colsToOutput=(4);
	} elsif ($dbtype1 eq 'omimGene') {
		$expectedLength=5;
		@positionCols=(1,2,3);
		@scoreCols=();
		@colsToOutput=(4);
	} elsif ($dbtype1 eq 'gwasCatalog') {
		$expectedLength=23;
		@positionCols=(1,2,3);
		@scoreCols=();
		@colsToOutput=(10);
	} elsif ($dbtype1 eq 'dgv') {
		$expectedLength=16;
		@positionCols=(1,2,3);
		@scoreCols=();
		@colsToOutput=(4); 
	} elsif ($dbtype1 eq 'cytoBand') {		#special handling required
		$expectedLength=5;
		@positionCols=(0,1,2);
		@scoreCols=();
		@colsToOutput=(0,3);
	} elsif ($dbtype1 =~ m/^chr\w+_chainSelf$/) {	#example: chr1_selfChain
		$expectedLength=13;
		@positionCols=(2,4,5);
		@scoreCols=(12,12);
		@colsToOutput=(11);
	} elsif ($dbtype1 =~ m/^chr\w+_chain\w+$/) {	#example: chr1_chainPanTro2
		$expectedLength=12;
		@positionCols=(2,4,5);
		@scoreCols=();
		@colsToOutput=(11);
	} elsif ($dbtype1 eq 'snp130' or $dbtype1 eq 'snp131') {	
		$expectedLength=18;
		@positionCols=(1,2,3);
		@scoreCols=();
		@colsToOutput=(4);
	} else {
		#other UCSC format if file is not defined above
		$expectedLength='';
		@positionCols=(1,2,3);
		@scoreCols=();
		@colsToOutput=(4);
	}
	
	if ($scorecolumn) {
		@scoreCols = ($scorecolumn, $scorecolumn);
	}
	
	open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
	printerr "NOTICE: Reading annotation database $dbfile ... ";
	
	if ($expectedLength eq '') {		# if DB is unknown "generic format" use first line to get $expectedLength : file rewound afterwards
		my $line = <DB>;
		@record = split (/\t/, $line);
		$expectedLength=@record;
		seek (DB, 0, 0);
	};
	
	########$$ Check to see if user has defined columns to output (intergers or all allowed)
	if (defined $colsWanted) { 
		if ($colsWanted[0] eq 'all') {
			@colsToOutput= 0 .. ($expectedLength-1);
		} elsif ($colsWanted[0] eq 'none') {
			@colsToOutput = ();
		} else{
			@colsToOutput = @colsWanted;
		}
	};

	########$$ check that the columns requested exist in the current DB
	for my $i (0 .. @colsToOutput-1) {
		if ($colsToOutput[$i] > $expectedLength) {
			die "Error: The DB file $dbfile has only $expectedLength columns but output column $colsToOutput[$i] is requested by --colsWanted!\n";
		}
	}

	while (<DB>) {
		$dbcount++;
		s/[\r\n]+$//;							#deleting the newline characters
		@record = split (/\t/, $_);
		
		@record == $expectedLength or die "Error: invalid record in dbfile $dbfile ($expectedLength fields expected): <$_>\n";
		($chr, $start, $end) = @record[@positionCols];
		if (@colsToOutput) {		#I think there should always be a Name in the output column
			$name = join (':', @record[@colsToOutput]);
		}
		
		if(@scoreCols){
			($score, $normscore)=(@record[@scoreCols])
		} else{
			($score, $normscore) = qw/0 0/;
		}
		
		#########$$ Unusual exceptions for phastCons
		if ($dbtype1 =~ m/^phastConsElements\d+way/) {
			$score =~ s/^lod=// or die "Error: invalid lod score designation (no 'lod=' found) in dbfile $dbfile: <$_>\n";
		} ##lod= in the score for conservation tracks
		
		#########$$ Unusual exceptions for cytoBand
		if ($dbtype1 eq 'cytoBand' and not defined $colsWanted) {	#the name for chromosome band is concatenated as single word
			$name =~ s/://;
		}
		
		defined $score_threshold and $score < $score_threshold and next;			#if --score_threshold is set, the low scoring segment will be skipped
		defined $normscore_threshold and $normscore < $normscore_threshold and next;		#if --normscore_threshold is set, the low scoring segment will be skipped
		
		$chr =~ s/^chr//;
		$start++;										#due to the zero-opening coordinate system in UCSC
		
		my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			if ($rawscore) {								#print out rawscore, rather than normalized score (default)
				$normscore = $score;
			}
			if (defined $name) {
				push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore, $name];
			} else {			#name is not requested in the output
				push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore];
			}
		}
		$regioncount++;
		if ($verbose and $dbcount =~ m/000000$/) {
			my ($availmem, $allmem) = currentAvailMemory();
			printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
		}
	}
	close (DB);
	
	for my $key (keys %regiondb) {						#pre-sort gene DB by txstart to faciliate future use
		@{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
	}
	printerr "Done with $regioncount regions";
	if (defined $score_threshold or $normscore_threshold) {
		printerr " (that passed --score_threhsold or --normscore_threshold from a total of $dbcount regions)\n";
	} else {
		printerr "\n";
	}
	return (\%regiondb);
}


sub translateDNA {
	my ($seq) = @_;
	my ($nt3, $protein);
	$seq = uc $seq;
	#length ($seq) % 3 == 0 or printerr "WARNING: length of DNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n";
	while ($seq =~ m/(...)/g) {
		defined $codon1{$1} or printerr "WARNING: invalid triplets found in DNA sequence to be translated: <$1>\n";
		$protein .= $codon1{$1};
	}
	return $protein;
}

sub translateRNA {
	my ($seq) = @_;
	my ($nt3, $protein);
	$seq = uc $seq;
	#length ($seq) % 3 == 0 or printerr "WARNING: length of RNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n";
	while ($seq =~ m/(...)/g) {
		defined $codonr1{$1} or printerr "WARNING: invalid triplets found in RNA sequence to be translated: <$1>\n";
		$protein .= $codonr1{$1};
	}
	return $protein;
}

sub revcom {
	my ($seq) = @_;
	$seq = reverse $seq;
	$seq =~ tr/acgtACGT/tgcaTGCA/;
	return ($seq);
}

sub readSeqFromFASTADB {
	my ($refseqvar) = @_;
	my (%seqhash);
	my $seqdbfile;
	
	#the four statements below should be condensed in the future (they are identical)
	$seqdbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1" . "Mrna.fa");

	my ($seqid, $curseq) = ('', '');

	-f $seqdbfile or die "Error: FASTA sequence file $seqdbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc' download the database.\n";
	open (SEQ, $seqdbfile) or die "Error: cannot read from seqdbfile $seqdbfile: $!\n";
	printerr "NOTICE: Reading FASTA sequences from $seqdbfile ... ";
	while (<SEQ>) {
		if (m/^>(\S+)/) {
			if ($refseqvar->{$seqid}) {
				not defined $seqhash{$seqid} and $seqhash{$seqid} = $curseq;		#finish reading the sequence for seqid and save it (unless the sequence is already read from the file)
			}
			$seqid = $1;
			$curseq = '';
		} else {
			if ($refseqvar->{$seqid}) {
				s/[\r\n]+$//;
				$curseq .= uc $_;					#only use upper case characters
			}
		}
	}
	if ($refseqvar->{$seqid}) {			#finish the last sequence in the file
		not defined $seqhash{$seqid} and $seqhash{$seqid} = $curseq;
	}
	close (SEQ);
	printerr "Done with ", scalar keys %seqhash, " sequences\n";
	if (keys %seqhash < keys %$refseqvar) {
		my (@seqnotfound, @seqnotfound_example);
		for $seqid (keys %$refseqvar) {
			exists $seqhash{$seqid} or push @seqnotfound, $seqid;
		}
		printerr "WARNING: A total of ${\(scalar @seqnotfound)} sequences cannot be found in $seqdbfile";
		@seqnotfound_example = splice (@seqnotfound, 0, 3);
		printerr " (example: @seqnotfound_example)\n";
	}
	return (\%seqhash);
}

sub readKgXref {
	my ($inputfile) = @_;
	my (%gene_xref);
	open (XREF, $inputfile) or die "Error: cannot read from kgxref file $inputfile: $!\n";
	while (<XREF>) {
		m/^#/ and next;
		s/[\r\n]+$//;
		my @record = split (/\t/, $_);
		@record == 8 or die "Error: invalid record found in knownGene cross-reference file (6 fields expected): <$_>\n";
		#some genes were given names that are prefixed with "Em:" which should be removed due to the presence of ":" in exonic variant annotation
		#Em:AC006547.7 Em:AC005003.4 Em:U62317.15 Em:AC008101.5 Em:AC004997.11 Em:U51561.2
		$record[4] =~ s/^Em:/Em./;
		if ($gene_xref{$record[0]}) {			#BC003168 occur twice in kgxref file (OSBPL10, BC003168)
			if ($gene_xref{$record[0]} =~ m/^(BC|AK)\d+$/) {
				$gene_xref{$record[0]} = $record[4];
			}
		} else {
			$gene_xref{$record[0]} = $record[4];
		}
	}
	close (XREF);
	return (\%gene_xref);
}

sub readUCSCGeneAnnotation {			#read RefGene annotation database from the UCSC Genome Browser, convert 0-based coordinates to 1-based coordinates
	my ($dbloc) = @_;
	my ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes);
	my (%genedb, %geneidmap, %name2count, %cdslen, %mrnalen);
	my ($genecount, $ncgenecount) = (0, 0);
	
	my $dbfile;
	my $kgxref;
	
	if ($dbtype1 eq 'refGene') {
		$dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt");
	} elsif ($dbtype1 eq 'knownGene') {
		$dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt");
		my $kgxreffile = File::Spec->catfile($dbloc, $buildver . "_kgXref.txt");
		-f $kgxreffile or die "Error: the knownGene cross-reference file $kgxreffile does not exist. Please use 'annotate_variation.pl --downdb knownGene $dbloc' to download the database.\n";
		$kgxref = readKgXref ($kgxreffile);
	} elsif ($dbtype1 eq 'ensGene') {
		$dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt");
	} else {
		$dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt");		#added 2011feb18
		#die "FATAL ERROR: the dbype $dbtype1 is not supported in the readUCSCGeneAnnotation() subroutine.\n";		#commented 2011feb18
	}
	-f $dbfile or die "Error: The gene annotation database $dbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc -build $buildver' to download the database.\n";

	open (GENEDB, $dbfile) or die "Error: cannot read from gene annotaion database $dbfile: $!\n";
	printerr "NOTICE: Reading gene annotation from $dbfile ... ";
	while (<GENEDB>) {
		s/[\r\n]+$//;							#deleting the newline characters
		my @record = split (/\t/, $_);

		if ($dbtype1 eq 'refGene') {
			@record == 16 or die "Error: invalid record in $dbfile (expecting 16 tab-delimited fields in refGene file): <$_>\n";
			($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15];		#human hg18, mouse
		} elsif ($dbtype1 eq 'knownGene') {
			@record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in knownGene file): <$_>\n";	#mm8=11, hg18=hg19=12
			($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend) = @record[0..9];
			$name2 = $kgxref->{$name} || $name;
		} elsif ($dbtype1 eq 'ensGene') {
			@record == 16 or die "Error: invalid record in $dbfile (expecting 16 fields in ensGene file): <$_>\n";
			($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15];
		} else {
			@record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in $dbtype1 gene definition file): <$_>\n";
			($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15];
			defined $name2 or $name2=$name;
			#die "FATAL ERROR: the --dbtype $dbtype is not supported in readUCSCGeneAnnotation() subroutine.\n";		#commented 2011feb18
		}
	
		#handle situations where the same transcript is mapped to several chromosomes or regions (for example, NM_019105 is mapped to chr6, chr6_cox_hap1, chr6_qbl_hap2; NM_002538 is mapped to chr5 positive and negative strand and also in chr5_h2_hap1)
		if ($chr =~ m/hap\d+$/) {
			next;			#this is a temporary solution on 2011feb19, to ignore alternative haplotype chromosomes
		}
	
		$chr =~ s/^chr// or die "Error: invalid record found in $dbfile (chrom field not found): <$_>\n";						#UCSC always prefix "chr" to the chromosome identifier, so this is a good check to make sure that the file is the correct file
		$dbstrand eq '+' or $dbstrand eq '-' or die "Error: invalid dbstrand information found in $dbfile (dbstrand has to be + or -): <$_>\n";		#dbstrand is important to know and cannot be optional
		my @exonstart = split (/,/, $exonstart); 			#remove trailing comma
		my @exonend = split (/,/, $exonend);				#remove trailing comma
		$exoncount == @exonstart or die "Error: invalid record found in $dbfile (exoncount discordance): <$exoncount vs ${\(scalar @exonstart)}>\n";
		@exonstart == @exonend or die "Error: invalid record found in $dbfile (exonstart and exonend count discordance): <${\(scalar @exonstart)} vs ${\(scalar @exonend)}>\n";
		$txstart++; $cdsstart++; map {$_++} @exonstart;			#convert 0-based coordinate to 1-based coordinate

		#LOGIC here:
		#first calcluate mRNA length, and if the transcript maps to multiple locations with discordant mRNA length, only consider the leftmost chromosome and leftmost coordinate (because the FASTA file is sorted in this manner)

		my $cdslength = 0;
		my $mrnalength = 0;
		for my $i (0 .. @exonstart-1) {					#this calculation is valid regardless of strand
			$mrnalength += $exonend[$i]-$exonstart[$i]+1;
			if ($cdsstart >= $exonstart[$i] and $cdsstart <= $exonend[$i]) {
				if ($cdsend <= $exonend[$i]) {
					$cdslength = $cdsend-$cdsstart+1;
					last;
				} else {
					$cdslength += $exonend[$i]-$cdsstart+1;
					next;
				}
			}
			if ($cdslength and $cdsend < $exonstart[$i]) {
				die "FATAL ERROR: impossible scenario for $name in $dbfile (cdsend is less than exon start)";
			} elsif ($cdslength and $cdsend <= $exonend[$i]) {
				$cdslength += $cdsend-$exonstart[$i]+1;
				last;
			} elsif ($cdslength and $cdsend > $exonend[$i]) {
				$cdslength += $exonend[$i]-$exonstart[$i]+1;
			}
			
		}
		
		if ($cdsstart != $cdsend+1) {		#coding gene
			if (defined $mrnalen{$name} and $mrnalen{$name} != $mrnalength) {
				$verbose and printerr "WARNING: $name occurs more than once in $dbfile with different mRNA length. The first occurences with identical mRNA length will be uesd in analysis.\n";
				next;
			}
			
			
			if (defined $cdslen{$name} and $cdslen{$name} != $cdslength) {
				$verbose and printerr "WARNING: $name occurs more than once in $dbfile with different CDS length. The first occurences with identical mRNA length will be uesd in analysis.\n";
				next;
			}
		}
		
		$cdslen{$name} = $cdslength;
		$mrnalen{$name} = $mrnalength;
				
		my ($bin1, $bin2) = (int(($txstart - $neargene)/$genomebinsize), int(($txend + $neargene)/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			push @{$genedb{$chr, $nextbin}}, [$name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, [@exonstart], [@exonend], $name2];
		}
		$geneidmap{$name} = $name2;
		$genecount++;
		$name2count{$name2}++;
		$cdsstart == $cdsend+1 and $ncgenecount++;			#non-coding gene has the same start and end site
	} 
	close (GENEDB);
	for my $key (keys %genedb) {						#pre-sort gene DB by txstart to faciliate future use
		@{$genedb{$key}} = sort {$a->[2] <=> $b->[2]} @{$genedb{$key}};
	}
	printerr "Done with $genecount transcripts (including $ncgenecount without coding sequence annotation) for ", scalar (keys %name2count), " unique genes\n";
	return (\%genedb, \%geneidmap, \%cdslen, \%mrnalen);
}

sub downloadDB {
	my ($cwd, $msg, $sc);
	
	$cwd = Cwd::cwd();
	
	-w $dbloc or die "Error: the directory $dbloc is not writable by the current user\n";
	chdir ($dbloc) or die "Error: the directory $dbloc cannot be accessed\n";
	
	my (@urlin, @filein, @fileout, %fail);		#the fail hash contains index of files that fail to be downloaded
	my $count_success;
	if ($dbtype1 eq 'refGene') {
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refGene.txt.gz";
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refLink.txt.gz";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_refGeneMrna.fa.gz";
	} elsif ($dbtype1 eq 'knownGene') {
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/knownGene.txt.gz";
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/kgXref.txt.gz";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_knownGeneMrna.fa.gz";
	} elsif ($dbtype1  eq 'ensGene') {
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/ensGene.txt.gz";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_ensGeneMrna.fa.gz";
	} elsif ($dbtype1 eq 'seq') {
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.zip";		#example: hg18, hg19
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.tar.gz";	#example: panTro2
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/$buildver.fa.gz";	#example: bosTau4
	} elsif ($dbtype1 =~ m/^mce(\d+way)$/) {		#it could be 17 way, 28 way, 30 way, 44 way, etc, depending on genome and on build
		push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/phastConsElements$1.txt.gz";
	} elsif ($dbtype1 eq 'avsift') {
		$buildver eq 'hg18' or $buildver eq 'hg19' or die "Error: currently the --dbtype of avsift only support --buildver of 'hg18' or 'hg19'\n";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_avsift.txt.gz";
	} elsif ($dbtype1 eq '1000g') {				#dbtype1 is same as queryfile, when --downdb is used
		$buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g' only support --buildver of 'hg18'\n";
		push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/CEU.sites.2009_04.gz";
		push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/YRI.sites.2009_04.gz";
		push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/JPTCHB.sites.2009_04.gz";
	} elsif ($dbtype1 eq '1000g2010') {			#dbtype1 is same as queryfile, when --downdb is used
		$buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010' only support --buildver of 'hg18'\n";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_03.txt.gz";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_03.txt.gz";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_03.txt.gz";
	} elsif ($dbtype1 eq '1000g2010jul') {			#dbtype1 is same as queryfile, when --downdb is used
		$buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010jul' only support --buildver of 'hg18'\n";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_07.txt.gz";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_07.txt.gz";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_07.txt.gz";
	} elsif ($dbtype1 eq '1000g2010nov') {
		$buildver eq 'hg19' or die "Error: currently the --dbtype of '1000g2010nov' only support --buildver of 'hg19'\n";
		push @urlin, "http://www.openbioinformatics.org/annovar/download/hg19_ALL.sites.2010_11.txt.gz";
	} elsif ($dbtype1 eq 'null') {
		1;
	} else {
		if ($webfrom) {
			if ($webfrom eq 'annovar') {
				push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_$dbtype1.txt.gz";
			} elsif ($webfrom eq 'ucsc') {
				push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz";
			} else {
				push @urlin, "$webfrom/$dbtype1.txt.gz";
			}
		} else {
			push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz";	#default goes to UCSC
		}
	}
	
	@filein = @urlin;
	map {s/.+\///} @filein;
	@fileout = @filein;
	map {s/\.gz$//; s/\.zip$//} @fileout;
	
	if ($wget) {
		$msg = qx/wget --help 2>&1/ || '';		#collect the output of the system command
	} else {
		$msg = '';					#when --nowget is specified, do not use wget to retrieve files from Internet
	}
	if ($msg =~ m/Usage/) {
		checkProgramUpdate ("wget");
		for my $i (0 .. @urlin-1) {
			printerr "NOTICE: Downloading annotation database $urlin[$i] ... ";
			if ($verbose) {
				$sc = "wget -t 1 -T 10 -O $filein[$i] $urlin[$i]";
			} else {
				$sc = "wget -t 1 -T 10 -q -O $filein[$i] $urlin[$i]";
			}
			if (system ($sc)) {	#time-out is 10 seconds, with 1 retry attempt
				printerr "Failed\n";
				$verbose and print "WARNING: unable to execute system command: <$sc>\n";
				unlink ($filein[$i]);		#delete the temporary files generated by wget
				$fail{$i}++;
			} else {
				printerr "OK\n";
				$count_success++;
			}
		}
	} else {
		eval {
			require Net::FTP;
			require LWP::UserAgent;
		};
		if ($@) {
			printerr "WARNING: cannot retrieve remote files automatically (by 'wget' command or by standard Net::FTP/LWP::UserAgent Perl module).\n";
			printerr "Please manually download the following file, uncompress the files to $dbloc directory, then add a ${buildver}_ prefix to the file names.\n";
			printerr join ("\n", @urlin), "\n";
			exit (100);
		}
		
		checkProgramUpdate ("lwp");
		my ($http, $ftp);
		for my $i (0 .. @urlin-1) {
			printerr "NOTICE: Downloading annotation database $urlin[$i] ... ";
			if ($urlin[$i] =~ m/^http/) {
				$http = LWP::UserAgent->new (timeout=>10, show_progress=>$verbose);
				$http->env_proxy;
				
				my $response = $http->get ($urlin[$i], ':content_file'=>$filein[$i]);
				if ($response->is_success) {
					printerr "Done\n";
					$count_success++;
				} else {
					printerr "Failed\n";
					$verbose and printerr "WARNING: cannot retrieve remote files ($urlin[$i]) via LWP::UserAgent Perl module: ", $response->status_line, "\n";
					$fail{$i}++;
				}
			} elsif ($urlin[$i] =~ m#^ftp://([^\\\/]+)#) {		#for hgdownload.cse.ucsc.edu, ftp-trace.ncbi.nih.gov, ftp.ensembl.org, etc
				my $urlroot = $1;
				if ($ftp = Net::FTP->new($urlroot, Timeout=>10, Debug=>$verbose)) {
					$ftp->login("anonymous", 'anonymous@');
					$ftp->binary();
					my $url = $urlin[$i];
					$url =~ s#ftp://[\w\.\-]+/##;		#remove the URL root
					if (not $ftp->get($url)) {
						printerr "Failed\n";
						$verbose and printerr "WARNING: cannot retrieve remote file ($url) in FTP server $urlroot\n";
						$fail{$i}++;
					} else {
						printerr "Done\n";
						$count_success++;
					}
				} else {
					printerr "Failed\n";
					$verbose and printerr "WARNING: cannot retrieve remote file ($urlin[$i]) via Net::FTP Perl module\n";
					$fail{$i}++;
				}
				
			} else {
				die "Error: The URL $urlin[$i] uses an unsupported protocol. Download cannot continue\n";
			}
		}
	}
	
	$count_success and printerr "NOTICE: Uncompressing downloaded files\n";
	for my $i (0 .. @filein-1) {
		$fail{$i} and next;
		if ($filein[$i] =~ m/\.zip$/) {
			$msg = qx/unzip --help 2>&1/ || '';		#collect the output of the system command
			if ($msg =~ m/Usage/i) {
				if ($verbose) {
					system ("unzip -o $filein[$i]");
				} else {
					system ("unzip -o -q $filein[$i]");
				}
			} else {
				printerr "ERROR: unzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
				exit (101);
			}
		} elsif ($filein[$i] =~ m/\.tar\.gz$/) {		#panTro2 FASTA sequence is stored as tar.gz rather than zip
			$msg = qx/tar --help 2>&1/ || '';		#collect the output of the system command
			if ($msg =~ m/Usage/i) {
				system ("tar -x -z -f $filein[$i]");
			} else {
				printerr "ERROR: tar/gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
				exit (102);
			}
		} elsif ($filein[$i] =~ m/\.gz$/) {
			$msg = qx/gunzip --help 2>&1/ || '';		#collect the output of the system command
			if ($msg =~ m/Usage/i) {
				system ("gunzip -f $filein[$i]");
			} else {
				printerr "ERROR: gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
				exit (103);
			}
		}
	}

	for my $i (0 .. @fileout-1) {
		$fail{$i} and next;				#skip the file that failed to be downloaded
		my $fileout = $fileout[$i];
		$dbtype1 eq 'seq' and next;			#the zip file contains dozens of FASTA files so cannot rename them automatically
		if (not $fileout =~ m/^${buildver}_/) {		#if the buildver is not the prefix of the files
			rename ($fileout, "${buildver}_$fileout") or die "Error: cannot rename $fileout to ${buildver}_$fileout\n";
			$fileout = "${buildver}_$fileout";
		}
		if (not $fileout =~ m/\.txt$/ and not $fileout =~ m/\.fa$/) {
			rename ($fileout, "$fileout.txt");
		}
	}
	
	$count_success and printerr "NOTICE: Finished downloading annotation files for $buildver build version, with files saved at the '$dbloc' directory\n";
	$cwd and chdir ($cwd);
	if (%fail) {
		my @failindex = keys %fail;
		if ($dbtype1 eq 'seq' and @failindex == 1) {	#not really a fail, because for seq, ANNOVAR attempts on tar.gz and zip file
			1;
		} else {
			printerr "WARNING: Some files cannot be downloaded, including ", join (', ', @urlin[@failindex]), "\n";
		}
		
		for my $index (@failindex) {
			if ($urlin[$index] =~ m#^http://www\.openbioinformatics\.org.+Mrna.fa.gz$#) {
				printerr "---------------------------ADDITIONAL PROCEDURE---------------------------\n";
				printerr "--------------------------------------------------------------------------\n";
				printerr "NOTICE: the FASTA file $urlin[$index] is not available to download but can be generated by the ANNOVAR software. ";
				printerr "PLEASE RUN THE FOLLOWING TWO COMMANDS CONSECUTIVELY TO GENERATE THE FASTA FILES:\n\n";
				printerr "\tannotate_variation.pl --buildver $buildver --downdb seq $dbloc/${buildver}_seq\n";
				printerr "\tretrieve_seq_from_fasta.pl $dbloc/${buildver}_$dbtype1.txt -seqdir $dbloc/${buildver}_seq -format $dbtype1 -outfile $dbloc/${buildver}_${dbtype1}Mrna.fa\n";
				printerr "--------------------------------------------------------------------------\n";
				printerr "--------------------------------------------------------------------------\n";
			}
		}
	}	
}

sub currentAvailMemory {
	my ($availmem, $allmem) = (0, 0);
	if ($^O eq "MSWin32") {		#no easy solution to get available memory from Windows.
		($availmem, $allmem) = (0, 0);
	} elsif ($^O eq 'linux' or $^O eq 'aix' or $^O eq 'solaris') {
		if (open (TOP, "top -b -n 1 2>&1 |")) {
			my $index;
			while (<TOP>) {
				if (m/^Mem:.+\s(\d+)k free/) {
					$availmem = $1;
				}
				s/^\s+//;
				my @field = split (/\s+/, $_);
				@field >= 10 or next;			#make sure that the PID lines are reached
				if ($field[0] eq 'PID') {
					for my $i (0 .. @field-1) {
						$field[$i] eq 'RES' and $index = $i;
					}
				}
				if ($field[0] eq $$) {
					defined $index or die "Error: invalid output from top command: the line with PID and RES is not found\n";
					$allmem = $field[$index];
					if ($allmem =~ m/^([\d\.]+)(\w)$/) {
						if ($2 eq 'g') {
							$allmem = $1 * 1_000_000;
						} elsif ($2 eq 'm') {
							$allmem = $1 * 1_000;
						} elsif ($2 eq 'k') {
							$allmem = $1;
						} else {
							printerr "WARNING: unrecognizable output from top command: <$_>\n";
						}
					}
					last;
				}
			}
		}
	} else {
		($availmem, $allmem) = (0, 0);
	}
	return ($availmem, $allmem);
}

sub printerr {
	print STDERR @_;
	print LOG @_;
}

sub checkProgramUpdate {
	my ($method) = @_;
	my $sc;
	my ($curdate, $webdate, $webdate1) = $LAST_CHANGED_DATE;
	my (@webcontent);
	$method eq 'wget' or $method eq 'lwp' or die "Error: update checking method can be only 'wget' or 'lwp'";
	printerr "NOTICE: Web-based checking to see whether ANNOVAR new version is available ... ";
	$LAST_CHANGED_DATE =~ m/LastChangedDate: (\d+)\-(\d+)-(\d+)/ or printerr "Failed\n" and return; 
	$curdate = $1.$2.$3;
	if ($method eq 'wget') {
		$sc = "wget -t 1 -T 10 -q -O .annovar_date http://www.openbioinformatics.org/annovar/download/annovar_date";
		if (system ($sc)) {
			printerr "Failed\n";
			return;
		} else {
			if (not open (AVDATE, ".annovar_date")) {
				printerr "Cannot access version information\n";
			} else {
				printerr "Done\n";
				@webcontent = <AVDATE>;		#$LAST_CHANGED_DATE =	'$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';
				close (AVDATE);
				unlink (".annovar_date");
			}
		}
	} elsif ($method eq 'lwp') {
		my $http = LWP::UserAgent->new (timeout=>10);
		$http->env_proxy;
		my $response = $http->get("http://www.openbioinformatics.org/annovar/download/annovar_date");
		if ($response->is_success) {
			printerr "Done\n";
			$_ = $response->decoded_content;
			@webcontent = split (/\n/, $_);
		} else {
			printerr "Failed\n";
			return;
		}
	}
	
	$webdate = $webcontent[0];
	$webdate =~ s/[\r\n]+$//;
	$webdate1 = $webdate;
	$webdate1 =~ s/\-//g;			#remove the - sign in webdate
	if ($curdate < $webdate1) {
		printerr "----------------------------UPDATE AVAILABLE------------------------------\n";
		printerr "--------------------------------------------------------------------------\n";
		printerr "WARNING: A new version of ANNOVAR (dated $webdate) is available!\n";
		printerr "         Download from http://www.openbioinformatics.org/annovar/\n";
	
		if (@webcontent >= 2) {
			printerr "Changes made in the $webdate version:\n";
			for my $i (1 .. @webcontent-1) {
				if ($webcontent[$i] =~ m/^(\d{4})\-(\d{2})\-(\d{2})[\r\n]+$/) {
					$webdate = "$1-$2-$3";
					$webdate1 = "$1$2$3";
					if ($curdate >= $webdate1) {	#the current version is more recent than this date
						last;
					} else {
						printerr "Changes made in the $webdate version:\n";
					}
				} else {
					printerr "         * $webcontent[$i]";
				}
			}
		}
		printerr "--------------------------------------------------------------------------\n";
		printerr "--------------------------------------------------------------------------\n";
	}
}

=head1 SYNOPSIS

 annotate_variation.pl [arguments] <query-file|table-name> <database-location>

 Optional arguments:
        -h, --help                      print help message
        -m, --man                       print complete documentation
        -v, --verbose                   use verbose output
        
        Arguments to download databases or perform annotations
            --downdb			download UCSC Genome Browser annotation database
            --geneanno			annotate variants by functional consequences on genes
            --regionanno		annotate variants by targetting specific genomics regions
            --filter			filter variants based on a position list
            --webfrom <string>		specify the source of database (default usually works fine)
        
        Arguments to control input and output files
            --outfile <file>		output file prefix
            --zerostart			input query file uses half-open zero-start coordinate
            --dbtype <string>		database type
            --buildver <string>		genome build version (default: hg18 for human)
            --gff3dbfile <file>		specify the GFF3 DB file used in region-based annotation
            --genericdbfile <file>	specify the generic DB file used in filter-based annotation
            --vcfdbfile <file>		specify the DB file in VCF format in filter-based annotation
            --bedfile <file>		specify a BED file in region-based annotation
            --time			print out local time during program run
            --separate			separately print out all function of a variant (default: one line per variant)
            --colsWanted <string>	specify which columns to output in -regionanno by comma-delimited numbers
            --comment			print out comment line (those starting with #) in output files 
            --scorecolumn <int>		the column with scores in database file (for region-based annotation)
            --exonsort			sort the exon number in output line (for gene-based annotation)
            --transcript_function	use transcript name rather than gene name in gene-based annotation output
        
        Arguments to fine-tune the annotation procedure
            --batchsize <int>		batch size for processing variants per batch (default: 5m)
            --genomebinsize <int>	bin size to speed up search (default: 100k for -geneanno, 10k for -regionanno)
            --expandbin <int>		check nearby bin to find neighboring genes (default: 2m/genomebinsize)
            --neargene <int>		distance threshold to define upstream/downstream of a gene
            --score_threshold <float>	minimum score of DB regions to use in annotation
            --normscore_threshold <float> minimum normalized score of DB regions to use in annotation
            --rawscore			output includes the raw score (not normalized score) in UCSC Browser Track
            --minqueryfrac <float>	minimum percentage of query overlap to define match to DB (default: 0)
            --splicing_threshold <int>	distance between splicing variants and exon/intron boundary (default: 2)
            --maf_threshold <float>	filter 1000G variants with MAF above this threshold (default: 0)
            --sift_threshold <float>	SIFT threshold for deleterious prediction (default: 0.05)
            --precedence <string>	comma-delimited to specify precedence of variant function (default: exonic>intronic...)
       
       Arguments to control memory usage
            --memfree <int>		ensure minimum amount of free system memory (default: 100000, in the order of kb)
            --memtotal <int>		limit total amount of memory used by ANNOVAR (default: 0, unlimited, in the order of kb)
            --chromosome <string>	examine these specific chromosomes in database file
            

 Function: annotate a list of genetic variants against genome annotation 
 databases saved at local disk.
 
 Example: #download gene annotation database (for hg18 build) and save to humandb/ directory
 	  annotate_variation.pl -downdb gene humandb/
 	  annotate_variation.pl -buildver mm9 -downdb mce30way mousedb/
 	  annotate_variation.pl -downdb snp130 humandb/
 	
 	  #gene-based annotation of variants in the varlist file (by default --geneanno is ON)
 	  annotate_variation.pl ex1.human humandb/
 	  
 	  #region-based annotate variants
 	  annotate_variation.pl -regionanno -dbtype mce44way ex1.human humandb/
 	  annotate_variation.pl -regionanno -dbtype gff3 -gff3dbfile tfbs.gff3 ex1.human humandb/
 	  
 	  #filter rare or unreported variants (in 1000G/dbSNP) or predicted deleterious variants
 	  annotate_variation.pl -filter -dbtype 1000g_ceu -maf 0.01 ex1.human humandb/
 	  annotate_variation.pl -filter -dbtype snp130 ex1.human humandb/
 	  annotate_variation.pl -filter -dbtype avsift ex1.human humandb/
 
 Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $

=head1 OPTIONS

=over 8

=item B<--help>

print a brief usage message and detailed explanation of options.

=item B<--man>

print the complete manual of the program.

=item B<--verbose>

use verbose output.

=item B<--downdb>

download annotation databases from UCSC Genome Browser, Ensembl, 1000 Genomes 
Project or other resources. The annotation files in this database are required 
for the functional annotation of variants.

=item B<--geneanno>

perform gene-based annotation. For each variant, examine whether it hit exon, 
intron, intergenic region, or close to a transcript, or hit a non-coding RNA 
gene, or is located in a untranslated region. In addition, for an exonic variant, 
determine whether it causes splicing change, non-synonymous amino acid change, 
synonymous amino acid change or frameshift changes.

=item B<--regionanno>

perform region-based annotation. For each variant, examine whether it overlaps 
with a specific genomic region, such as the most conserved elements, the 
predicted transcription factor binding sites, the specific cytogeneic bands, the 
evolutionarily conserved RNA secondary structures and so on.

=item B<--filter>

perform filter-based annotation. For each variants, filter it against a 
variation database, such as the 1000 Genomes Project database and the dbSNP 
database, and identify a subset that have not been reported in these databases 
as novel variants.

=item B<--outfile>

specify the output file prefix. Several output files will be generated using 
this prefix and different suffixes. A directory name can also be specified as 
part of the argument, so that the output files can be written to a different 
directory than the current directory.

=item B<--zerostart>

utilize the half-open zero-start coordinate system that is used by many UCSC 
Genome Browser annotation tables. By default, the 1-based coordinate system will 
be used.

=item B<--dbtype>

specify the database type to be used in gene-based, region-based or filter-based 
annotations. For gene-based annotation, by default refGene annotations from the 
UCSC Genome Browser will be used for annotating variants. However, users can 
switch to utilize Ensembl annotations instead, or use the UCSC Gene annotations. 
In general, RefSeq gene annotations are more conservative, and UCSC Gene 
annotations are most liberal with many predicted genes and non-coding RNAs. For 
region-based annotations, users can select any UCSC annotation databases (by 
providing the database name), or alternatively select a Generic Feature Format 
version 3 (GFF3) formatted file for annotation (by providing 'gff3' as the --
dbtype and providing the --gff3dbfile argument). For filter-based annotations, 
users can select a dbSNP file, a 1000G file, a generic format file (with simple 
columns including chr, start, end, reference, observed, score), a VCF format 
(which is the current popular format for variants exchange), or a avsift format 
(which is identital to the generic format but is provided for convenience).

=item B<--buildver>

genome build version to use. By default, the hg18 build for human genome is 
used. The build version will be used by ANNOVAR to identify corresponding database files 
automatically, for example, when gene-based annotation is used for hg18 build, 
ANNOVAR will search for the hg18_refGene.txt file, but if the hg19 is used as --
buildver, ANNOVAR will examine hg19_refGene.txt instead.

=item B<--gff3dbfile>

specify the GFF3-formatted database file used in the region-based annotation.

=item B<--genericdbfile>

specify the generic format database file used in the filter-based annotation.

=item B<--vcfdbfile>

specify the database file in VCF format in the filter-based annotation. VCF has 
been a popular format for summarizing SNP and indel calls in a population of 
samples, and has been adopted by 1000 Genomes Project in their most recent data 
release.

=item B<--time>

print out the local time during execution of the program

=item B<--separate>

for gene-based annotation, separate the effects of each variant, so that each 
effect (intronic, exonic, splicing) is printed in one output line. By default, 
all effects are printed in the same line, in the comma-separated form of 
'UTR3,UTR5' or 'exonic,splicing'.

=item B<--colsWanted>

specify which columns are desired in the output for -regionanno. By default, 
ANNOVAR inteligently selects the columns based on the DB type. However, users 
can use a list of comma-delimited numbers, or use 'all', or use 'none', to 
request custom output columns.

=item B<--comment>

specify that the program should include comment lines in the output files. 
Comment lines are defined as any line starting with #. By default, these lines 
are not recognized as valid ANNOVAR input and are therefore written to the 
INVALID_INPUT file. This argument can be very useful to keep columns headers in 
the output file, if the input file use comment line to flag the column headers 
(usually the first line in the input file).

=item B<--scorecolumn>

specify the the column with desired output scores in UCSC database file (for 
region-based annotation). The default usually works okay.

=item B<--exonsort>

sort the exon number in output line in the exonic_variant_function file during 
gene-based annotation. If a mutation affects multiple transcripts, the ones with 
the smaller exon number will be printed before the transcript with larger exon 
number in the output.

=item B<--batchsize>

this argument specifies the batch size for processing variants by gene-based 
annotation. Normally 5 million variants (usually one human genome will have 
about 3-5 million variants depending on ethnicity) are annotated as a batch, to 
reduce the amounts of memory. The users can adjust the parameters: larger values 
make the program slightly faster, at the expense of slightly larger memory 
requirements. In a 64bit computer, the default settings usually take 1GB memory 
for gene-based annotation for human genome for a typical query file, but this 
depends on the complexity of the query (note that the query has a few required 
fields, but may have many optional fields and those fields need to be read and 
kept in memory).

=item B<--genomebinsize>

the bin size of genome to speed up search. By default 100kb is used for gene-
based annotation, so that variant annotation focused on specific bins only 
(based on the start-end site of a given variant), rather than searching the 
entire chromosomes for each variant. By default 10kb is used for region-based 
annotation. The filter-based annotations look for variants directly so no bin is 
used.

=item B<--expandbin>

expand bin to both sides to find neighboring genes/regions. For gene-based 
annotation, ANNOVAR tries to find nearby genes for any intergenic variant, with 
a maximum number of nearby bins to search. By default, ANNOVAR will 
automatically set this argument to search 2 megabases to the left and right of 
the variant in genome.

=item B<--neargene>

the distance threshold to define whether a variant is in the upstream or 
downstream region of a gene. By default 1 kilobase from the start or end site of 
a transcript is defined as upstream or downstream, respectively. This is useful, 
for example, when one wants to identify variants that are located in the 
promoter regions of genes across the genome.

=item B<--score_threshold>

the minimum score to consider when examining region-based annotations on UCSC 
Genome Browser tables. Some tables do not have such scores and this argument 
will not be effective.

=item B<--normscore_threshold>

the minimum normalized score to consider when examining region-based annotations 
on UCSC Genome Browser tables. The normalized score is calculated by UCSC, 
ranging from 0 to 1000, to make visualization easier. Some tables do not have 
such scores and this argument will not be effective.

=item B<--rawscore>

for region-based annotation, print out raw scores from UCSC Genome Browser 
tables, rather than normalized scores. By default, normalized scores are printed 
in the output files. Normalized scores are compiled by UCSC Genome Browser for 
each track, and they usually range from 0 to 1000, but there are some 
exceptions.

=item B<--minqueryfrac>

The minimum fraction of overlap between a query and a database record to decide 
on their match. By default, any overlap is regarded as a match, but this may not 
work best when query consist of large copy number variants.

=item B<--splicing_threshold>

distance between splicing variants and exon/intron boundary, to claim that a 
variant is a splicing variant. By default, 2bp is used. ANNOVAR is relatively 
more stringent than some other software to claim variant as regulating splicing. 
In addition, if a variant is an exonic variant, it will not be reported as 
splicing variant even if it is within 2bp to an exon/intron boundary.

=item B<--maf_threshold>

the minor allele frequency (MAF) threshold to be used in the filter-based 
annotation for the 1000 Genomes Project databases. By default, any variant 
annotated in the 1000G will be used in filtering.

=item B<--memfree>

the minimum amount of free system memory that ANNOVAR should ensure to have. By 
default, if ANNOVAR takes too much memory such that only 100Mb system memory is 
available, ANNOVAR will stop reading annotation database into memory, and will 
start annotation procedure, and then clear the memory, and then read the next 
block of annotation database again. This argument ensures that ANNOVAR will not 
attempt to use virtual memory in the system (which makes ANNOVAR extremely 
slow).

=item B<--memtotal>

the total amount of memory that ANNOVAR should use at most. By default, this 
value is zero, meaning that there is no limit on that. Decreasing this threshold 
reduce the memory requirement by ANNOVAR, but may increase the execution time.

=item B<--chromosome>

examine these specific chromosomes in database file. The argument takes comma-
delimited values, and the dash can be correctly recognized. For example, 5-10,X 
represent chromosome 5 through chromosome 10 plus chromosome X.

=back

=head1 DESCRIPTION

ANNOVAR is a software tool that can be used to functionally annotate a list of genetic variants, 
possibly generated from next-generation sequencing experiments. For example, 
given a whole-genome resequencing data set for a human with specific diseases, 
typically around 3 million SNPs and around half million insertions/deletions 
will be identified. Given this massive amounts of data (and candidate disease-
causing variants), it is necessary to have a fast algorithm that scans the data 
and identify a prioritized subset of variants that are most likely functional 
for follow-up Sanger sequencing studies and functional assays.

Currently, these various types of functional annotations produced by ANNOVAR can 
be (1) gene-based annotations (the default behavior), such as exonic variants, 
intronic variants, intergenic variants, downstream variants, UTR variants, 
splicing site variants, stc. For exonic variants, ANNOVAR will try to predict 
whether each of the variants is non-synonymous SNV, synonymous SNV, 
frameshifting change, nonframeshifting change. (2) region-based annotation, to 
identify whether a given variant overlaps with a specific type of genomic 
region, for example, predicted transcription factor binding site or predicted 
microRNAs.(3) filter-based annotation, to filter a list of variants so that only 
those not observed in variation databases (such as 1000 Genomes Project and 
dbSNP) are printed out.

Currently, I am expanding the functionality of ANNOVAR on (1) Fusion gene 
detection from large deletions, where a deletion joins the reading frame of two 
genes (same orientation of transcription) together to create a new gene. (2) 
Assignment of functional importance score to each observed mutation in the 
genome. This will be extremely important for the development of association 
tests for rare variants, and for prioritization of variants in downstream 
functional studies after a successful genome-wide association studies (GWAS).

=over 8

=item * B<variant file format>

A sample variant file contains one variant per line, with the fields being chr, 
start, end, reference allele, observed allele, other information. The other 
information can be anything (for example, it may contain sample identifiers for 
the corresponding variant.) An example is shown below:

	16      49303427        49303427        C       T       rs2066844       R702W (NOD2)
	16      49314041        49314041        G       C       rs2066845       G908R (NOD2)
	16      49321279        49321279        -       C       rs2066847       c.3016_3017insC (NOD2)
	16      49290897        49290897        C       T       rs9999999       intronic (NOD2)
	16      49288500        49288500        A       T       rs8888888       intergenic (NOD2)
	16      49288552        49288552        T       -       rs7777777       UTR5 (NOD2)
	18      56190256        56190256        C       T       rs2229616       V103I (MC4R)

=item * B<database file format: UCSC Genome Browser annotation database>

Most but not all of the gene annotation databases are directly downloaded from 
UCSC Genome Browser, so the file format is identical to what was used by the 
genome browser. The users can check Table Browser (for example, human hg18 table 
browser is at http://www.genome.ucsc.edu/cgi-bin/hgTables?org=Human&db=hg18) to 
see what fields are available in the annotation file. Note that even for the 
same species (such as humans), the file format might be different between 
different genome builds (such as between hg16, hg17 and hg18). ANNOVAR will try 
to be smart about guessing file format, based on the combination of the --
buildver argument and the number of columns in the input file. In general, the 
database file format should not be something that users need to worry about.

=item * B<database file format: GFF3 format for gene-based annotations)>

As of June 2010, ANNOVAR cannot perform gene-based annotations using GFF3 input 
files, and any annotations on GFF3 is region-based. However, this is expected to 
be changed in the future.

=item * B<database file format: GFF3 format for region-based annotations)>

Currently, region-based annotations can support the Generic Feature Format 
version 3 (GFF3) formatted files. The GFF3 has become the de facto golden 
standards for many model organism databases, such that many users may want to 
take a custom annotation database and run ANNOVAR on them, and it would be the 
most convenient if the custom file is made with GFF3 format. 

=item * B<database file format: generic format for filter-based annotations)>

The 'generic' format is designed for filter-based annotation that looks for 
exact variants. The format is almost identical to the ANNOVAR input format, with 
chr, start, end, reference allele, observed allele and scores (higher scores are 
regarded as better).

=item * B<database file format: VCF format for filter-based annotations)>

The 1000 Genomes Project now provide their variant annotations in VCF format, so 
I implemented the functionality to directly interrogate VCF files. A VCF file 
may contain summary information for variants (for example, this variant has MAF 
of 5% in this population), or it may contain the actual variant calls for each 
individual in a specific population. As of March 2010, the files from 1000G website 
only contains the first type of information (that is, alleles and their 
frequencies in population). For the purpose of simplicity, ANNOVAR only 
interrogates the first type of information.

=item * B<database file format: avsift for filter-based annotations)>

avsift refers to a file that ANNOVAR developers compiled for fast annotation of 
SIFT scores for non-synonymous variants in the human genome. It conforms to the 
generic format described above. However, users can directly specify '--dbtype 
avsift' in command line to perform avsift annotations, making it more convenient 
for users. Alternatively, users can use '--dbtype generic -genericdbfile 
hg18_avsift.txt' for the annotation, and the effects are usually the same.

=item * B<sequence file format>

ANNOVAR can directly examine FASTA-formatted sequence files. For mRNA sequences, 
the name of the sequences are the mRNA identifier. For genomic sequences, the 
name of the sequences in the files are usually chr1, chr2, chr3, etc, so that 
ANNOVAR knows which sequence corresponds to which chromosome. Unfortunately, 
UCSC uses things like chr6_random to annotate un-assembled sequences, as opposed 
to using the actual contig identifiers. This causes some issues (depending on 
how reads alignment algorithms works), but in general should not be something 
that user need to worry about. If the users absolutely care about the exact 
contigs rather than chr*_random, then they will need to re-align the short reads 
at chr*_random to a different FASTA file that contains the contigs, and then 
execute ANNOVAR on the newly identified variants.

=item * B<invalid input>

If the query file contains input lines with invalid format, ANNOVAR will skip 
such line and continue with the annotation on next lines. These invalid input 
lines will be written to a file with suffix invalid_input. Users should manually 
examine this file and identify sources of error.

=back

ANNOVAR is freely available to the academic community for non-commercial use. For 
questions or comments, please contact kai@openbioinformatics.org.

=cut