Mercurial > repos > rdaveau > gfap
comparison gfapts/inc/annovar/annotate_variation.pl @ 0:f753b30013e6 draft
Uploaded
author | rdaveau |
---|---|
date | Fri, 29 Jun 2012 10:20:55 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f753b30013e6 |
---|---|
1 #!/usr/bin/perl | |
2 use warnings; | |
3 use strict; | |
4 use Pod::Usage; | |
5 use Getopt::Long; | |
6 use File::Spec; | |
7 use Cwd; | |
8 | |
9 our $VERSION = '$Revision: 466 $'; | |
10 our $LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $'; | |
11 | |
12 our ($verbose, $help, $man); | |
13 our ($queryfile, $dbloc); | |
14 our ($outfile, $separate, $batchsize, $dbtype, $neargene, $genomebinsize, $geneanno, $regionanno, $filter, $downdb, $buildver, $score_threshold, $normscore_threshold, $minqueryfrac, $expandbin, $splicing_threshold, | |
15 $maf_threshold, $chromosome, $zerostart, $rawscore, $memfree, $memtotal, $sift_threshold, $gff3dbfile, $genericdbfile, $vcfdbfile, $time, $wget, $precedence, | |
16 $webfrom, $colsWanted, $comment, $scorecolumn, $transfun, $exonsort, $avcolumn, $bedfile); | |
17 our (%valichr, $dbtype1); | |
18 our (@precedence, @colsWanted, @avcolumn); | |
19 sub printerr; #declare a subroutine | |
20 | |
21 our %codon1 = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G"); | |
22 our %codon3 = (TTT=>"Phe", TTC=>"Phe", TCT=>"Ser", TCC=>"Ser", TAT=>"Tyr", TAC=>"Tyr", TGT=>"Cys", TGC=>"Cys", TTA=>"Leu", TCA=>"Ser", TAA=>"*", TGA=>"*", TTG=>"Leu", TCG=>"Ser", TAG=>"*", TGG=>"Trp", CTT=>"Leu", CTC=>"Leu", CCT=>"Pro", CCC=>"Pro", CAT=>"His", CAC=>"His", CGT=>"Arg", CGC=>"Arg", CTA=>"Leu", CTG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", ATT=>"Ile", ATC=>"Ile", ACT=>"Thr", ACC=>"Thr", AAT=>"Asn", AAC=>"Asn", AGT=>"Ser", AGC=>"Ser", ATA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", ATG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GTT=>"Val", GTC=>"Val", GCT=>"Ala", GCC=>"Ala", GAT=>"Asp", GAC=>"Asp", GGT=>"Gly", GGC=>"Gly", GTA=>"Val", GTG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly"); | |
23 our %codonfull = (TTT=>"Phenylalanine", TTC=>"Phenylalanine", TCT=>"Serine", TCC=>"Serine", TAT=>"Tyrosine", TAC=>"Tyrosine", TGT=>"Cysteine", TGC=>"Cysteine", TTA=>"Leucine", TCA=>"Serine", TAA=>"Stop", TGA=>"Stop", TTG=>"Leucine", TCG=>"Serine", TAG=>"Stop", TGG=>"Tryptophan", CTT=>"Leucine", CTC=>"Leucine", CCT=>"Proline", CCC=>"Proline", CAT=>"Histidine", CAC=>"Histidine", CGT=>"Arginine", CGC=>"Arginine", CTA=>"Leucine", CTG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", ATT=>"Isoleucine", ATC=>"Isoleucine", ACT=>"Threonine", ACC=>"Threonine", AAT=>"Asparagine", AAC=>"Asparagine", AGT=>"Serine", AGC=>"Serine", ATA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", ATG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GTT=>"Valine", GTC=>"Valine", GCT=>"Alanine", GCC=>"Alanine", GAT=>"Aspartic acid", GAC=>"Aspartic acid", GGT=>"Glycine", GGC=>"Glycine", GTA=>"Valine", GTG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine"); | |
24 our %codonr1 = (UUU=>"F", UUC=>"F", UCU=>"S", UCC=>"S", UAU=>"Y", UAC=>"Y", UGU=>"C", UGC=>"C", UUA=>"L", UCA=>"S", UAA=>"*", UGA=>"*", UUG=>"L", UCG=>"S", UAG=>"*", UGG=>"W", CUU=>"L", CUC=>"L", CCU=>"P", CCC=>"P", CAU=>"H", CAC=>"H", CGU=>"R", CGC=>"R", CUA=>"L", CUG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", AUU=>"I", AUC=>"I", ACU=>"T", ACC=>"T", AAU=>"N", AAC=>"N", AGU=>"S", AGC=>"S", AUA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", AUG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GUU=>"V", GUC=>"V", GCU=>"A", GCC=>"A", GAU=>"D", GAC=>"D", GGU=>"G", GGC=>"G", GUA=>"V", GUG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G"); | |
25 our %codonr3 = (UUU=>"Phe", UUC=>"Phe", UCU=>"Ser", UCC=>"Ser", UAU=>"Tyr", UAC=>"Tyr", UGU=>"Cys", UGC=>"Cys", UUA=>"Leu", UCA=>"Ser", UAA=>"*", UGA=>"*", UUG=>"Leu", UCG=>"Ser", UAG=>"*", UGG=>"Trp", CUU=>"Leu", CUC=>"Leu", CCU=>"Pro", CCC=>"Pro", CAU=>"His", CAC=>"His", CGU=>"Arg", CGC=>"Arg", CUA=>"Leu", CUG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", AUU=>"Ile", AUC=>"Ile", ACU=>"Thr", ACC=>"Thr", AAU=>"Asn", AAC=>"Asn", AGU=>"Ser", AGC=>"Ser", AUA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", AUG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GUU=>"Val", GUC=>"Val", GCU=>"Ala", GCC=>"Ala", GAU=>"Asp", GAC=>"Asp", GGU=>"Gly", GGC=>"Gly", GUA=>"Val", GUG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly"); | |
26 our %codonrfull = (UUU=>"Phenylalanine", UUC=>"Phenylalanine", UCU=>"Serine", UCC=>"Serine", UAU=>"Tyrosine", UAC=>"Tyrosine", UGU=>"Cysteine", UGC=>"Cysteine", UUA=>"Leucine", UCA=>"Serine", UAA=>"Stop", UGA=>"Stop", UUG=>"Leucine", UCG=>"Serine", UAG=>"Stop", UGG=>"Tryptophan", CUU=>"Leucine", CUC=>"Leucine", CCU=>"Proline", CCC=>"Proline", CAU=>"Histidine", CAC=>"Histidine", CGU=>"Arginine", CGC=>"Arginine", CUA=>"Leucine", CUG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", AUU=>"Isoleucine", AUC=>"Isoleucine", ACU=>"Threonine", ACC=>"Threonine", AAU=>"Asparagine", AAC=>"Asparagine", AGU=>"Serine", AGC=>"Serine", AUA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", AUG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GUU=>"Valine", GUC=>"Valine", GCU=>"Alanine", GCC=>"Alanine", GAU=>"Aspartic acid", GAC=>"Aspartic acid", GGU=>"Glycine", GGC=>"Glycine", GUA=>"Valine", GUG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine"); | |
27 our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-'); | |
28 | |
29 processArguments (); #process program arguments, set up default values, check for errors, check for existence of db files | |
30 if ($geneanno) { | |
31 annotateQueryByGene (); #generate gene-based annoations (classify variants into intergenic, introgenic, non-synonymous, synonymous, UTR, frameshift, etc) | |
32 } elsif ($regionanno) { | |
33 annotateQueryByRegion (); #generate region-based annotations (most conserved elements, transcription factor binding sites, etc) | |
34 } elsif ($filter) { | |
35 filterQuery (); #generate filter-based annotations (identify variants not reported in variation databases) | |
36 } elsif ($downdb) { | |
37 downloadDB (); #download annotation databases from Internet | |
38 } | |
39 | |
40 sub processArguments { | |
41 my @command_line = @ARGV; #command line argument | |
42 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'outfile=s'=>\$outfile, 'separate'=>\$separate, | |
43 'batchsize=s'=>\$batchsize, 'dbtype=s'=>\$dbtype, 'neargene=i'=>\$neargene, 'genomebinsize=s'=>\$genomebinsize, | |
44 'geneanno'=>\$geneanno, 'regionanno'=>\$regionanno, , 'filter'=>\$filter, 'downdb'=>\$downdb, 'buildver=s'=>\$buildver, 'score_threshold=f'=>\$score_threshold, | |
45 'normscore_threshold=i'=>\$normscore_threshold, 'minqueryfrac=f'=>\$minqueryfrac, 'expandbin=i'=>\$expandbin, 'splicing_threshold=i'=>\$splicing_threshold, | |
46 'maf_threshold=f'=>\$maf_threshold, 'chromosome=s'=>\$chromosome, 'zerostart'=>\$zerostart, 'rawscore'=>\$rawscore, 'memfree=i'=>\$memfree, | |
47 'memtotal=i'=>\$memtotal, 'sift_threshold=f'=>\$sift_threshold, 'gff3dbfile=s'=>\$gff3dbfile, 'genericdbfile=s'=>\$genericdbfile, 'vcfdbfile=s'=>\$vcfdbfile, | |
48 'time'=>\$time, 'wget!'=>\$wget, 'precedence=s'=>\$precedence, 'webfrom=s'=>\$webfrom, 'colsWanted=s'=>\$colsWanted, 'comment'=>\$comment, | |
49 'scorecolumn=i'=>\$scorecolumn, 'transcript_function'=>\$transfun, 'exonsort'=>\$exonsort, 'avcolumn=s'=>\$avcolumn, 'bedfile=s'=>\$bedfile) or pod2usage (); | |
50 | |
51 $help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT); | |
52 $man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT); | |
53 @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT); | |
54 @ARGV == 2 or pod2usage ("Syntax error"); | |
55 | |
56 ($queryfile, $dbloc) = @ARGV; | |
57 | |
58 $dbloc =~ s/[\\\/]$//; #delete the trailing / or \ sign as part of the directory name | |
59 if (defined $batchsize) { | |
60 $batchsize =~ s/k$/000/; | |
61 $batchsize =~ s/m$/000000/; | |
62 $batchsize =~ m/^\d+$/ or pod2usage ("Error: the --batchsize argument must be a positive integer (suffix of k or m is okay)"); | |
63 } else { | |
64 $batchsize = 5_000_000; | |
65 } | |
66 if (defined $genomebinsize) { | |
67 $genomebinsize =~ s/k$/000/; | |
68 $genomebinsize =~ s/m$/000000/; | |
69 $genomebinsize =~ m/^\d+$/ or pod2usage ("Error: the --genomebinsize argument must be a positive integer (suffix of k or m is okay)"); | |
70 $genomebinsize > 1000 or pod2suage ("Error: the --genomebinsize argument must be larger than 1000"); | |
71 } else { | |
72 if ($geneanno) { | |
73 $genomebinsize = 100_000; #gene usually span large genomic regions | |
74 } else { | |
75 $genomebinsize = 10_000; #MCE, TFBS, miRNA, etc are small genomic regions | |
76 } | |
77 } | |
78 | |
79 $verbose ||= 0; #when it is not specified, it is zero | |
80 $neargene ||= 1_000; #for upstream/downstream annotation of variants, specify the distance threshold between variants and genes | |
81 $expandbin ||= int(2_000_000/$genomebinsize); #for gene-based annotations, when intergenic variants are found, expand to specified number of nearby bins to find closest genes | |
82 $outfile ||= $queryfile; #specify the prefix of output file names | |
83 | |
84 #set up log file | |
85 if ($downdb) { | |
86 if (not -d $dbloc) { | |
87 mkdir ($dbloc) or die "Error: the directory $dbloc does not exist and cannot be created\n"; | |
88 } | |
89 my $errfile = File::Spec->catfile ($dbloc, "annovar_downdb.log"); | |
90 open (LOG, ">$errfile") or die "Error: cannot write LOG information to log file $errfile: $!\n"; | |
91 } else { | |
92 open (LOG, ">$outfile.log") or die "Error: cannot write LOG information to log file $outfile.log: $!\n"; | |
93 } | |
94 print LOG "ANNOVAR Version:\n\t", q/$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $/, "\n"; | |
95 print LOG "ANNOVAR Information:\n\tFor questions, comments, documentation, bug reports and program update, please visit http://www.openbioinformatics.org/annovar/\n"; | |
96 print LOG "ANNOVAR Command:\n\t$0 @command_line\n"; | |
97 print LOG "ANNOVAR Started:\n\t", scalar (localtime), "\n"; | |
98 | |
99 my $num = 0; | |
100 $geneanno and $num++; | |
101 $downdb and $num++; | |
102 $filter and $num++; | |
103 $regionanno and $num++; | |
104 $num <= 1 or pod2usage ("Error in argument: please specify only one of --geneanno, -regionanno, --downdb, --filter"); | |
105 if (not $num) { | |
106 $geneanno++; | |
107 printerr "NOTICE: The --geneanno operation is set to ON by default\n"; | |
108 } | |
109 | |
110 my %dbtype1 = ('gene'=>'refGene', 'refgene'=>'refGene', 'knowngene'=>'knownGene', 'ensgene'=>'ensGene', 'band'=>'cytoBand', 'cytoband'=>'cytoBand', 'tfbs'=>'tfbsConsSites', 'mirna'=>'wgRna', | |
111 'mirnatarget'=>'targetScanS', 'segdup'=>'genomicSuperDups', 'omimgene'=>'omimGene', 'gwascatalog'=>'gwasCatalog', | |
112 '1000g_ceu'=>'CEU.sites.2009_04', '1000g_yri'=>'YRI.sites.2009_04', '1000g_jptchb'=>'JPTCHB.sites.2009_04', | |
113 '1000g2010_ceu'=>'CEU.sites.2010_03', '1000g2010_yri'=>'YRI.sites.2010_03', '1000g2010_jptchb'=>'JPTCHB.sites.2010_03', | |
114 '1000g2010jul_ceu'=>'CEU.sites.2010_07', '1000g2010jul_yri'=>'YRI.sites.2010_07', '1000g2010jul_jptchb'=>'JPTCHB.sites.2010_07', | |
115 '1000g2010nov_all'=>'ALL.sites.2010_11', | |
116 ); | |
117 | |
118 if ($geneanno) { | |
119 $dbtype ||= 'refGene'; | |
120 $dbtype1 = $dbtype1{$dbtype} || $dbtype; | |
121 #$dbtype1 =~ m/^(refGene|knownGene|ensGene)$/ or pod2usage ("Error: the gene-based annotation procedure currently only support -dbtype of refGene, knownGene and ensGene"); #commented 2011feb18 | |
122 } elsif ($regionanno) { | |
123 defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --regionanno operation)"); | |
124 $dbtype1 = $dbtype1{$dbtype} || $dbtype; | |
125 if ($dbtype =~ m/^mce(\d+)way/) { #added 2010Feb16 | |
126 $dbtype1 = "phastConsElements$1way"; | |
127 } | |
128 if ($dbtype1 eq 'gff3') { | |
129 defined $gff3dbfile or pod2usage ("Error in argument: please specify --gff3dbfile for the --dbtype of 'gff3'"); | |
130 } | |
131 } elsif ($filter) { | |
132 defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --filter operation)"); | |
133 $dbtype =~ m/^avsift|generic|1000g_(ceu|yri|jptchb)|1000g2010_(ceu|yri|jptchb)|1000g20\d\d[a-z]{3}_[a-z]+|snp\d+|vcf|(ljb_\w+)$/ or pod2usage ("Error in argument: the specified --dbtype $dbtype is not valid for --filter operation (valid ones are '1000g_ceu', '1000g2010_yri', 'snp129', 'avsift', 'vcf', 'generic', etc)"); | |
134 $dbtype1 = $dbtype1{$dbtype} || $dbtype; | |
135 if ($dbtype1 eq 'generic') { | |
136 defined $genericdbfile or pod2usage ("Error in argument: please specify --genericdbfile for the --dbtype of 'generic'"); | |
137 } | |
138 if ($dbtype eq 'vcf') { | |
139 defined $vcfdbfile or pod2usage ("Error in argument: please specify --vcfdbfile for the --dbtype of 'vcf'"); | |
140 } | |
141 } elsif ($downdb) { | |
142 defined $dbtype and pod2usage ("Error in argument: please do not specify --dbtype for the --downdb operation"); | |
143 $dbtype1 = $dbtype1{$queryfile} || $queryfile; | |
144 } | |
145 | |
146 if (not $buildver) { | |
147 $buildver = 'hg18'; | |
148 printerr "NOTICE: The --buildver is set as 'hg18' by default\n"; | |
149 } | |
150 | |
151 if ($score_threshold) { | |
152 $score_threshold > 0 or pod2usage ("Error in argument: the --score_threshold must be a positive number (you specified $score_threshold)"); | |
153 $geneanno || $downdb and pod2usage ("Error in argument: the --score_threshold is not useful for --geneanno or --downdb operations"); | |
154 } | |
155 if ($normscore_threshold) { | |
156 $normscore_threshold <= 1000 or pod2usage ("Error in argument: the --normscore_threshold must be between 0 and 1000 (you specified $normscore_threshold)"); | |
157 $regionanno or pod2usage ("Error in argument: the --score_threshold is supported only for the --regionanno operation"); | |
158 } | |
159 | |
160 | |
161 if (defined $sift_threshold) { | |
162 $filter or pod2usage ("Error in argument: the --sift_threshold is supported only for the --filter operation"); | |
163 $dbtype1 eq 'avsift' or pod2usage ("Error in argument: the --sift_threshold argument can be used only if '--dbtype avsift' is used"); | |
164 $sift_threshold >= 0 and $sift_threshold <= 1 or pod2usage ("Error in argument: the --sift_threshold must be between 0 and 1 inclusive"); | |
165 } else { | |
166 $sift_threshold = 0.05; | |
167 } | |
168 | |
169 #operation-specific argument | |
170 if (defined $splicing_threshold) { | |
171 $geneanno or pod2usage ("Error in argument: the --splicing_threshold is supported only for the --geneanno operation"); | |
172 } else { | |
173 $splicing_threshold = 2; #for splicing annotation, specify the distance threshold between variants and exon/intron boundaries | |
174 } | |
175 if (defined $maf_threshold) { | |
176 $filter or pod2usage ("Error in argument: the --maf_threshold is supported only for the --filter operation"); | |
177 } else { | |
178 $maf_threshold = 0; #for filter-based annotations on 1000 Genomes Project data, specify the MAF threshold to be used in filtering | |
179 } | |
180 if (defined $minqueryfrac) { | |
181 $regionanno or pod2usage ("Error in argument: the --minqueryfrac is supported only for the --regionanno operation"); | |
182 } else { | |
183 $minqueryfrac = 0; #minimum query overlap to declare a "match" with database records | |
184 } | |
185 if (defined $gff3dbfile) { | |
186 $dbtype eq 'gff3' or pod2usage ("Error in argument: the --gff3dbfile argument can be used only if '--dbtype gff3' is used"); | |
187 $geneanno or $regionanno or pod2usage ("Error in argument: the --gff3dbfile argument is supported only for the --geneanno or --regionanno operation"); | |
188 } | |
189 if (defined $bedfile) { | |
190 $dbtype eq 'bed' or pod2usage ("Error in argument: the --bedfile argument can be used only if '--dbtype bed' is used"); | |
191 $regionanno or pod2usage ("Error in argument: the --bedfile argument is supported only for the --regionanno operation"); | |
192 } | |
193 if (defined $genericdbfile) { | |
194 $filter or pod2usage ("Error in argument: the --genericdbfile argument is supported only for the --filter operation"); | |
195 } | |
196 if (defined $wget) { | |
197 $downdb or pod2usage ("Error in argument: the --wget argument is supported only for the --downdb operation"); | |
198 } else { | |
199 $wget = 1; #by default, use wget for downloading files from Internet | |
200 } | |
201 if (defined $precedence) { | |
202 $geneanno or pod2usage ("Error in argument: the --precedence argument is supported only for the --geneanno operation"); | |
203 @precedence = split (/,/, $precedence); | |
204 @precedence >= 2 or pod2usage ("Error in argument: the --precedence argument should be comma delimited"); | |
205 for my $i (0 .. @precedence-1) { | |
206 $precedence[$i] =~ m/^(exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing|ncrna)$/ or pod2usage ("Error in argument: the --precedence argument contains invalid keywords (valid ones are exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing)"); | |
207 } | |
208 } | |
209 | |
210 if (defined $colsWanted) { | |
211 $regionanno or pod2usage ("Error in argument: the --colWanted argument is supported only for the --geneanno operation"); | |
212 if (lc $colsWanted eq 'all') { | |
213 @colsWanted = ('all'); | |
214 } elsif (lc $colsWanted eq 'none') { | |
215 @colsWanted = ('none'); | |
216 } else { | |
217 @colsWanted = split (/,/, $colsWanted); | |
218 for my $i (0 .. @colsWanted-1) { | |
219 $colsWanted[$i]=~m/^\d+$/ or pod2usage ("Error in argument: the --colsWanted argument ($colsWanted) must be a list of comma delimited numbers or be 'all' or be 'none'"); | |
220 } | |
221 } | |
222 } | |
223 | |
224 if (defined $scorecolumn) { | |
225 $regionanno or pod2usage ("Error in argument: the --scorecolumn argument is supported only for the --regionanno operation"); | |
226 } | |
227 | |
228 if ($exonsort) { | |
229 $geneanno or pod2usage ("Error in argument: the --exonsort argument is supported only for the --geneanno operation"); | |
230 } | |
231 | |
232 if (defined $avcolumn) { | |
233 $avcolumn =~ m/^\d+,\d+,\d+,\d+,\d+$/ or pod2usage ("Error in argument: the --avcolumn argument must be five integer numbers separated by comma"); | |
234 @avcolumn = split (/,/, $avcolumn); | |
235 @avcolumn = map {$_-1} @avcolumn; | |
236 } else { | |
237 @avcolumn = (0..4); #by default, the first five columns are the required AVINPUT information | |
238 } | |
239 | |
240 if (defined $webfrom) { | |
241 if ($webfrom ne 'ucsc' and $webfrom ne 'annovar') { | |
242 $webfrom =~ m#^(http://|ftp://)# or pod2usage ("Error: the --webfrom argument needs to be 'ucsc', 'annovar', or a URL"); | |
243 } | |
244 } | |
245 | |
246 $maf_threshold >= 0 and $maf_threshold <= 0.5 or pod2usage ("Error in argument: the --maf_threshold must be between 0 and 0.5 (you specified $maf_threshold)"); | |
247 $minqueryfrac >= 0 and $minqueryfrac <= 1 or pod2usage ("Error in argument: the --minqueryfrac must be between 0 and 1 (you specified $minqueryfrac)"); | |
248 $memfree and $memfree >= 100_000 || pod2usage ("Error in argument: the --memfree argument must be at least 100000 (in the order of kilobytes)"); | |
249 $memtotal and $memtotal >= 100_000 || pod2usage ("Error in argument: the --memtotal argument must be at least 100000 (in the order of kilobytes)"); | |
250 | |
251 if ($chromosome) { | |
252 my @chr = split (/,/, $chromosome); | |
253 for my $i (0 .. @chr-1) { | |
254 if ($chr[$i] =~ m/^(\d+)-(\d+)$/) { | |
255 for my $j ($1 .. $2) { | |
256 $valichr{$j}++; | |
257 } | |
258 } else { | |
259 $valichr{$chr[$i]}++; | |
260 } | |
261 } | |
262 printerr "NOTICE: These chromosomes in database will be examined: ", join (",", sort keys %valichr), "\n"; | |
263 } | |
264 } | |
265 | |
266 | |
267 sub annotateQueryByGene { | |
268 my ($queryfh); #query file handle | |
269 my ($totalquerycount, $totalinvalidcount, $batchcount) = qw/0 0 1/; | |
270 open ($queryfh, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n"; | |
271 | |
272 open (OUT, ">$outfile.variant_function") or die "Error: cannot write to output file $outfile.variant_function: $!\n"; | |
273 open (EXONIC, ">$outfile.exonic_variant_function") or die "Error: cannot write to output file $outfile.exonic_variant_function: $!\n"; | |
274 open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n"; | |
275 | |
276 my ($genedb, $geneidmap, $cdslen, $mrnalen) = readUCSCGeneAnnotation ($dbloc); | |
277 | |
278 $time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n"; | |
279 while (1) { | |
280 my ($linecount, $invalidcount) = newprocessNextQueryBatchByGene ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen); | |
281 $totalquerycount += $linecount; | |
282 $totalinvalidcount += $invalidcount; | |
283 $linecount == $batchsize or last; | |
284 $batchcount++; | |
285 printerr "NOTICE: Begin processing batch $batchcount (each batch contains $batchsize variants)\n"; | |
286 } | |
287 close (INVALID); | |
288 close (EXONIC); | |
289 close (OUT); | |
290 close ($queryfh); | |
291 $time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n"; | |
292 | |
293 $totalinvalidcount or unlink ("$outfile.invalid_input"); #delete the file as it is empty | |
294 printerr "NOTICE: Finished gene-based annotation on $totalquerycount genetic variants in $queryfile"; | |
295 $totalinvalidcount and printerr " (including $totalinvalidcount with invalid format written to $outfile.invalid_input)"; | |
296 printerr "\n"; | |
297 printerr "NOTICE: Output files were written to $outfile.variant_function, $outfile.exonic_variant_function\n"; | |
298 } | |
299 | |
300 sub newprocessNextQueryBatchByGene { | |
301 my ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen) = @_; | |
302 my (%refseqvar); | |
303 | |
304 my ($chr, $start, $end, $ref, $obs); | |
305 my ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2); | |
306 my ($invalid); | |
307 my ($linecount, $invalidcount) = qw/0 0/; | |
308 | |
309 for my $i (1 .. $batchsize) { #process up to batchsize variants | |
310 my $nextline = <$queryfh>; #read the next line in variant file | |
311 defined $nextline or last; | |
312 $nextline =~ s/[\r\n]+$//; | |
313 | |
314 if ($nextline =~ m/^#/ and $comment) { #comment line start with #, do not include this is $linecount | |
315 print OUT "#comment\t$nextline\n"; | |
316 next; | |
317 } | |
318 | |
319 $linecount++; #linecount does not include the comment line | |
320 $invalid = 0; | |
321 | |
322 my @nextline = split (/\s+/, $nextline); | |
323 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn]; | |
324 if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) { | |
325 $invalid++; | |
326 } else { | |
327 ($ref, $obs) = (uc $ref, uc $obs); | |
328 $zerostart and $start++; | |
329 $chr =~ s/^chr//; | |
330 if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) { | |
331 $invalid++; | |
332 } elsif ($ref eq '-' and $obs eq '-' #both are empty allele | |
333 or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code | |
334 or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code | |
335 or $start =~ m/[^\d]/ #start is not a number | |
336 or $end =~ m/[^\d]/ #end is not a number | |
337 or $start > $end #start is more than end | |
338 or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref | |
339 or $ref eq '-' and $start != $end #length mismatch for insertion | |
340 ) { | |
341 $invalid++; | |
342 } | |
343 } | |
344 | |
345 | |
346 | |
347 if ($invalid) { | |
348 print INVALID $nextline, "\n"; #invalid record found | |
349 $invalidcount++; | |
350 next; | |
351 } | |
352 | |
353 my (%intronic, %utr5, %utr3, %exonic, %upstream, %downstream, %ncrna, %intergenic, %splicing); | |
354 my $foundgenic; #variant found in genic region (between start and end position of a gene in genome) | |
355 my ($distl, $distr, $genel, $gener); #for intergenic variant, the distance and gene name to the left and right side of gene | |
356 my $bin1 = int ($start/$genomebinsize)-1; #start bin | |
357 $bin1 < 0 and $bin1=0; | |
358 my $bin2 = int ($end/$genomebinsize)+1; #end bin (usually same as start bin, unless the query is really big that spans multiple megabases) | |
359 | |
360 while (not exists $genedb->{$chr, $bin1} and $bin1 > int ($start/$genomebinsize)-$expandbin) { #examine at least 5 bins (by default 5Mb) to the left to make sure that a gene is found in the bin | |
361 $bin1 > 0 or last; | |
362 $bin1--; | |
363 } | |
364 | |
365 while (not exists $genedb->{$chr, $bin2} and $bin2 < int ($end/$genomebinsize)+$expandbin) { #examine at least 5 bins (by default 5Mb) to the right to make sure that a gene is found in the bin | |
366 $bin2++; | |
367 } | |
368 | |
369 my (%seen); | |
370 for my $nextbin ($bin1 .. $bin2) { | |
371 exists $genedb->{$chr, $nextbin} or next; #this genome bin has no annotated gene (a complete intergenic region) | |
372 for my $nextgene (@{$genedb->{$chr, $nextbin}}) { #when $genedb->{$chr, $nextbin} is undefined, this automatically create an array!!! | |
373 ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2) = @$nextgene; | |
374 defined $name2 or printerr "WARNING: name2 field is not provided for transcript $name (start=$txstart end=$txend)\n" and $name2=''; | |
375 $seen{$name, $txstart} and next; #name and txstart uniquely identify a transcript and chromosome position (sometimes same transcript may map to two nearby positions, such as nearby segmental duplications) | |
376 $seen{$name, $txstart}++; #a transcript may be in two adjacent bins, so if one is already scanned, there is no need to work on it again | |
377 | |
378 if ($transfun) { #variant_function output contains transcript name, rather than gene name | |
379 $name2 = $name; | |
380 } | |
381 | |
382 if (not $foundgenic) { #this variant has not hit a genic region yet | |
383 if ($start > $txend) { | |
384 defined $distl or $distl = $start-$txend and $genel=$name2; | |
385 $distl > $start-$txend and $distl = $start-$txend and $genel=$name2; #identify left closest gene | |
386 } | |
387 | |
388 if ($end < $txstart) { | |
389 defined $distr or $distr = $txstart-$end and $gener=$name2; | |
390 $distr > $txstart-$end and $distr = $txstart-$end and $gener=$name2; #identify right closest gene | |
391 } | |
392 } | |
393 | |
394 if ($end < $txstart) { | |
395 #query --- | |
396 #gene <-*----*-> | |
397 $foundgenic and last; #if found a genic annotation already, end the search of the bins | |
398 if ($end > $txstart - $neargene) { | |
399 if ($dbstrand eq '+') { | |
400 $upstream{$name2}++; | |
401 } else { | |
402 $downstream{$name2}++; | |
403 } | |
404 } else { | |
405 last; #if transcript is too far away from end, end the search of the bins | |
406 } | |
407 } elsif ($start > $txend) { | |
408 #query --- | |
409 #gene <-*----*-> | |
410 if (not $foundgenic and $start < $txend + $neargene) { | |
411 if ($dbstrand eq '+') { | |
412 $downstream{$name2}++; | |
413 } else { | |
414 $upstream{$name2}++; | |
415 } | |
416 } | |
417 } elsif ($cdsstart == $cdsend+1) { #non-coding RNA (could be microRNA, or could be due to lack of CDS annotation for mRNA such as NR_026730 or BC039000). Previously we already did cdsstart++ so here the cdsstart is more than cdsend | |
418 if ($start >= $txstart and $start <= $txend or $end >= $txstart and $end <= $txend or $start <= $txstart and $end >= $txend) { | |
419 $ncrna{$name2}++; | |
420 $foundgenic++; | |
421 } | |
422 } else { #query overlaps with coding region of gene | |
423 my ($lenintron) = (0); #cumulative intron length at a given exon | |
424 my ($rcdsstart, $rvarstart, $rvarend); #start of coding and variant in reference mRNA sequence | |
425 my @exonstart = @$exonstart; | |
426 my @exonend = @$exonend; | |
427 my $foundexonic; | |
428 if ($dbstrand eq '+') { #forward strand, search from left to right (first exon to last exon) | |
429 for my $k (0 .. @exonstart-1) { | |
430 $k and $lenintron += ($exonstart[$k]-$exonend[$k-1]-1); #calculate cumulative intron length | |
431 if ($cdsstart >= $exonstart[$k]) { #calculate CDS start accurately by considering intron length | |
432 $rcdsstart = $cdsstart-$txstart-$lenintron+1; | |
433 } | |
434 | |
435 #splicing calculation | |
436 if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) { | |
437 $splicing{$name2}++; #when query start site is close to exon start or exon end | |
438 } | |
439 if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) { | |
440 $splicing{$name2}++; #when query end site is close to exon start or exon end | |
441 } | |
442 if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) { | |
443 $splicing{$name2}++; #when query encompass the exon/intron boundary | |
444 } | |
445 | |
446 if ($start < $exonstart[$k]) { | |
447 if ($end >= $exonstart[$k]) { #exonic | |
448 $rvarstart = $exonstart[$k]-$txstart-$lenintron+1; | |
449 | |
450 for my $m ($k .. @exonstart-1) { | |
451 $m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1); | |
452 if ($end < $exonstart[$m]) { | |
453 #query -------- | |
454 #gene <--**---******---****----> | |
455 $rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1); | |
456 last; | |
457 } elsif ($end <= $exonend[$m]) { | |
458 #query ----------- | |
459 #gene <--**---******---****----> | |
460 $rvarend = $end-$txstart-$lenintron+1; | |
461 last; | |
462 } | |
463 } | |
464 if (not defined $rvarend) { | |
465 $rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion | |
466 } | |
467 | |
468 #here the trick begins to differentiate UTR versus coding exonic | |
469 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns | |
470 #query ---- | |
471 #gene <--*---*-> | |
472 $utr5{$name2}++; #positive strand for UTR5 | |
473 } elsif ($start > $cdsend) { | |
474 #query ---- | |
475 #gene <--*---*-> | |
476 $utr3{$name2}++; #positive strand for UTR3 | |
477 } else { | |
478 $exonic{$name2}++; | |
479 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline]; #refseq CDS start, refseq variant start | |
480 } | |
481 $foundgenic++; | |
482 last; | |
483 } elsif ($k and $start > $exonend[$k-1]) { #intronic | |
484 $intronic{$name2}++; | |
485 $foundgenic++; | |
486 last; | |
487 } | |
488 } elsif ($start <= $exonend[$k]) { #exonic | |
489 $rvarstart = $start-$txstart-$lenintron+1; | |
490 | |
491 for my $m ($k .. @exonstart-1) { | |
492 $m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1); | |
493 if ($end < $exonstart[$m]) { | |
494 #query ------ | |
495 #gene <--**---******---****----> | |
496 $rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1); | |
497 last; | |
498 } elsif ($end <= $exonend[$m]) { | |
499 #query ----------- | |
500 #gene <--**---******---****----> | |
501 $rvarend = $end-$txstart-$lenintron+1; | |
502 last; | |
503 } | |
504 } | |
505 if (not defined $rvarend) { | |
506 $rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion | |
507 } | |
508 | |
509 #here is the trick begins to differentiate UTR versus coding exonic | |
510 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns | |
511 #query ---- | |
512 #gene <--*---*-> | |
513 $utr5{$name2}++; #positive strand for UTR5 | |
514 } elsif ($start > $cdsend) { | |
515 #query ---- | |
516 #gene <--*---*-> | |
517 $utr3{$name2}++; #positive strand for UTR3 | |
518 } else { | |
519 $exonic{$name2}++; | |
520 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline]; #queryindex, refseq CDS start, refseq variant start | |
521 } | |
522 $foundgenic++; | |
523 last; | |
524 } | |
525 } | |
526 } elsif ($dbstrand eq '-') { #process negative strand (in the future, this should be fused to the paragraph above for positive strands; for now, I keep them separate for easier debugging) | |
527 for (my $k = @exonstart-1; $k>=0; $k--) { | |
528 $k < @exonstart-1 and $lenintron += ($exonstart[$k+1]-$exonend[$k]-1); | |
529 if ($cdsend <= $exonend[$k]) { #calculate CDS start accurately by considering intron length | |
530 $rcdsstart = $txend-$cdsend-$lenintron+1; | |
531 } | |
532 | |
533 #splicing calculation | |
534 if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) { | |
535 $splicing{$name2}++; | |
536 } | |
537 if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) { | |
538 $splicing{$name2}++; | |
539 } | |
540 if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) { | |
541 $splicing{$name2}++; | |
542 } | |
543 | |
544 if ($end > $exonend[$k]) { | |
545 if ($start <= $exonend[$k]) { | |
546 $rvarstart = $txend-$exonend[$k]-$lenintron+1; | |
547 | |
548 for (my $m = $k; $m >= 0; $m--) { | |
549 $m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1); | |
550 if ($start > $exonend[$m]) { | |
551 #query -------- | |
552 #gene <--**---******---****----> | |
553 #$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1); #commented out 2011feb18 | |
554 $rvarend = $txend-$exonstart[$m+1]+1-$lenintron + ($exonstart[$m+1]-$exonend[$m]-1); #fixed this 2011feb18 | |
555 last; #finsih the cycle!!!!!!!!!!!!!!!!!!! | |
556 } elsif ($start >= $exonstart[$m]) { #start within exons | |
557 #query ---- | |
558 #gene <--**---******---****----> | |
559 $rvarend = $txend-$start-$lenintron+1; | |
560 last; | |
561 } | |
562 } | |
563 if (not defined $rvarend) { #if rvarend is not found, then the whole tail of gene is covered | |
564 $rvarend = $txend-$txstart-$lenintron+1; | |
565 } | |
566 | |
567 #here is the trick begins to differentiate UTR versus coding exonic | |
568 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns | |
569 #query ---- | |
570 #gene <--*---*-> | |
571 $utr3{$name2}++; #negative strand for UTR5 | |
572 } elsif ($start > $cdsend) { | |
573 #query ---- | |
574 #gene <--*---*-> | |
575 $utr5{$name2}++; #negative strand for UTR3 | |
576 } else { | |
577 $exonic{$name2}++; | |
578 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline]; | |
579 } | |
580 $foundgenic++; | |
581 last; | |
582 } elsif ($k < @exonstart-1 and $end < $exonstart[$k+1]) { | |
583 $intronic{$name2}++; | |
584 $foundgenic++; | |
585 last; | |
586 } | |
587 } elsif ($end >= $exonstart[$k]) { | |
588 $rvarstart = $txend-$end-$lenintron+1; #all the rvarstart, rvarend are with respect to the cDNA sequence (so rvarstart corresponds to end of variants) | |
589 | |
590 for (my $m = $k; $m >= 0; $m--) { | |
591 $m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1); | |
592 if ($start > $exonend[$m]) { | |
593 #query ---- | |
594 #gene <--**---******---****----> | |
595 #$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1); #commented out 2011feb18 due to bug (10 42244567 42244600 CACCTTTGCTTGATATGATAATATAGTGCCAAGG - hetero) | |
596 $rvarend = $txend-$exonstart[$m+1]+1 - $lenintron + ($exonstart[$m+1]-$exonend[$m]-1); #fixed this 2011feb18 | |
597 last; #finish the circle of counting exons!!!!! | |
598 } elsif ($start >= $exonstart[$m]) { #the start is right located within exon | |
599 #query ------- | |
600 #gene <--**---******---****----> | |
601 $rvarend = $txend-$start-$lenintron+1; | |
602 last; #finish the cycle | |
603 } | |
604 } | |
605 if (not defined $rvarend) { #if rvarend is not found, then the whole tail of gene is covered | |
606 $rvarend = $txend-$txstart-$lenintron+1; | |
607 } | |
608 | |
609 #here the trick begins to differentiate UTR versus coding exonic | |
610 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns | |
611 #query ---- | |
612 #gene <--*---*-> | |
613 $utr3{$name2}++; #negative strand for UTR5 | |
614 } elsif ($start > $cdsend) { | |
615 #query ---- | |
616 #gene <--*---*-> | |
617 $utr5{$name2}++; #negative strand for UTR3 | |
618 } else { | |
619 $exonic{$name2}++; | |
620 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline]; | |
621 } | |
622 $foundgenic++; | |
623 last; | |
624 } | |
625 } | |
626 } | |
627 } | |
628 } | |
629 } | |
630 $foundgenic or $intergenic{$name2}++; | |
631 $i =~ m/000000$/ and printerr "NOTICE: Finished analyzing $i query variants\n"; | |
632 | |
633 | |
634 my (@txname, %genename); | |
635 | |
636 if ($separate) { #separately print out each effect on one line | |
637 if (%exonic or %splicing or %intronic or %utr5 or %utr3 or %ncrna or %upstream or %downstream) { | |
638 %exonic and print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n"; | |
639 %splicing and $end-$start+1<=$splicing_threshold and print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n"; | |
640 %intronic and print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n"; | |
641 %utr5 and print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n"; | |
642 %utr3 and print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n"; | |
643 %ncrna and print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n"; | |
644 %upstream and print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n"; | |
645 %downstream and print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n"; | |
646 } elsif (%intergenic) { | |
647 $genel ||= "NONE"; | |
648 $gener ||= "NONE"; | |
649 $distl ||= "NONE"; | |
650 $distr ||= "NONE"; | |
651 print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n"; | |
652 } else { | |
653 die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n"; | |
654 } | |
655 } else { | |
656 if (@precedence) { | |
657 my $foundmatch; | |
658 for my $i (0 .. @precedence-2) { | |
659 $precedence[$i] eq 'exonic' and %exonic and $foundmatch++; | |
660 $precedence[$i] eq 'splicing' and %splicing and $foundmatch++; | |
661 $precedence[$i] eq 'intronic' and %intronic and $foundmatch++; | |
662 $precedence[$i] eq 'utr5' and %utr5 and $foundmatch++; | |
663 $precedence[$i] eq 'utr3' and %utr3 and $foundmatch++; | |
664 $precedence[$i] eq 'ncrna' and %ncrna and $foundmatch++; | |
665 $precedence[$i] eq 'upstream' and %upstream and $foundmatch++; | |
666 $precedence[$i] eq 'downstream' and %downstream and $foundmatch++; | |
667 $precedence[$i] eq 'intergenic' and %intergenic and $foundmatch++; | |
668 if ($foundmatch) { | |
669 for my $j ($i+1 .. @precedence-1) { | |
670 $precedence[$j] eq 'exonic' and %exonic = (); | |
671 $precedence[$j] eq 'splicing' and %splicing = (); | |
672 $precedence[$j] eq 'intronic' and %intronic = (); | |
673 $precedence[$j] eq 'utr5' and %utr5 = (); | |
674 $precedence[$j] eq 'utr3' and %utr3 = (); | |
675 $precedence[$j] eq 'ncrna' and %ncrna = (); | |
676 $precedence[$j] eq 'upstream' and %upstream = (); | |
677 $precedence[$j] eq 'downstream' and %downstream = (); | |
678 $precedence[$j] eq 'intergenic' and %intergenic = (); | |
679 } | |
680 last; | |
681 } | |
682 } | |
683 } | |
684 | |
685 | |
686 if (%exonic) { | |
687 if (%splicing and $end-$start+1<=$splicing_threshold) { #a big deletion spanning splicing site is not really a "splicing" mutation | |
688 print OUT "exonic;splicing\t", join(",", sort keys %exonic), ";", join (",", sort keys %splicing), "\t", $nextline, "\n"; | |
689 } else { | |
690 print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n"; | |
691 } | |
692 } elsif (%splicing) { | |
693 print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n"; | |
694 } elsif (%ncrna) { | |
695 print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n"; | |
696 } elsif (%utr5 or %utr3) { | |
697 if (%utr5 and %utr3) { | |
698 print OUT "UTR5;UTR3\t", join(",", sort keys %utr5), ";", join(",", sort keys %utr3), "\t", $nextline, "\n"; #use ";" to separate UTR5 and UTR3 genes | |
699 } elsif (%utr5) { | |
700 print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n"; | |
701 } else { | |
702 print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n"; | |
703 } | |
704 } elsif (%intronic) { | |
705 print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n"; | |
706 } elsif (%upstream or %downstream) { | |
707 if (%upstream and %downstream) { | |
708 print OUT "upstream;downstream\t", join(",", sort keys %upstream), ";", join(",", sort keys %downstream), "\t", $nextline, "\n"; | |
709 } elsif (%upstream) { | |
710 print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n"; | |
711 } else { | |
712 print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n"; | |
713 } | |
714 } elsif (%intergenic) { | |
715 $genel ||= "NONE"; | |
716 $gener ||= "NONE"; | |
717 $distl ||= "NONE"; | |
718 $distr ||= "NONE"; | |
719 print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n"; | |
720 } else { | |
721 die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n"; | |
722 } | |
723 } | |
724 } | |
725 %refseqvar and annotateExonicVariants (\%refseqvar, $geneidmap, $cdslen, $mrnalen); | |
726 | |
727 return ($linecount, $invalidcount); | |
728 } | |
729 | |
730 sub annotateExonicVariants { | |
731 my ($refseqvar, $geneidmap, $cdslen, $mrnalen) = @_; | |
732 my $refseqhash; | |
733 my $function = {}; | |
734 my %varinfo; #variants information (same as input line) | |
735 | |
736 $refseqhash = readSeqFromFASTADB ($refseqvar); | |
737 | |
738 for my $seqid (keys %$refseqvar) { | |
739 for my $i (0 .. @{$refseqvar->{$seqid}}-1) { | |
740 my ($refcdsstart, $refvarstart, $refvarend, $refstrand, $index, $exonpos, $nextline) = @{$refseqvar->{$seqid}->[$i]}; | |
741 my ($wtnt3, $wtnt3_after, @wtnt3, $varnt3, $wtaa, $wtaa_after, $varaa, $varpos); #wtaa_after is the aa after the wtaa | |
742 my ($chr, $start, $end, $ref, $obs); | |
743 | |
744 my @nextline = split (/\s+/, $nextline); | |
745 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn]; | |
746 ($ref, $obs) = (uc $ref, uc $obs); | |
747 $zerostart and $start++; | |
748 $chr =~ s/^chr//; | |
749 | |
750 $varinfo{$index} = $nextline; | |
751 | |
752 if (not $refseqhash->{$seqid}) { #this refseq do not have FASTA sequence so cannot be interrogated | |
753 $function->{$index}{unknown} = "UNKNOWN"; | |
754 next; | |
755 } | |
756 | |
757 my $fs = (($refvarstart-$refcdsstart) % 3); | |
758 if ($refvarstart-$fs-1 > length($refseqhash->{$seqid})) { | |
759 printerr "WARNING: Potential database annotation error seqid=$seqid, refvarstart=$refvarstart, fs=$fs, seqlength=", length($refseqhash->{$seqid}), " refcdsstart=$refcdsstart, with inputline=$nextline\n"; | |
760 next; | |
761 } | |
762 | |
763 $wtnt3 = substr ($refseqhash->{$seqid}, $refvarstart-$fs-1, 3); | |
764 if (length ($refseqhash->{$seqid}) >= $refvarstart-$fs+3) { #going into UTR | |
765 $wtnt3_after = substr ($refseqhash->{$seqid}, $refvarstart-$fs+2, 3); | |
766 } else { | |
767 $wtnt3_after = ''; #last amino acid in the sequence without UTR (extremely rare situation) (example: 17 53588444 53588444 - T 414 hetero) | |
768 } | |
769 @wtnt3 = split (//, $wtnt3); | |
770 if (@wtnt3 != 3 and $refvarstart-$fs-1>=0) { #some times there are database annotation errors (example: chr17:3,141,674-3,141,683), so the last coding frame is not complete and as a result, the cDNA sequence is not complete | |
771 $function->{$index}{unknown} = "UNKNOWN"; | |
772 next; | |
773 } | |
774 | |
775 if ($refstrand eq '-') { #change the observed nucleotide to the reverse strand | |
776 $obs = revcom ($obs); | |
777 } | |
778 | |
779 if ($start == $end) { | |
780 if ($ref eq '-') { #insertion variant | |
781 #the insertion coordinate system in ANNOVAR always uses "position after the current site" | |
782 #in positive strand, this is okay | |
783 #in negative strand, the "after current site" becomes "before current site" during transcription | |
784 #therefore, appropriate handling is necessary to take this into account | |
785 #for example, for a trinucleotide GCC with frameshift of 1 and insertion of CCT | |
786 #in positive strand, it is G-CTT-CC | |
787 #but if the transcript is in negative strand, the genomic sequence should be GC-CCT-C, and transcript is G-AGG-GC | |
788 if ($refstrand eq '+') { | |
789 if ($fs == 1) { | |
790 $varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2]; | |
791 } elsif ($fs == 2) { | |
792 $varnt3 = $wtnt3[0] . $wtnt3[1] . $wtnt3[2] . $obs; | |
793 } else { | |
794 $varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2]; | |
795 } | |
796 } elsif ($refstrand eq '-') { | |
797 if ($fs == 1) { | |
798 $varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2]; | |
799 } elsif ($fs == 2) { | |
800 $varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2]; | |
801 } else { | |
802 $varnt3 = $obs . $wtnt3[0] . $wtnt3[1] . $wtnt3[2]; | |
803 } | |
804 } | |
805 ($wtaa, $wtaa_after, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($wtnt3_after), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1); | |
806 $wtaa_after and $wtaa_after eq '*' and $wtaa_after = 'X'; #wtaa_after could be undefined, if the current aa is the stop codon (X) (example: 17 53588444 53588444 - T) | |
807 | |
808 my $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarstart-$refcdsstart+2) . "ins$obs"; #cDNA level annotation | |
809 if (length ($obs) % 3 == 0) { | |
810 if ($wtaa eq '*') { #mutation on stop codon | |
811 if ($varaa =~ m/\*/) { | |
812 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before | |
813 $function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; #stop codon is stil present | |
814 } else { | |
815 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; #stop codon is lost | |
816 } | |
817 } else { | |
818 if ($varaa =~ m/\*/) { | |
819 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before | |
820 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,"; | |
821 } else { | |
822 $function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,"; | |
823 } | |
824 } | |
825 } else { | |
826 if ($wtaa eq '*') { #mutation on stop codon | |
827 if ($varaa =~ m/\*/) { #in reality, this cannot be differentiated from non-frameshift insertion, but we'll still call it frameshift | |
828 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before | |
829 $function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; | |
830 } else { | |
831 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; | |
832 } | |
833 } else { | |
834 if ($varaa =~ m/\*/) { | |
835 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before | |
836 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "_$wtaa_after" . ($varpos+1) . "delins$varaa,"; | |
837 } else { | |
838 $function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,"; | |
839 } | |
840 } | |
841 } | |
842 } elsif ($obs eq '-') { #single nucleotide deletion | |
843 my $deletent; | |
844 if ($fs == 1) { | |
845 $deletent = $wtnt3[1]; | |
846 $varnt3 = $wtnt3[0].$wtnt3[2].$wtnt3_after; | |
847 } elsif ($fs == 2) { | |
848 $deletent = $wtnt3[2]; | |
849 $varnt3 = $wtnt3[0].$wtnt3[1].$wtnt3_after; | |
850 } else { | |
851 $deletent = $wtnt3[0]; | |
852 $varnt3 = $wtnt3[1].$wtnt3[2].$wtnt3_after; | |
853 } | |
854 ($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1); | |
855 my $canno = "c." . ($refvarstart-$refcdsstart+1) . "del$deletent"; | |
856 if ($wtaa eq '*') { #mutation on stop codon | |
857 if ($varaa =~ m/\*/) { #stop codon is still stop codon | |
858 $function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "X,"; #changed fsdel to nfsdel on 2011feb19 | |
859 } else { #stop codon is lost | |
860 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "$varaa,"; | |
861 } | |
862 } else { | |
863 if ($varaa =~ m/\*/) { #new stop codon created | |
864 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "X,"; | |
865 } else { | |
866 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,"; | |
867 } | |
868 } | |
869 } elsif (length ($obs) > 1) { #block substitution (since start==end, this changed from 1nt to several nt) | |
870 if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) { | |
871 $function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,"; | |
872 } else { | |
873 $function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,"; | |
874 } | |
875 } else { #single nucleotide substitution variant | |
876 my $canno; | |
877 if ($fs == 1) { | |
878 $varnt3 = $wtnt3[0] . $obs . $wtnt3[2]; | |
879 $canno = "c.$wtnt3[1]" . ($refvarstart-$refcdsstart+1) . $obs; | |
880 } elsif ($fs == 2) { | |
881 $varnt3 = $wtnt3[0] . $wtnt3[1]. $obs; | |
882 $canno = "c.$wtnt3[2]" . ($refvarstart-$refcdsstart+1) . $obs; | |
883 } else { | |
884 $varnt3 = $obs . $wtnt3[1] . $wtnt3[2]; | |
885 $canno = "c.$wtnt3[0]" . ($refvarstart-$refcdsstart+1) . $obs; | |
886 } | |
887 ($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1); | |
888 | |
889 if ($wtaa eq $varaa) { | |
890 $wtaa eq '*' and ($wtaa, $varaa) = qw/X X/; #change * to X in the output | |
891 $function->{$index}{ssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,"; | |
892 } elsif ($varaa eq '*') { | |
893 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa${varpos}X,"; | |
894 } elsif ($wtaa eq '*') { | |
895 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos$varaa,"; | |
896 } else { | |
897 $function->{$index}{nssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,"; | |
898 } | |
899 } | |
900 } elsif ($obs eq '-') { #deletion variant involving several nucleotides | |
901 ($wtaa, $varpos) = (translateDNA ($wtnt3), int(($refvarstart-$refcdsstart)/3)+1); #wildtype amino acid, position of amino acid | |
902 my ($varposend, $canno); #the position of the last amino acid in the deletion | |
903 if ($refvarstart<=$refcdsstart) { #since the first amino acid is deleted, the whole gene is considered deleted | |
904 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:wholegene,"; #it is exonic variant, so the varend has to hit the first exon | |
905 } elsif ($refvarend >= $cdslen->{$seqid}+$refcdsstart) { #3' portion of the gene is deleted | |
906 $varposend = int ($cdslen->{$seqid}/3); #cdslen should be multiples of 3, but just in case of database mis-annotation | |
907 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($cdslen->{$seqid}+$refcdsstart-1) . "del"; | |
908 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,"; | |
909 } elsif (($refvarend-$refvarstart+1) % 3 == 0) { | |
910 $varposend = int (($refvarend-$refcdsstart)/3) + 1; | |
911 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del"; | |
912 $function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,"; | |
913 } else { | |
914 $varposend = int (($refvarend-$refcdsstart)/3) + 1; | |
915 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del"; | |
916 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,"; | |
917 } | |
918 } else { #block substitution event | |
919 if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) { | |
920 $function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,"; | |
921 } else { | |
922 $function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,"; | |
923 } | |
924 } | |
925 } | |
926 } | |
927 | |
928 for my $index (sort {$a<=>$b} keys %$function) { | |
929 if ($separate) { #print out each type of exonic mutations separately (one effect in one line), rather than printing out only the most important function | |
930 if ($function->{$index}{fsins}) { | |
931 print EXONIC "line$index\t", "frameshift insertion\t$function->{$index}{fsins}\t", $varinfo{$index}, "\n"; | |
932 } | |
933 if ($function->{$index}{fsdel}) { | |
934 print EXONIC "line$index\t", "frameshift deletion\t$function->{$index}{fsdel}\t", $varinfo{$index}, "\n"; | |
935 } | |
936 if ($function->{$index}{fssub}) { | |
937 print EXONIC "line$index\t", "frameshift substitution\t$function->{$index}{fssub}\t", $varinfo{$index}, "\n"; | |
938 } | |
939 if ($function->{$index}{stopgain}) { | |
940 print EXONIC "line$index\t", "stopgain SNV\t$function->{$index}{stopgain}\t", $varinfo{$index}, "\n"; | |
941 } | |
942 if ($function->{$index}{stoploss}) { | |
943 print EXONIC "line$index\t", "stoploss SNV\t$function->{$index}{stoploss}\t", $varinfo{$index}, "\n"; | |
944 } | |
945 if ($function->{$index}{nfsins}) { | |
946 print EXONIC "line$index\t", "nonframeshift insertion\t$function->{$index}{nfsins}\t", $varinfo{$index}, "\n"; | |
947 } | |
948 if ($function->{$index}{nfsdel}) { | |
949 print EXONIC "line$index\t", "nonframeshift deletion\t$function->{$index}{nfsdel}\t", $varinfo{$index}, "\n"; | |
950 } | |
951 if ($function->{$index}{nfssub}) { | |
952 print EXONIC "line$index\t", "nonframeshift substitution\t$function->{$index}{nfssub}\t", $varinfo{$index}, "\n"; | |
953 } | |
954 if ($function->{$index}{nssnv}) { | |
955 print EXONIC "line$index\t", "nonsynonymous SNV\t$function->{$index}{nssnv}\t", $varinfo{$index}, "\n"; | |
956 } | |
957 if ($function->{$index}{ssnv}) { | |
958 print EXONIC "line$index\t", "synonymous SNV\t$function->{$index}{ssnv}\t", $varinfo{$index}, "\n"; | |
959 } | |
960 if ($function->{$index}{unknown}) { | |
961 print EXONIC "line$index\t", "unknown\t$function->{$index}{unknown}\t", $varinfo{$index}, "\n"; | |
962 } | |
963 } else { #print out only the most important functional changes (for example, chr3:9931279-9931279 G->A can be both non-synonymous and synonymous mutations based on UCSC gene model) | |
964 print EXONIC "line$index\t"; | |
965 my $sortout; | |
966 if ($sortout = $function->{$index}{fsins}) { | |
967 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
968 print EXONIC "frameshift insertion\t$sortout\t"; | |
969 } elsif ($sortout = $function->{$index}{fsdel}) { | |
970 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
971 print EXONIC "frameshift deletion\t$sortout\t"; | |
972 } elsif ($sortout = $function->{$index}{fssub}) { | |
973 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
974 print EXONIC "frameshift substitution\t$sortout\t"; | |
975 } elsif ($sortout = $function->{$index}{stopgain}) { | |
976 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
977 print EXONIC "stopgain SNV\t$sortout\t"; | |
978 } elsif ($sortout = $function->{$index}{stoploss}) { | |
979 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
980 print EXONIC "stoploss SNV\t$sortout\t"; | |
981 } elsif ($sortout = $function->{$index}{nfsins}) { | |
982 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
983 print EXONIC "nonframeshift insertion\t$sortout\t"; | |
984 } elsif ($sortout = $function->{$index}{nfsdel}) { | |
985 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
986 print EXONIC "nonframeshift deletion\t$sortout\t"; | |
987 } elsif ($sortout = $function->{$index}{nfssub}) { | |
988 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
989 print EXONIC "nonframeshift substitution\t$sortout\t"; | |
990 } elsif ($sortout = $function->{$index}{nssnv}) { | |
991 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
992 print EXONIC "nonsynonymous SNV\t$sortout\t"; | |
993 } elsif ($sortout = $function->{$index}{ssnv}) { | |
994 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
995 print EXONIC "synonymous SNV\t$sortout\t"; | |
996 } elsif ($sortout = $function->{$index}{unknown}) { | |
997 $exonsort and $sortout = sortExonicAnnotation ($sortout); | |
998 print EXONIC "unknown\t$sortout\t"; | |
999 } | |
1000 print EXONIC $varinfo{$index}, "\n"; | |
1001 } | |
1002 } | |
1003 } | |
1004 | |
1005 sub sortExonicAnnotation { | |
1006 my ($anno) = @_; | |
1007 my @anno1 = split (/,/, $anno); | |
1008 my @anno2; | |
1009 for my $i (0 .. @anno1-1) { | |
1010 my @temp = split (/:/, $anno1[$i]); | |
1011 $temp[2] =~ s/^exon//; | |
1012 push @anno2, [$anno1[$i], @temp]; | |
1013 } | |
1014 @anno2 = sort {$a->[3] <=> $b->[3] or $a->[2] cmp $b->[2]} @anno2; #first sort by exon number, then by transcript name | |
1015 my @anno3 = map {$_->[0]} @anno2; | |
1016 return join (',', @anno3); | |
1017 } | |
1018 | |
1019 sub filterQuery { | |
1020 open (FIL, ">$outfile.${buildver}_${dbtype1}_filtered") or die "Error: cannot write to output file $outfile.${buildver}_${dbtype1}_filtered: $!\n"; | |
1021 open (DROPPED, ">$outfile.${buildver}_${dbtype1}_dropped") or die "Error: cannot write to output file $outfile.${buildver}_${dbtype1}_dropped: $!\n"; | |
1022 open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n"; | |
1023 | |
1024 printerr "NOTICE: Variants matching filtering criteria are written to $outfile.${buildver}_${dbtype1}_dropped, other variants are written to $outfile.${buildver}_${dbtype1}_filtered\n"; | |
1025 | |
1026 open (QUERY, $queryfile) or die "Error: cannot read from query file $queryfile: $!\n"; | |
1027 | |
1028 my (%variant, $filedone, $batchdone); | |
1029 my ($linecount, $batchlinecount, $invalid, $invalidcount) = (0, 0); | |
1030 my ($chr, $start, $end, $ref, $obs, $info); | |
1031 while (1) { | |
1032 $_ = <QUERY>; | |
1033 if (not defined $_) { | |
1034 $filedone++; | |
1035 } else { | |
1036 s/[\r\n]+$//; | |
1037 | |
1038 if (m/^#/ and $comment) { #comment line start with #, do not include this is $linecount | |
1039 print FIL "$_\n"; | |
1040 print DROPPED "#comment\t#comment\t$_\n"; | |
1041 next; | |
1042 } | |
1043 | |
1044 $linecount++; | |
1045 $batchlinecount++; | |
1046 if ($batchlinecount == $batchsize) { | |
1047 $batchdone++; | |
1048 } | |
1049 | |
1050 if ($memfree or $memtotal) { #if these arguments are specified | |
1051 if ($linecount =~ m/00000$/) { #about 40Mb memory per 10k lines for a typical input dataset | |
1052 my ($availmem, $allmem) = currentAvailMemory(); | |
1053 $verbose and printerr "NOTICE: Current available system memory is $availmem kb (this program uses $allmem bytes memory), after reading $linecount query\n"; | |
1054 if ($availmem and $availmem <= $memfree+50_000) { #some subsequent steps may take ~50Mb memory, so here we try to allocate some more memory | |
1055 $batchdone++; | |
1056 } | |
1057 if ($memtotal and $allmem >= $memtotal-50_000) { #when --memtotal is specified, ensure that program use less memory | |
1058 $batchdone++; | |
1059 } | |
1060 } | |
1061 } | |
1062 | |
1063 $invalid = 0; #reset invalid status | |
1064 | |
1065 my @nextline = split (/\s+/, $_); | |
1066 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn]; | |
1067 if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) { | |
1068 $invalid++; | |
1069 } else { | |
1070 ($ref, $obs) = (uc $ref, uc $obs); | |
1071 $zerostart and $start++; | |
1072 $chr =~ s/^chr//; | |
1073 if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) { | |
1074 $invalid++; | |
1075 } elsif ($ref eq '-' and $obs eq '-' #both are empty allele | |
1076 or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code | |
1077 or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code | |
1078 or $start =~ m/[^\d]/ #start is not a number | |
1079 or $end =~ m/[^\d]/ #end is not a number | |
1080 or $start > $end #start is more than end | |
1081 or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref | |
1082 or $ref eq '-' and $start != $end #length mismatch for insertion | |
1083 ) { | |
1084 $invalid++; | |
1085 } | |
1086 } | |
1087 | |
1088 if ($invalid) { | |
1089 print INVALID $_, "\n"; #invalid record found | |
1090 $invalidcount++; | |
1091 next; | |
1092 } | |
1093 | |
1094 if ($start == $end and $ref eq '-') { #insertion | |
1095 $obs = "0$obs"; | |
1096 } elsif ($obs eq '-') { #deletion | |
1097 $obs = $end-$start+1; | |
1098 } elsif ($end>$start or $start==$end and length($obs)>1) { #block substitution #fixed the bug here 2011feb19 | |
1099 $obs = ($end-$start+1) . $obs; | |
1100 } | |
1101 | |
1102 if (exists $variant{$chr, $start, $obs}) { | |
1103 $variant{$chr, $start, $obs} .= "\n$_"; | |
1104 } else { | |
1105 $variant{$chr, $start, $obs} = "$ref\n$_"; | |
1106 } | |
1107 } | |
1108 | |
1109 if ($filedone or $batchdone) { | |
1110 printerr "NOTICE: Processing next batch with ${\(scalar keys %variant)} unique variants in $batchlinecount input lines\n"; | |
1111 filterNextBatch (\%variant); | |
1112 %variant = (); | |
1113 $batchlinecount = 0; #reset the line count for this batch | |
1114 $batchdone = 0; | |
1115 } | |
1116 if ($filedone) { | |
1117 last; | |
1118 } | |
1119 } | |
1120 close (INVALID); close (DROPPED); close (FIL); | |
1121 if ($invalidcount) { | |
1122 printerr "NOTICE: Variants with invalid input format were written to $outfile.invalid_input\n"; | |
1123 } else { | |
1124 unlink ("$outfile.invalid_input"); | |
1125 } | |
1126 } | |
1127 | |
1128 sub filterNextBatch { | |
1129 my ($variant) = @_; | |
1130 my $dbfile; | |
1131 | |
1132 if ($dbtype1 eq 'generic') { | |
1133 $dbfile = File::Spec->catfile ($dbloc, $genericdbfile); | |
1134 } elsif ($dbtype1 eq 'vcf') { | |
1135 $dbfile = File::Spec->catfile ($dbloc, $vcfdbfile); | |
1136 } else { | |
1137 $dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt"); | |
1138 } | |
1139 | |
1140 open (DB, $dbfile) or die "Error: cannot read from input database file $dbfile: $!\n"; | |
1141 printerr "NOTICE: Scanning filter database $dbfile..."; | |
1142 | |
1143 my (@record, $chr, $start, $end, $ref, $obs, $score, $qual, $fil, $info); | |
1144 my ($rsid, $strand, $ucscallele, $twoallele, $class, $af, $attribute); | |
1145 my $count_invalid_dbline; | |
1146 while (<DB>) { | |
1147 my (@obs2, @score2); #for 1000G2010 data set in VCF format, some tri-allelic SNPs are present; in the future, some quad-allelic SNPs may be also present in VCF files | |
1148 s/[\r\n]+$//; | |
1149 m/\S/ or next; #skip empty lines in the database file (sometimes this occurs) | |
1150 m/^#/ and next; #skip the comment line | |
1151 if ($dbtype eq 'avsift') { | |
1152 @record = split (/\t/, $_); | |
1153 @record == 8 or die "Error: invalid record found in DB file $dbfile (8 tab-delimited fields expected): <$_>\n"; | |
1154 ($chr, $start, $end, $ref, $obs, $score) = @record; | |
1155 if ($chromosome) { | |
1156 $valichr{$chr} or next; | |
1157 } | |
1158 if ($score < $sift_threshold) { #this is a deleterious mutation, skip it (equal sign should not be used, otherwise the score=0 will be skipped) | |
1159 next; | |
1160 } | |
1161 } elsif ($dbtype =~ m/^ljb_/) { | |
1162 @record = split (/\t/, $_); | |
1163 @record >= 5 or die "Error: invalid record found in DB file $dbfile (at least 5 tab-delimited fields expected): <$_>\n"; | |
1164 ($chr, $start, $end, $ref, $obs, $score) = @record; | |
1165 if ($chromosome) { | |
1166 $valichr{$chr} or next; | |
1167 } | |
1168 if (defined $score and defined $score_threshold and $score < $score_threshold) { | |
1169 next; | |
1170 } | |
1171 } elsif ($dbtype =~ m/^snp\d+/) { | |
1172 @record = split (/\t/, $_, -1); #-1 is required before some dbSNP records have many empty tab fields in the end | |
1173 @record == 18 or @record == 26 or die "Error: invalid record found in dbSNP database file $dbfile (18 or 26 fields expected but found ${\(scalar @record)}): <$_>\n" . join("\n",@record); | |
1174 $record[1] =~ s/^chr// or die "Error: invalid record found in DB file (2nd field should start with 'chr'): <$_>\n"; | |
1175 ($chr, $start, $end, $rsid, $strand, $ucscallele, $twoallele, $class) = @record[1,2,3,4,6,8,9,11]; | |
1176 $start++; #UCSC use zero-start system | |
1177 if ($chromosome) { | |
1178 $valichr{$chr} or next; | |
1179 } | |
1180 unless ($class eq 'single' or $class eq 'deletion' or $class eq 'in-del' or $class eq 'insertion') { #enum('unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion') | |
1181 next; | |
1182 } | |
1183 | |
1184 my @allele = split (/\//, $twoallele); | |
1185 | |
1186 #before Jan 2011, only di-allelic SNPs are handled in ANNOVAR | |
1187 #@allele == 2 or next; #many entries have no allele information (for example, rs71010435) | |
1188 #in Jan 2011 version, I decided to handle tri-allelic and quad-allelic SNP as well | |
1189 | |
1190 @allele >= 2 or next; #Jan 2011 modification | |
1191 if ($strand eq '-') { #handle reverse strand annotation (the vast majority of records in dbSNP should be already in + strand) | |
1192 for my $i (0 .. @allele-1) { | |
1193 $allele[$i] = revcom ($allele[$i]); | |
1194 } | |
1195 #$ucscallele = revcom ($ucscallele); #added Jan 24, 2011 (per Kevin Ha) removed Feb 10, 2011 (per Eric Stawiski) | |
1196 #note that some SNPs (e.g., rs28434453) may have multiple location in diferent chromosome or strand; I may want to handle this by a special flag in the future | |
1197 #585 chr1 13301 13302 rs28434453 0 - C C C/T genomic single etc... | |
1198 #1367 chr15 102517867 102517868 rs28434453 0 + G G C/T genomic single etc... | |
1199 } | |
1200 | |
1201 #in-del is usually annotated below, so they require special treatment | |
1202 #587 chr1 384538 384539 rs3971283 0 + T T -/ATT genomic in-del unknown 0 0 unknown exact 3 | |
1203 if ($class eq 'in-del') { #indel are usually annotated as -/xxx, where xxx is the alternative allele | |
1204 $obs = length ($ucscallele) . $allele[1]; #prefix a number before the alleles, indicating block substitution | |
1205 defined $allele[1] or die "no allele 1 <$_>"; | |
1206 } elsif ($class eq 'insertion') { | |
1207 $start--; | |
1208 $obs = "0$allele[1]"; | |
1209 } elsif ($class eq 'deletion') { | |
1210 $obs = length ($ucscallele); | |
1211 } else { | |
1212 for my $i (0 .. @allele-1) { | |
1213 if ($ucscallele eq $allele[$i]) { | |
1214 @obs2 = @allele; | |
1215 splice (@obs2, $i, 1); | |
1216 for my $j (0 .. @obs2-1) { | |
1217 push @score2, $rsid; | |
1218 } | |
1219 } | |
1220 } | |
1221 if (@obs2) { | |
1222 $obs = shift @obs2; | |
1223 $score = shift @score2; | |
1224 } else { | |
1225 $verbose and printerr ("Database error: wildtype base $ucscallele is not part of the allele description in <$_>\n"); | |
1226 next; | |
1227 } | |
1228 } | |
1229 $score = $rsid; | |
1230 } elsif ($dbtype =~ m/^1000g_(\w+)/ or $dbtype =~ m/^1000g2010_(\w+)/ or $dbtype =~ m/^1000g2010\w\w\w_(\w+)/) { #dbtype1 should NOT be used here | |
1231 @record = split (/\t/, $_); | |
1232 @record == 5 or @record == 6 or die "Error: invalid record found in 1000G database file $dbfile (5 or 6 fields expected): <$_>\n"; | |
1233 ($chr, $start, $ref, $obs, $af) = @record; #there is no "END" in 1000G input file | |
1234 if ($chromosome) { | |
1235 $valichr{$chr} or next; | |
1236 } | |
1237 if ($maf_threshold) { | |
1238 if ($af > 0.5) { #the frequency is the non-reference allele frequency, which could exceed 0.5 | |
1239 1-$af >= $maf_threshold or next; | |
1240 } else { | |
1241 $af >= $maf_threshold or next; | |
1242 } | |
1243 } | |
1244 $score = $af; | |
1245 } elsif ($dbtype eq 'generic') { | |
1246 ($chr, $start, $end, $ref, $obs, $score) = split (/\t/, uc $_); #make sure to use upper case, as query is always in upper case | |
1247 defined $obs or die "Error: the generic database file must contains at least five tab-delimited fields per line (but observed line: $_)\n"; | |
1248 defined $score or $score = "NA"; | |
1249 if ($chromosome) { | |
1250 $valichr{$chr} or next; | |
1251 } | |
1252 defined $obs or die "Error: invalid record found in DB file $dbfile (at least 5 fields expected for 'generic' dbtype): <$_>\n"; | |
1253 if ($start == $end and $ref eq '-') { #insertion | |
1254 $obs = "0$obs"; | |
1255 } | |
1256 if ($obs eq '-') { #deletion | |
1257 $obs = $end-$start+1; | |
1258 } elsif ($start != $end) { #block substitution | |
1259 $obs = ($end-$start+1) . $obs; | |
1260 } | |
1261 if (defined $score and defined $score_threshold and $score < $score_threshold) { | |
1262 next; | |
1263 } | |
1264 } elsif ($dbtype eq 'vcf') { #vcf file is adopted by 1000 Genomes Project; it can describe both SNPs and indels, and it may contain both summary level statistics and individual level genotype calls | |
1265 ($chr, $start, $rsid, $ref, $obs, $qual, $fil, $info) = split (/\t/, $_); | |
1266 if ($chromosome) { | |
1267 $valichr{$chr} or next; | |
1268 } | |
1269 | |
1270 my ($ac, $an); | |
1271 | |
1272 if ($info =~ m/AF=([^;]+)/) { | |
1273 $score = $1; | |
1274 if ($obs =~ m/(\w),(\w)/) { #1000G November; this format is not really valid because it does not handle tri-allelic SNP | |
1275 ($obs, @obs2) = ($1, $2); | |
1276 @score2 = ($score); | |
1277 } | |
1278 } elsif ($info =~ m/AC=(\S+?);AN=(\d+)/) { | |
1279 my ($alleles, $count) = ($1, $2); | |
1280 if ($alleles =~ m/^(\d+),(.+)/) { | |
1281 $score = sprintf ("%.3f", $1/$count); | |
1282 @score2 = split (/,/, $2); | |
1283 @score2 = map {sprintf("%.3f", $_/$count)} @score2; | |
1284 ($obs, @obs2) = split (/,/, $obs); #the obs is composed of two alleles | |
1285 } else { | |
1286 $af = sprintf ("%.3f", $alleles/$count); | |
1287 $score = $af; | |
1288 #this is an invalid record in 1000GJuly: 1 2266231 rs11589451 C T,A . PASS AA=c;AC=20;AN=120;DP=237 | |
1289 if ($obs =~ m/(\w),/) { | |
1290 $count_invalid_dbline++; | |
1291 $verbose and printerr "WARNING: Invalid input line found in $dbfile (more than one alleles are observed, but only one is annotated with allelic counts): <$_>\n"; | |
1292 next; | |
1293 } | |
1294 } | |
1295 } else { | |
1296 printerr "WARNING: the VCF file does not contain allele frequency information. ANNOVAR cannot process this file\n"; | |
1297 exit; | |
1298 } | |
1299 | |
1300 if (length ($ref) == 1 and length ($obs) == 1) {#single base substitution | |
1301 1; #the obs and obs2 is already handled | |
1302 } elsif ($obs =~ m/^\-((\w)(\w*))$/) { #deletion (1000G March) | |
1303 $2 eq $ref or $ref eq 'N' or die "Error: mismatch of deleted allele and reference allele: <$_>\n"; | |
1304 $obs = length ($1); | |
1305 } elsif ($obs =~ m/^\+(\w+)$/) { #insertion (1000G March) | |
1306 $obs = "0$1"; | |
1307 } elsif ($ref =~ m/^[ACGTN]+$/ and $obs =~ m/^[ACGTN]+$/) { | |
1308 if (length ($obs) == 1) { #deletion (1000G July) | |
1309 substr ($ref, 0, 1) eq $obs or die "Error: mismatch of deleted allele and reference allele: ref=$ref obs=$obs in <$_>\n"; | |
1310 $start++; | |
1311 $obs = length ($ref)-1; | |
1312 } elsif (length ($ref) == 1) { #duplication (1000G July) | |
1313 substr ($obs, 0, 1) eq $ref or die "Error: mismatch of duplicated allele and reference allele: ref=$ref obs=$obs in <$_>\n"; | |
1314 $start++; | |
1315 $obs = "0" . substr ($obs, 1); | |
1316 } | |
1317 } else { | |
1318 die "Error: invalid record found in VCF file: ref=$ref obs=$obs <$_>\n"; | |
1319 } | |
1320 } else { | |
1321 die "invalid dbtype: $dbtype\n"; | |
1322 } | |
1323 | |
1324 if ($variant->{$chr, $start, $obs}) { | |
1325 my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs}); #most likely, only one piece of information | |
1326 for my $i (0 .. @info-1) { | |
1327 print DROPPED join ("\t", $dbtype, $score), "\t", $info[$i], "\n"; | |
1328 } | |
1329 delete $variant->{$chr, $start, $obs}; | |
1330 } | |
1331 if (@obs2) { | |
1332 for my $j (0 .. @obs2-1) { | |
1333 if ($variant->{$chr, $start, $obs2[$j]}) { | |
1334 my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs2[$j]}); #most likely, only one piece of information | |
1335 for my $i (0 .. @info-1) { | |
1336 print DROPPED join ("\t", $dbtype, $score2[$j]), "\t", $info[$i], "\n"; | |
1337 } | |
1338 delete $variant->{$chr, $start, $obs2[$j]}; | |
1339 } | |
1340 } | |
1341 } | |
1342 } | |
1343 for my $key (keys %$variant) { | |
1344 my ($chr, $start, $obs) = split ($;, $key); #hash key separator | |
1345 my ($ref, @info) = split (/\n/, $variant->{$key}); | |
1346 my $len; | |
1347 if ($obs =~ m/^(\d+)(.*)/) { | |
1348 ($len, $obs) = ($1, $2); | |
1349 $obs ||= '-'; #deletion | |
1350 if ($len) { | |
1351 $end = $start+$len-1; | |
1352 } else { | |
1353 $end = $start; | |
1354 } | |
1355 } else { | |
1356 $end = $start; | |
1357 } | |
1358 for my $i (0 .. @info-1) { | |
1359 print FIL $info[$i], "\n"; | |
1360 } | |
1361 } | |
1362 printerr "Done\n"; | |
1363 $count_invalid_dbline and printerr "WARNING: $count_invalid_dbline lines in dbfile $dbfile were ignored due to invalid formats\n"; | |
1364 } | |
1365 | |
1366 sub annotateQueryByRegion { | |
1367 open (QUERY, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n"; | |
1368 open (OUT, ">$outfile.${buildver}_$dbtype1") or die "Error: cannot write to output file $outfile.${buildver}_$dbtype1: $!\n"; | |
1369 open (INVALID, ">$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n"; | |
1370 | |
1371 my ($regiondb, $parent) = ({}, {}); | |
1372 | |
1373 if ($dbtype eq 'gff3') { | |
1374 ($regiondb, $parent) = readGFF3RegionAnnotation (); | |
1375 } elsif ($dbtype eq 'bed') { | |
1376 ($regiondb) = readBedRegionAnnotation (); | |
1377 } else { | |
1378 ($regiondb) = readUCSCRegionAnnotation (); | |
1379 } | |
1380 | |
1381 my ($chr, $start, $end, $ref, $obs); | |
1382 my ($invalid); | |
1383 my ($linecount, $invalidcount) = qw/0 0/; | |
1384 | |
1385 $time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n"; | |
1386 while (<QUERY>) { | |
1387 s/[\r\n]+$//; | |
1388 | |
1389 if (m/^#/ and $comment) { #comment line start with #, do not include this is $linecount | |
1390 print OUT "#comment\t#comment\t$_\n"; | |
1391 next; | |
1392 } | |
1393 | |
1394 $linecount++; | |
1395 | |
1396 $invalid = 0; #reset invalid status | |
1397 | |
1398 my @nextline = split (/\s+/, $_); | |
1399 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn]; | |
1400 if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) { | |
1401 $invalid++; | |
1402 } else { | |
1403 ($ref, $obs) = (uc $ref, uc $obs); | |
1404 $zerostart and $start++; | |
1405 $chr =~ s/^chr//; | |
1406 if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) { | |
1407 $invalid++; | |
1408 } elsif ($ref eq '-' and $obs eq '-' #both are empty allele | |
1409 or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code | |
1410 or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code | |
1411 or $start =~ m/[^\d]/ #start is not a number | |
1412 or $end =~ m/[^\d]/ #end is not a number | |
1413 or $start > $end #start is more than end | |
1414 or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref | |
1415 or $ref eq '-' and $start != $end #length mismatch for insertion | |
1416 ) { | |
1417 $invalid++; | |
1418 } | |
1419 } | |
1420 | |
1421 | |
1422 if ($invalid) { | |
1423 print INVALID $_, "\n"; #invalid record found | |
1424 $invalidcount++; | |
1425 next; | |
1426 } | |
1427 | |
1428 my $bin1 = int ($start/$genomebinsize); #start bin | |
1429 my $bin2 = int ($end/$genomebinsize); #end bin (usually same as start bin, unless the query is really big that spans multiple megabases) | |
1430 my ($foundhit, $score, $name); | |
1431 for my $bin ($bin1 .. $bin2) { | |
1432 for my $nextgene (@{$regiondb->{$chr, $bin}}) { | |
1433 my ($txstart, $txend, $txscore, $txname) = @$nextgene; | |
1434 | |
1435 if ($end < $txstart) { | |
1436 #db: <-------------------------> | |
1437 #query: <---> | |
1438 last; #if genomic region is too far away from end, end the search of the bins | |
1439 } elsif ($end <= $txend) { #query contained completely within db region | |
1440 if ($start >= $txstart) { | |
1441 #db: <--------------------------> | |
1442 #query: <------------------> | |
1443 } else { #query overlap but upstream of db region | |
1444 #db: <-------------------------> | |
1445 #query: <----------------------> | |
1446 if ($minqueryfrac) { | |
1447 if (($end-$txstart+1)/($end-$start+1) < $minqueryfrac) { | |
1448 next; | |
1449 } | |
1450 } | |
1451 } | |
1452 $foundhit++; | |
1453 $score ||= $txscore; $name ||= $txname; | |
1454 if ($score < $txscore) { | |
1455 $score = $txscore; | |
1456 $name=$txname; | |
1457 } | |
1458 if ($score == $txscore and defined $name and $name ne $txname) { | |
1459 $name .= ",$txname"; | |
1460 } | |
1461 if ($dbtype1 eq 'cytoBand') { #a new chromosome band is encountered | |
1462 $name ne $txname and $name .= ",$txname"; | |
1463 } | |
1464 } elsif ($start <= $txend) { | |
1465 if ($start >= $txstart) { #query overlap but downstream of db region | |
1466 #db: <------------------------> | |
1467 #query: <-----------------------> | |
1468 if ($minqueryfrac) { | |
1469 if (($txend-$start+1)/($end-$start+1) < $minqueryfrac) { | |
1470 next; | |
1471 } | |
1472 } | |
1473 } else { | |
1474 #db region completely contained within query | |
1475 #db: <-------------------------> | |
1476 #query: <------------------------------> | |
1477 if ($minqueryfrac) { | |
1478 if (($txend-$txstart+1)/($end-$start+1) < $minqueryfrac) { | |
1479 next; | |
1480 } | |
1481 } | |
1482 } | |
1483 $foundhit++; | |
1484 $score ||= $txscore; $name ||= $txname; | |
1485 if ($score < $txscore) { | |
1486 $score = $txscore; | |
1487 $name=$txname; | |
1488 } | |
1489 if ($score == $txscore and defined $name and $name ne $txname) { | |
1490 $name .= ",$txname"; | |
1491 } | |
1492 if ($dbtype1 eq 'cytoBand') { #a new chromosome band is encountered | |
1493 $name ne $txname and $name .= ",$txname"; | |
1494 } | |
1495 } else { | |
1496 #query --- | |
1497 #gene <-*----*-> | |
1498 } | |
1499 } | |
1500 } | |
1501 $linecount =~ m/000000$/ and printerr "NOTICE: Finished processing $linecount variants in queryfile\n"; | |
1502 if ($foundhit) { | |
1503 $name ||= ''; | |
1504 my @name = split (/,/, $name); | |
1505 my %name = map {$_, 1} @name; | |
1506 @name = keys %name; | |
1507 | |
1508 if ($dbtype1 eq 'cytoBand') { | |
1509 map {s/^chr//} @name; | |
1510 if (@name >= 2) { | |
1511 $name[$#name] =~ s/^\d+//; | |
1512 $name = $name[0] . '-' . $name[$#name]; | |
1513 } else { | |
1514 $name = $name[0]; | |
1515 } | |
1516 print OUT "$dbtype\t$name\t$_", "\n"; | |
1517 } else { | |
1518 $name = join (",", @name); | |
1519 print OUT "$dbtype\t", $score?"Score=$score;":"", $name?"Name=$name":"", "\t", $_, "\n"; | |
1520 } | |
1521 } | |
1522 } | |
1523 close (QUERY); | |
1524 close (OUT); | |
1525 close (INVALID); | |
1526 $time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n"; | |
1527 | |
1528 printerr "NOTICE: Finished region-based annotation on $linecount genetic variants in $queryfile"; | |
1529 if ($invalidcount) { | |
1530 printerr " (including $invalidcount with invalid format written to $outfile.invalid_input)"; | |
1531 } else { | |
1532 unlink ("$outfile.invalid_input"); | |
1533 } | |
1534 printerr "\n"; | |
1535 printerr "NOTICE: Output files were written to $outfile.${buildver}_$dbtype1\n"; | |
1536 } | |
1537 | |
1538 sub readGFF3RegionAnnotation { | |
1539 my ($dbfile); | |
1540 my ($regioncount, $dbcount) = (0, 0); | |
1541 my (@record, %regiondb, %parent); | |
1542 | |
1543 $dbfile = File::Spec->catfile ($dbloc, $gff3dbfile); | |
1544 -f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc -buildver $buildver' to download annotation database.\n"; | |
1545 | |
1546 open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n"; | |
1547 printerr "NOTICE: Reading annotation database $dbfile ... "; | |
1548 $_ = <DB>; | |
1549 $_ =~ m/^##gff-version\s+3/ or die "Error: invalid header line found in the GFF3 database $dbfile (expect to see '##gff-version 3'): <$_>\n"; | |
1550 while (<DB>) { | |
1551 m/^#/ and next; #skip comments line | |
1552 m/^##FASTA/ and last; #reached the FASTA sequence section of GFF3 file | |
1553 $dbcount++; | |
1554 s/[\r\n]+$//; #deleting the newline characters | |
1555 @record = split (/\t/, $_); | |
1556 @record == 9 or die "Error: invalid records found in the GFF3 database $dbfile (9 fields expected): <$_>\n"; | |
1557 my ($chr, $start, $end, $score, $attribute) = @record[0,3,4,5,8]; | |
1558 $chr=~s/^chr//; #sometimes the chr prefix is present and should be removed (query usually does not contain this chr prefix) | |
1559 my $name; | |
1560 defined $score_threshold and $score < $score_threshold and next; #if --score_threshold is set, the low scoring segment will be skipped | |
1561 | |
1562 my @feature = split (/;/, $attribute); | |
1563 for my $i (0 .. @feature-1) { | |
1564 $feature[$i] =~ m/ID=(\S+)/ and $name = $1; | |
1565 } | |
1566 defined $name or die "Error: invalid record in GFF3 database $dbfile (ID field not found): <$_>\n"; | |
1567 for my $i (0 .. @feature-1) { | |
1568 if ($feature[$i] =~ m/Parent=(.+)/) { | |
1569 my @parent = split (/,/, $1); | |
1570 for my $j (0 .. @parent-1) { | |
1571 $parent{$name} .= $parent[$j]; | |
1572 } | |
1573 } | |
1574 } | |
1575 | |
1576 my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize)); | |
1577 for my $nextbin ($bin1 .. $bin2) { | |
1578 push @{$regiondb{$chr, $nextbin}}, [$start, $end, $score, $name]; | |
1579 } | |
1580 $regioncount++; | |
1581 if ($verbose and $dbcount =~ m/000000$/) { | |
1582 my ($availmem, $allmem) = currentAvailMemory(); | |
1583 printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n"; | |
1584 } | |
1585 } | |
1586 close (DB); | |
1587 for my $key (keys %regiondb) { #pre-sort gene DB by txstart to faciliate future use | |
1588 @{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}}; | |
1589 } | |
1590 printerr "Done with $regioncount regions\n"; | |
1591 return (\%regiondb, \%parent); | |
1592 } | |
1593 | |
1594 sub readBedRegionAnnotation { | |
1595 my ($dbfile); | |
1596 my ($regioncount, $dbcount) = (0, 0); | |
1597 my (@record, %regiondb); | |
1598 my ($chr, $start, $end); | |
1599 | |
1600 $dbfile = File::Spec->catfile ($dbloc, $bedfile); | |
1601 | |
1602 -f $dbfile or die "Error: required bedfile $dbfile does not exists.\n"; | |
1603 | |
1604 open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n"; | |
1605 printerr "NOTICE: Reading annotation database $dbfile ... "; | |
1606 | |
1607 while (<DB>) { | |
1608 $dbcount++; | |
1609 s/[\r\n]+$//; #deleting the newline characters | |
1610 @record = split (/\t/, $_); | |
1611 | |
1612 ($chr, $start, $end) = @record; | |
1613 | |
1614 | |
1615 $chr =~ s/^chr//; | |
1616 $start++; #due to the zero-opening coordinate system in UCSC | |
1617 | |
1618 my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize)); | |
1619 for my $nextbin ($bin1 .. $bin2) { | |
1620 push @{$regiondb{$chr, $nextbin}}, [$start, $end, 0, 'NA']; | |
1621 } | |
1622 $regioncount++; | |
1623 if ($verbose and $dbcount =~ m/000000$/) { | |
1624 my ($availmem, $allmem) = currentAvailMemory(); | |
1625 printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n"; | |
1626 } | |
1627 } | |
1628 close (DB); | |
1629 | |
1630 for my $key (keys %regiondb) { #pre-sort gene DB by txstart to faciliate future use | |
1631 @{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}}; | |
1632 } | |
1633 printerr "Done with $regioncount regions\n"; | |
1634 return (\%regiondb); | |
1635 } | |
1636 | |
1637 sub readUCSCRegionAnnotation { | |
1638 my ($dbfile); | |
1639 my ($regioncount, $dbcount) = (0, 0); | |
1640 my (@record, %regiondb); | |
1641 my ($chr, $start, $end, $score, $normscore, $name); | |
1642 my ($expectedLength, @positionCols, @scoreCols, @colsToOutput); | |
1643 | |
1644 if ($dbtype1 =~ m/^mce(\d+way)$/) { | |
1645 $dbfile = File::Spec->catfile ($dbloc, "${buildver}_phastConsElements$1.txt"); | |
1646 } else { | |
1647 $dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt"); | |
1648 } | |
1649 -f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc' to download annotation database.\n"; | |
1650 | |
1651 #################$$$ | |
1652 ### The following SWITCH structure is modified Jan 2011 to faciliate future expansion | |
1653 ### $expectedLength is the number of cols expected in each line | |
1654 ### @postionCols => location of ($chr,$start,$end) columns | |
1655 ### @scoreCols => location of ($score, $normscore) columns leave empty is set not present (then set to zero below) ; WARNING must be empty or of length 2 | |
1656 ### @colsToOutPut => location of ($name) columns to put into $name concatinated with ":" below | |
1657 | |
1658 if ($dbtype1 =~ m/^phastConsElements\d+way/) { | |
1659 $expectedLength=6; | |
1660 @positionCols=(1,2,3); | |
1661 @scoreCols=(4,5); #normalized score | |
1662 @colsToOutput=(4); #lod=xxx is the Name output | |
1663 } elsif ($dbtype1 eq 'evofold') { | |
1664 $expectedLength=10; | |
1665 @positionCols=(1,2,3); | |
1666 @scoreCols=(5,5); | |
1667 @colsToOutput=(4); | |
1668 } elsif ($dbtype1 eq 'tfbsConsSites') { | |
1669 $expectedLength=8; | |
1670 @positionCols=(1,2,3); | |
1671 @scoreCols=(7,5); | |
1672 @colsToOutput=(4); | |
1673 } elsif ($dbtype1 eq 'wgRna') { | |
1674 $expectedLength=10; | |
1675 @positionCols=(1,2,3); | |
1676 @scoreCols=(5,5); | |
1677 @colsToOutput=(4); | |
1678 } elsif ($dbtype1 eq 'targetScanS') { | |
1679 $expectedLength=7; | |
1680 @positionCols=(1,2,3); | |
1681 @scoreCols=(5,5); | |
1682 @colsToOutput=(4); | |
1683 } elsif ($dbtype1 eq 'genomicSuperDups') { | |
1684 $expectedLength=30; | |
1685 @positionCols=(1,2,3); | |
1686 @scoreCols=(27,27); | |
1687 @colsToOutput=(4); | |
1688 } elsif ($dbtype1 eq 'omimGene') { | |
1689 $expectedLength=5; | |
1690 @positionCols=(1,2,3); | |
1691 @scoreCols=(); | |
1692 @colsToOutput=(4); | |
1693 } elsif ($dbtype1 eq 'gwasCatalog') { | |
1694 $expectedLength=23; | |
1695 @positionCols=(1,2,3); | |
1696 @scoreCols=(); | |
1697 @colsToOutput=(10); | |
1698 } elsif ($dbtype1 eq 'dgv') { | |
1699 $expectedLength=16; | |
1700 @positionCols=(1,2,3); | |
1701 @scoreCols=(); | |
1702 @colsToOutput=(4); | |
1703 } elsif ($dbtype1 eq 'cytoBand') { #special handling required | |
1704 $expectedLength=5; | |
1705 @positionCols=(0,1,2); | |
1706 @scoreCols=(); | |
1707 @colsToOutput=(0,3); | |
1708 } elsif ($dbtype1 =~ m/^chr\w+_chainSelf$/) { #example: chr1_selfChain | |
1709 $expectedLength=13; | |
1710 @positionCols=(2,4,5); | |
1711 @scoreCols=(12,12); | |
1712 @colsToOutput=(11); | |
1713 } elsif ($dbtype1 =~ m/^chr\w+_chain\w+$/) { #example: chr1_chainPanTro2 | |
1714 $expectedLength=12; | |
1715 @positionCols=(2,4,5); | |
1716 @scoreCols=(); | |
1717 @colsToOutput=(11); | |
1718 } elsif ($dbtype1 eq 'snp130' or $dbtype1 eq 'snp131') { | |
1719 $expectedLength=18; | |
1720 @positionCols=(1,2,3); | |
1721 @scoreCols=(); | |
1722 @colsToOutput=(4); | |
1723 } else { | |
1724 #other UCSC format if file is not defined above | |
1725 $expectedLength=''; | |
1726 @positionCols=(1,2,3); | |
1727 @scoreCols=(); | |
1728 @colsToOutput=(4); | |
1729 } | |
1730 | |
1731 if ($scorecolumn) { | |
1732 @scoreCols = ($scorecolumn, $scorecolumn); | |
1733 } | |
1734 | |
1735 open (DB, $dbfile) or die "Error: cannot read from database file $dbfile: $!\n"; | |
1736 printerr "NOTICE: Reading annotation database $dbfile ... "; | |
1737 | |
1738 if ($expectedLength eq '') { # if DB is unknown "generic format" use first line to get $expectedLength : file rewound afterwards | |
1739 my $line = <DB>; | |
1740 @record = split (/\t/, $line); | |
1741 $expectedLength=@record; | |
1742 seek (DB, 0, 0); | |
1743 }; | |
1744 | |
1745 ########$$ Check to see if user has defined columns to output (intergers or all allowed) | |
1746 if (defined $colsWanted) { | |
1747 if ($colsWanted[0] eq 'all') { | |
1748 @colsToOutput= 0 .. ($expectedLength-1); | |
1749 } elsif ($colsWanted[0] eq 'none') { | |
1750 @colsToOutput = (); | |
1751 } else{ | |
1752 @colsToOutput = @colsWanted; | |
1753 } | |
1754 }; | |
1755 | |
1756 ########$$ check that the columns requested exist in the current DB | |
1757 for my $i (0 .. @colsToOutput-1) { | |
1758 if ($colsToOutput[$i] > $expectedLength) { | |
1759 die "Error: The DB file $dbfile has only $expectedLength columns but output column $colsToOutput[$i] is requested by --colsWanted!\n"; | |
1760 } | |
1761 } | |
1762 | |
1763 while (<DB>) { | |
1764 $dbcount++; | |
1765 s/[\r\n]+$//; #deleting the newline characters | |
1766 @record = split (/\t/, $_); | |
1767 | |
1768 @record == $expectedLength or die "Error: invalid record in dbfile $dbfile ($expectedLength fields expected): <$_>\n"; | |
1769 ($chr, $start, $end) = @record[@positionCols]; | |
1770 if (@colsToOutput) { #I think there should always be a Name in the output column | |
1771 $name = join (':', @record[@colsToOutput]); | |
1772 } | |
1773 | |
1774 if(@scoreCols){ | |
1775 ($score, $normscore)=(@record[@scoreCols]) | |
1776 } else{ | |
1777 ($score, $normscore) = qw/0 0/; | |
1778 } | |
1779 | |
1780 #########$$ Unusual exceptions for phastCons | |
1781 if ($dbtype1 =~ m/^phastConsElements\d+way/) { | |
1782 $score =~ s/^lod=// or die "Error: invalid lod score designation (no 'lod=' found) in dbfile $dbfile: <$_>\n"; | |
1783 } ##lod= in the score for conservation tracks | |
1784 | |
1785 #########$$ Unusual exceptions for cytoBand | |
1786 if ($dbtype1 eq 'cytoBand' and not defined $colsWanted) { #the name for chromosome band is concatenated as single word | |
1787 $name =~ s/://; | |
1788 } | |
1789 | |
1790 defined $score_threshold and $score < $score_threshold and next; #if --score_threshold is set, the low scoring segment will be skipped | |
1791 defined $normscore_threshold and $normscore < $normscore_threshold and next; #if --normscore_threshold is set, the low scoring segment will be skipped | |
1792 | |
1793 $chr =~ s/^chr//; | |
1794 $start++; #due to the zero-opening coordinate system in UCSC | |
1795 | |
1796 my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize)); | |
1797 for my $nextbin ($bin1 .. $bin2) { | |
1798 if ($rawscore) { #print out rawscore, rather than normalized score (default) | |
1799 $normscore = $score; | |
1800 } | |
1801 if (defined $name) { | |
1802 push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore, $name]; | |
1803 } else { #name is not requested in the output | |
1804 push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore]; | |
1805 } | |
1806 } | |
1807 $regioncount++; | |
1808 if ($verbose and $dbcount =~ m/000000$/) { | |
1809 my ($availmem, $allmem) = currentAvailMemory(); | |
1810 printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n"; | |
1811 } | |
1812 } | |
1813 close (DB); | |
1814 | |
1815 for my $key (keys %regiondb) { #pre-sort gene DB by txstart to faciliate future use | |
1816 @{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}}; | |
1817 } | |
1818 printerr "Done with $regioncount regions"; | |
1819 if (defined $score_threshold or $normscore_threshold) { | |
1820 printerr " (that passed --score_threhsold or --normscore_threshold from a total of $dbcount regions)\n"; | |
1821 } else { | |
1822 printerr "\n"; | |
1823 } | |
1824 return (\%regiondb); | |
1825 } | |
1826 | |
1827 | |
1828 sub translateDNA { | |
1829 my ($seq) = @_; | |
1830 my ($nt3, $protein); | |
1831 $seq = uc $seq; | |
1832 #length ($seq) % 3 == 0 or printerr "WARNING: length of DNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n"; | |
1833 while ($seq =~ m/(...)/g) { | |
1834 defined $codon1{$1} or printerr "WARNING: invalid triplets found in DNA sequence to be translated: <$1>\n"; | |
1835 $protein .= $codon1{$1}; | |
1836 } | |
1837 return $protein; | |
1838 } | |
1839 | |
1840 sub translateRNA { | |
1841 my ($seq) = @_; | |
1842 my ($nt3, $protein); | |
1843 $seq = uc $seq; | |
1844 #length ($seq) % 3 == 0 or printerr "WARNING: length of RNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n"; | |
1845 while ($seq =~ m/(...)/g) { | |
1846 defined $codonr1{$1} or printerr "WARNING: invalid triplets found in RNA sequence to be translated: <$1>\n"; | |
1847 $protein .= $codonr1{$1}; | |
1848 } | |
1849 return $protein; | |
1850 } | |
1851 | |
1852 sub revcom { | |
1853 my ($seq) = @_; | |
1854 $seq = reverse $seq; | |
1855 $seq =~ tr/acgtACGT/tgcaTGCA/; | |
1856 return ($seq); | |
1857 } | |
1858 | |
1859 sub readSeqFromFASTADB { | |
1860 my ($refseqvar) = @_; | |
1861 my (%seqhash); | |
1862 my $seqdbfile; | |
1863 | |
1864 #the four statements below should be condensed in the future (they are identical) | |
1865 $seqdbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1" . "Mrna.fa"); | |
1866 | |
1867 my ($seqid, $curseq) = ('', ''); | |
1868 | |
1869 -f $seqdbfile or die "Error: FASTA sequence file $seqdbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc' download the database.\n"; | |
1870 open (SEQ, $seqdbfile) or die "Error: cannot read from seqdbfile $seqdbfile: $!\n"; | |
1871 printerr "NOTICE: Reading FASTA sequences from $seqdbfile ... "; | |
1872 while (<SEQ>) { | |
1873 if (m/^>(\S+)/) { | |
1874 if ($refseqvar->{$seqid}) { | |
1875 not defined $seqhash{$seqid} and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it (unless the sequence is already read from the file) | |
1876 } | |
1877 $seqid = $1; | |
1878 $curseq = ''; | |
1879 } else { | |
1880 if ($refseqvar->{$seqid}) { | |
1881 s/[\r\n]+$//; | |
1882 $curseq .= uc $_; #only use upper case characters | |
1883 } | |
1884 } | |
1885 } | |
1886 if ($refseqvar->{$seqid}) { #finish the last sequence in the file | |
1887 not defined $seqhash{$seqid} and $seqhash{$seqid} = $curseq; | |
1888 } | |
1889 close (SEQ); | |
1890 printerr "Done with ", scalar keys %seqhash, " sequences\n"; | |
1891 if (keys %seqhash < keys %$refseqvar) { | |
1892 my (@seqnotfound, @seqnotfound_example); | |
1893 for $seqid (keys %$refseqvar) { | |
1894 exists $seqhash{$seqid} or push @seqnotfound, $seqid; | |
1895 } | |
1896 printerr "WARNING: A total of ${\(scalar @seqnotfound)} sequences cannot be found in $seqdbfile"; | |
1897 @seqnotfound_example = splice (@seqnotfound, 0, 3); | |
1898 printerr " (example: @seqnotfound_example)\n"; | |
1899 } | |
1900 return (\%seqhash); | |
1901 } | |
1902 | |
1903 sub readKgXref { | |
1904 my ($inputfile) = @_; | |
1905 my (%gene_xref); | |
1906 open (XREF, $inputfile) or die "Error: cannot read from kgxref file $inputfile: $!\n"; | |
1907 while (<XREF>) { | |
1908 m/^#/ and next; | |
1909 s/[\r\n]+$//; | |
1910 my @record = split (/\t/, $_); | |
1911 @record == 8 or die "Error: invalid record found in knownGene cross-reference file (6 fields expected): <$_>\n"; | |
1912 #some genes were given names that are prefixed with "Em:" which should be removed due to the presence of ":" in exonic variant annotation | |
1913 #Em:AC006547.7 Em:AC005003.4 Em:U62317.15 Em:AC008101.5 Em:AC004997.11 Em:U51561.2 | |
1914 $record[4] =~ s/^Em:/Em./; | |
1915 if ($gene_xref{$record[0]}) { #BC003168 occur twice in kgxref file (OSBPL10, BC003168) | |
1916 if ($gene_xref{$record[0]} =~ m/^(BC|AK)\d+$/) { | |
1917 $gene_xref{$record[0]} = $record[4]; | |
1918 } | |
1919 } else { | |
1920 $gene_xref{$record[0]} = $record[4]; | |
1921 } | |
1922 } | |
1923 close (XREF); | |
1924 return (\%gene_xref); | |
1925 } | |
1926 | |
1927 sub readUCSCGeneAnnotation { #read RefGene annotation database from the UCSC Genome Browser, convert 0-based coordinates to 1-based coordinates | |
1928 my ($dbloc) = @_; | |
1929 my ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes); | |
1930 my (%genedb, %geneidmap, %name2count, %cdslen, %mrnalen); | |
1931 my ($genecount, $ncgenecount) = (0, 0); | |
1932 | |
1933 my $dbfile; | |
1934 my $kgxref; | |
1935 | |
1936 if ($dbtype1 eq 'refGene') { | |
1937 $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); | |
1938 } elsif ($dbtype1 eq 'knownGene') { | |
1939 $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); | |
1940 my $kgxreffile = File::Spec->catfile($dbloc, $buildver . "_kgXref.txt"); | |
1941 -f $kgxreffile or die "Error: the knownGene cross-reference file $kgxreffile does not exist. Please use 'annotate_variation.pl --downdb knownGene $dbloc' to download the database.\n"; | |
1942 $kgxref = readKgXref ($kgxreffile); | |
1943 } elsif ($dbtype1 eq 'ensGene') { | |
1944 $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); | |
1945 } else { | |
1946 $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt"); #added 2011feb18 | |
1947 #die "FATAL ERROR: the dbype $dbtype1 is not supported in the readUCSCGeneAnnotation() subroutine.\n"; #commented 2011feb18 | |
1948 } | |
1949 -f $dbfile or die "Error: The gene annotation database $dbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc -build $buildver' to download the database.\n"; | |
1950 | |
1951 open (GENEDB, $dbfile) or die "Error: cannot read from gene annotaion database $dbfile: $!\n"; | |
1952 printerr "NOTICE: Reading gene annotation from $dbfile ... "; | |
1953 while (<GENEDB>) { | |
1954 s/[\r\n]+$//; #deleting the newline characters | |
1955 my @record = split (/\t/, $_); | |
1956 | |
1957 if ($dbtype1 eq 'refGene') { | |
1958 @record == 16 or die "Error: invalid record in $dbfile (expecting 16 tab-delimited fields in refGene file): <$_>\n"; | |
1959 ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15]; #human hg18, mouse | |
1960 } elsif ($dbtype1 eq 'knownGene') { | |
1961 @record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in knownGene file): <$_>\n"; #mm8=11, hg18=hg19=12 | |
1962 ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend) = @record[0..9]; | |
1963 $name2 = $kgxref->{$name} || $name; | |
1964 } elsif ($dbtype1 eq 'ensGene') { | |
1965 @record == 16 or die "Error: invalid record in $dbfile (expecting 16 fields in ensGene file): <$_>\n"; | |
1966 ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15]; | |
1967 } else { | |
1968 @record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in $dbtype1 gene definition file): <$_>\n"; | |
1969 ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15]; | |
1970 defined $name2 or $name2=$name; | |
1971 #die "FATAL ERROR: the --dbtype $dbtype is not supported in readUCSCGeneAnnotation() subroutine.\n"; #commented 2011feb18 | |
1972 } | |
1973 | |
1974 #handle situations where the same transcript is mapped to several chromosomes or regions (for example, NM_019105 is mapped to chr6, chr6_cox_hap1, chr6_qbl_hap2; NM_002538 is mapped to chr5 positive and negative strand and also in chr5_h2_hap1) | |
1975 if ($chr =~ m/hap\d+$/) { | |
1976 next; #this is a temporary solution on 2011feb19, to ignore alternative haplotype chromosomes | |
1977 } | |
1978 | |
1979 $chr =~ s/^chr// or die "Error: invalid record found in $dbfile (chrom field not found): <$_>\n"; #UCSC always prefix "chr" to the chromosome identifier, so this is a good check to make sure that the file is the correct file | |
1980 $dbstrand eq '+' or $dbstrand eq '-' or die "Error: invalid dbstrand information found in $dbfile (dbstrand has to be + or -): <$_>\n"; #dbstrand is important to know and cannot be optional | |
1981 my @exonstart = split (/,/, $exonstart); #remove trailing comma | |
1982 my @exonend = split (/,/, $exonend); #remove trailing comma | |
1983 $exoncount == @exonstart or die "Error: invalid record found in $dbfile (exoncount discordance): <$exoncount vs ${\(scalar @exonstart)}>\n"; | |
1984 @exonstart == @exonend or die "Error: invalid record found in $dbfile (exonstart and exonend count discordance): <${\(scalar @exonstart)} vs ${\(scalar @exonend)}>\n"; | |
1985 $txstart++; $cdsstart++; map {$_++} @exonstart; #convert 0-based coordinate to 1-based coordinate | |
1986 | |
1987 #LOGIC here: | |
1988 #first calcluate mRNA length, and if the transcript maps to multiple locations with discordant mRNA length, only consider the leftmost chromosome and leftmost coordinate (because the FASTA file is sorted in this manner) | |
1989 | |
1990 my $cdslength = 0; | |
1991 my $mrnalength = 0; | |
1992 for my $i (0 .. @exonstart-1) { #this calculation is valid regardless of strand | |
1993 $mrnalength += $exonend[$i]-$exonstart[$i]+1; | |
1994 if ($cdsstart >= $exonstart[$i] and $cdsstart <= $exonend[$i]) { | |
1995 if ($cdsend <= $exonend[$i]) { | |
1996 $cdslength = $cdsend-$cdsstart+1; | |
1997 last; | |
1998 } else { | |
1999 $cdslength += $exonend[$i]-$cdsstart+1; | |
2000 next; | |
2001 } | |
2002 } | |
2003 if ($cdslength and $cdsend < $exonstart[$i]) { | |
2004 die "FATAL ERROR: impossible scenario for $name in $dbfile (cdsend is less than exon start)"; | |
2005 } elsif ($cdslength and $cdsend <= $exonend[$i]) { | |
2006 $cdslength += $cdsend-$exonstart[$i]+1; | |
2007 last; | |
2008 } elsif ($cdslength and $cdsend > $exonend[$i]) { | |
2009 $cdslength += $exonend[$i]-$exonstart[$i]+1; | |
2010 } | |
2011 | |
2012 } | |
2013 | |
2014 if ($cdsstart != $cdsend+1) { #coding gene | |
2015 if (defined $mrnalen{$name} and $mrnalen{$name} != $mrnalength) { | |
2016 $verbose and printerr "WARNING: $name occurs more than once in $dbfile with different mRNA length. The first occurences with identical mRNA length will be uesd in analysis.\n"; | |
2017 next; | |
2018 } | |
2019 | |
2020 | |
2021 if (defined $cdslen{$name} and $cdslen{$name} != $cdslength) { | |
2022 $verbose and printerr "WARNING: $name occurs more than once in $dbfile with different CDS length. The first occurences with identical mRNA length will be uesd in analysis.\n"; | |
2023 next; | |
2024 } | |
2025 } | |
2026 | |
2027 $cdslen{$name} = $cdslength; | |
2028 $mrnalen{$name} = $mrnalength; | |
2029 | |
2030 my ($bin1, $bin2) = (int(($txstart - $neargene)/$genomebinsize), int(($txend + $neargene)/$genomebinsize)); | |
2031 for my $nextbin ($bin1 .. $bin2) { | |
2032 push @{$genedb{$chr, $nextbin}}, [$name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, [@exonstart], [@exonend], $name2]; | |
2033 } | |
2034 $geneidmap{$name} = $name2; | |
2035 $genecount++; | |
2036 $name2count{$name2}++; | |
2037 $cdsstart == $cdsend+1 and $ncgenecount++; #non-coding gene has the same start and end site | |
2038 } | |
2039 close (GENEDB); | |
2040 for my $key (keys %genedb) { #pre-sort gene DB by txstart to faciliate future use | |
2041 @{$genedb{$key}} = sort {$a->[2] <=> $b->[2]} @{$genedb{$key}}; | |
2042 } | |
2043 printerr "Done with $genecount transcripts (including $ncgenecount without coding sequence annotation) for ", scalar (keys %name2count), " unique genes\n"; | |
2044 return (\%genedb, \%geneidmap, \%cdslen, \%mrnalen); | |
2045 } | |
2046 | |
2047 sub downloadDB { | |
2048 my ($cwd, $msg, $sc); | |
2049 | |
2050 $cwd = Cwd::cwd(); | |
2051 | |
2052 -w $dbloc or die "Error: the directory $dbloc is not writable by the current user\n"; | |
2053 chdir ($dbloc) or die "Error: the directory $dbloc cannot be accessed\n"; | |
2054 | |
2055 my (@urlin, @filein, @fileout, %fail); #the fail hash contains index of files that fail to be downloaded | |
2056 my $count_success; | |
2057 if ($dbtype1 eq 'refGene') { | |
2058 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refGene.txt.gz"; | |
2059 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refLink.txt.gz"; | |
2060 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_refGeneMrna.fa.gz"; | |
2061 } elsif ($dbtype1 eq 'knownGene') { | |
2062 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/knownGene.txt.gz"; | |
2063 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/kgXref.txt.gz"; | |
2064 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_knownGeneMrna.fa.gz"; | |
2065 } elsif ($dbtype1 eq 'ensGene') { | |
2066 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/ensGene.txt.gz"; | |
2067 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_ensGeneMrna.fa.gz"; | |
2068 } elsif ($dbtype1 eq 'seq') { | |
2069 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.zip"; #example: hg18, hg19 | |
2070 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.tar.gz"; #example: panTro2 | |
2071 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/$buildver.fa.gz"; #example: bosTau4 | |
2072 } elsif ($dbtype1 =~ m/^mce(\d+way)$/) { #it could be 17 way, 28 way, 30 way, 44 way, etc, depending on genome and on build | |
2073 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/phastConsElements$1.txt.gz"; | |
2074 } elsif ($dbtype1 eq 'avsift') { | |
2075 $buildver eq 'hg18' or $buildver eq 'hg19' or die "Error: currently the --dbtype of avsift only support --buildver of 'hg18' or 'hg19'\n"; | |
2076 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_avsift.txt.gz"; | |
2077 } elsif ($dbtype1 eq '1000g') { #dbtype1 is same as queryfile, when --downdb is used | |
2078 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g' only support --buildver of 'hg18'\n"; | |
2079 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/CEU.sites.2009_04.gz"; | |
2080 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/YRI.sites.2009_04.gz"; | |
2081 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/JPTCHB.sites.2009_04.gz"; | |
2082 } elsif ($dbtype1 eq '1000g2010') { #dbtype1 is same as queryfile, when --downdb is used | |
2083 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010' only support --buildver of 'hg18'\n"; | |
2084 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_03.txt.gz"; | |
2085 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_03.txt.gz"; | |
2086 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_03.txt.gz"; | |
2087 } elsif ($dbtype1 eq '1000g2010jul') { #dbtype1 is same as queryfile, when --downdb is used | |
2088 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010jul' only support --buildver of 'hg18'\n"; | |
2089 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_07.txt.gz"; | |
2090 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_07.txt.gz"; | |
2091 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_07.txt.gz"; | |
2092 } elsif ($dbtype1 eq '1000g2010nov') { | |
2093 $buildver eq 'hg19' or die "Error: currently the --dbtype of '1000g2010nov' only support --buildver of 'hg19'\n"; | |
2094 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg19_ALL.sites.2010_11.txt.gz"; | |
2095 } elsif ($dbtype1 eq 'null') { | |
2096 1; | |
2097 } else { | |
2098 if ($webfrom) { | |
2099 if ($webfrom eq 'annovar') { | |
2100 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_$dbtype1.txt.gz"; | |
2101 } elsif ($webfrom eq 'ucsc') { | |
2102 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz"; | |
2103 } else { | |
2104 push @urlin, "$webfrom/$dbtype1.txt.gz"; | |
2105 } | |
2106 } else { | |
2107 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz"; #default goes to UCSC | |
2108 } | |
2109 } | |
2110 | |
2111 @filein = @urlin; | |
2112 map {s/.+\///} @filein; | |
2113 @fileout = @filein; | |
2114 map {s/\.gz$//; s/\.zip$//} @fileout; | |
2115 | |
2116 if ($wget) { | |
2117 $msg = qx/wget --help 2>&1/ || ''; #collect the output of the system command | |
2118 } else { | |
2119 $msg = ''; #when --nowget is specified, do not use wget to retrieve files from Internet | |
2120 } | |
2121 if ($msg =~ m/Usage/) { | |
2122 checkProgramUpdate ("wget"); | |
2123 for my $i (0 .. @urlin-1) { | |
2124 printerr "NOTICE: Downloading annotation database $urlin[$i] ... "; | |
2125 if ($verbose) { | |
2126 $sc = "wget -t 1 -T 10 -O $filein[$i] $urlin[$i]"; | |
2127 } else { | |
2128 $sc = "wget -t 1 -T 10 -q -O $filein[$i] $urlin[$i]"; | |
2129 } | |
2130 if (system ($sc)) { #time-out is 10 seconds, with 1 retry attempt | |
2131 printerr "Failed\n"; | |
2132 $verbose and print "WARNING: unable to execute system command: <$sc>\n"; | |
2133 unlink ($filein[$i]); #delete the temporary files generated by wget | |
2134 $fail{$i}++; | |
2135 } else { | |
2136 printerr "OK\n"; | |
2137 $count_success++; | |
2138 } | |
2139 } | |
2140 } else { | |
2141 eval { | |
2142 require Net::FTP; | |
2143 require LWP::UserAgent; | |
2144 }; | |
2145 if ($@) { | |
2146 printerr "WARNING: cannot retrieve remote files automatically (by 'wget' command or by standard Net::FTP/LWP::UserAgent Perl module).\n"; | |
2147 printerr "Please manually download the following file, uncompress the files to $dbloc directory, then add a ${buildver}_ prefix to the file names.\n"; | |
2148 printerr join ("\n", @urlin), "\n"; | |
2149 exit (100); | |
2150 } | |
2151 | |
2152 checkProgramUpdate ("lwp"); | |
2153 my ($http, $ftp); | |
2154 for my $i (0 .. @urlin-1) { | |
2155 printerr "NOTICE: Downloading annotation database $urlin[$i] ... "; | |
2156 if ($urlin[$i] =~ m/^http/) { | |
2157 $http = LWP::UserAgent->new (timeout=>10, show_progress=>$verbose); | |
2158 $http->env_proxy; | |
2159 | |
2160 my $response = $http->get ($urlin[$i], ':content_file'=>$filein[$i]); | |
2161 if ($response->is_success) { | |
2162 printerr "Done\n"; | |
2163 $count_success++; | |
2164 } else { | |
2165 printerr "Failed\n"; | |
2166 $verbose and printerr "WARNING: cannot retrieve remote files ($urlin[$i]) via LWP::UserAgent Perl module: ", $response->status_line, "\n"; | |
2167 $fail{$i}++; | |
2168 } | |
2169 } elsif ($urlin[$i] =~ m#^ftp://([^\\\/]+)#) { #for hgdownload.cse.ucsc.edu, ftp-trace.ncbi.nih.gov, ftp.ensembl.org, etc | |
2170 my $urlroot = $1; | |
2171 if ($ftp = Net::FTP->new($urlroot, Timeout=>10, Debug=>$verbose)) { | |
2172 $ftp->login("anonymous", 'anonymous@'); | |
2173 $ftp->binary(); | |
2174 my $url = $urlin[$i]; | |
2175 $url =~ s#ftp://[\w\.\-]+/##; #remove the URL root | |
2176 if (not $ftp->get($url)) { | |
2177 printerr "Failed\n"; | |
2178 $verbose and printerr "WARNING: cannot retrieve remote file ($url) in FTP server $urlroot\n"; | |
2179 $fail{$i}++; | |
2180 } else { | |
2181 printerr "Done\n"; | |
2182 $count_success++; | |
2183 } | |
2184 } else { | |
2185 printerr "Failed\n"; | |
2186 $verbose and printerr "WARNING: cannot retrieve remote file ($urlin[$i]) via Net::FTP Perl module\n"; | |
2187 $fail{$i}++; | |
2188 } | |
2189 | |
2190 } else { | |
2191 die "Error: The URL $urlin[$i] uses an unsupported protocol. Download cannot continue\n"; | |
2192 } | |
2193 } | |
2194 } | |
2195 | |
2196 $count_success and printerr "NOTICE: Uncompressing downloaded files\n"; | |
2197 for my $i (0 .. @filein-1) { | |
2198 $fail{$i} and next; | |
2199 if ($filein[$i] =~ m/\.zip$/) { | |
2200 $msg = qx/unzip --help 2>&1/ || ''; #collect the output of the system command | |
2201 if ($msg =~ m/Usage/i) { | |
2202 if ($verbose) { | |
2203 system ("unzip -o $filein[$i]"); | |
2204 } else { | |
2205 system ("unzip -o -q $filein[$i]"); | |
2206 } | |
2207 } else { | |
2208 printerr "ERROR: unzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n"; | |
2209 exit (101); | |
2210 } | |
2211 } elsif ($filein[$i] =~ m/\.tar\.gz$/) { #panTro2 FASTA sequence is stored as tar.gz rather than zip | |
2212 $msg = qx/tar --help 2>&1/ || ''; #collect the output of the system command | |
2213 if ($msg =~ m/Usage/i) { | |
2214 system ("tar -x -z -f $filein[$i]"); | |
2215 } else { | |
2216 printerr "ERROR: tar/gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n"; | |
2217 exit (102); | |
2218 } | |
2219 } elsif ($filein[$i] =~ m/\.gz$/) { | |
2220 $msg = qx/gunzip --help 2>&1/ || ''; #collect the output of the system command | |
2221 if ($msg =~ m/Usage/i) { | |
2222 system ("gunzip -f $filein[$i]"); | |
2223 } else { | |
2224 printerr "ERROR: gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n"; | |
2225 exit (103); | |
2226 } | |
2227 } | |
2228 } | |
2229 | |
2230 for my $i (0 .. @fileout-1) { | |
2231 $fail{$i} and next; #skip the file that failed to be downloaded | |
2232 my $fileout = $fileout[$i]; | |
2233 $dbtype1 eq 'seq' and next; #the zip file contains dozens of FASTA files so cannot rename them automatically | |
2234 if (not $fileout =~ m/^${buildver}_/) { #if the buildver is not the prefix of the files | |
2235 rename ($fileout, "${buildver}_$fileout") or die "Error: cannot rename $fileout to ${buildver}_$fileout\n"; | |
2236 $fileout = "${buildver}_$fileout"; | |
2237 } | |
2238 if (not $fileout =~ m/\.txt$/ and not $fileout =~ m/\.fa$/) { | |
2239 rename ($fileout, "$fileout.txt"); | |
2240 } | |
2241 } | |
2242 | |
2243 $count_success and printerr "NOTICE: Finished downloading annotation files for $buildver build version, with files saved at the '$dbloc' directory\n"; | |
2244 $cwd and chdir ($cwd); | |
2245 if (%fail) { | |
2246 my @failindex = keys %fail; | |
2247 if ($dbtype1 eq 'seq' and @failindex == 1) { #not really a fail, because for seq, ANNOVAR attempts on tar.gz and zip file | |
2248 1; | |
2249 } else { | |
2250 printerr "WARNING: Some files cannot be downloaded, including ", join (', ', @urlin[@failindex]), "\n"; | |
2251 } | |
2252 | |
2253 for my $index (@failindex) { | |
2254 if ($urlin[$index] =~ m#^http://www\.openbioinformatics\.org.+Mrna.fa.gz$#) { | |
2255 printerr "---------------------------ADDITIONAL PROCEDURE---------------------------\n"; | |
2256 printerr "--------------------------------------------------------------------------\n"; | |
2257 printerr "NOTICE: the FASTA file $urlin[$index] is not available to download but can be generated by the ANNOVAR software. "; | |
2258 printerr "PLEASE RUN THE FOLLOWING TWO COMMANDS CONSECUTIVELY TO GENERATE THE FASTA FILES:\n\n"; | |
2259 printerr "\tannotate_variation.pl --buildver $buildver --downdb seq $dbloc/${buildver}_seq\n"; | |
2260 printerr "\tretrieve_seq_from_fasta.pl $dbloc/${buildver}_$dbtype1.txt -seqdir $dbloc/${buildver}_seq -format $dbtype1 -outfile $dbloc/${buildver}_${dbtype1}Mrna.fa\n"; | |
2261 printerr "--------------------------------------------------------------------------\n"; | |
2262 printerr "--------------------------------------------------------------------------\n"; | |
2263 } | |
2264 } | |
2265 } | |
2266 } | |
2267 | |
2268 sub currentAvailMemory { | |
2269 my ($availmem, $allmem) = (0, 0); | |
2270 if ($^O eq "MSWin32") { #no easy solution to get available memory from Windows. | |
2271 ($availmem, $allmem) = (0, 0); | |
2272 } elsif ($^O eq 'linux' or $^O eq 'aix' or $^O eq 'solaris') { | |
2273 if (open (TOP, "top -b -n 1 2>&1 |")) { | |
2274 my $index; | |
2275 while (<TOP>) { | |
2276 if (m/^Mem:.+\s(\d+)k free/) { | |
2277 $availmem = $1; | |
2278 } | |
2279 s/^\s+//; | |
2280 my @field = split (/\s+/, $_); | |
2281 @field >= 10 or next; #make sure that the PID lines are reached | |
2282 if ($field[0] eq 'PID') { | |
2283 for my $i (0 .. @field-1) { | |
2284 $field[$i] eq 'RES' and $index = $i; | |
2285 } | |
2286 } | |
2287 if ($field[0] eq $$) { | |
2288 defined $index or die "Error: invalid output from top command: the line with PID and RES is not found\n"; | |
2289 $allmem = $field[$index]; | |
2290 if ($allmem =~ m/^([\d\.]+)(\w)$/) { | |
2291 if ($2 eq 'g') { | |
2292 $allmem = $1 * 1_000_000; | |
2293 } elsif ($2 eq 'm') { | |
2294 $allmem = $1 * 1_000; | |
2295 } elsif ($2 eq 'k') { | |
2296 $allmem = $1; | |
2297 } else { | |
2298 printerr "WARNING: unrecognizable output from top command: <$_>\n"; | |
2299 } | |
2300 } | |
2301 last; | |
2302 } | |
2303 } | |
2304 } | |
2305 } else { | |
2306 ($availmem, $allmem) = (0, 0); | |
2307 } | |
2308 return ($availmem, $allmem); | |
2309 } | |
2310 | |
2311 sub printerr { | |
2312 print STDERR @_; | |
2313 print LOG @_; | |
2314 } | |
2315 | |
2316 sub checkProgramUpdate { | |
2317 my ($method) = @_; | |
2318 my $sc; | |
2319 my ($curdate, $webdate, $webdate1) = $LAST_CHANGED_DATE; | |
2320 my (@webcontent); | |
2321 $method eq 'wget' or $method eq 'lwp' or die "Error: update checking method can be only 'wget' or 'lwp'"; | |
2322 printerr "NOTICE: Web-based checking to see whether ANNOVAR new version is available ... "; | |
2323 $LAST_CHANGED_DATE =~ m/LastChangedDate: (\d+)\-(\d+)-(\d+)/ or printerr "Failed\n" and return; | |
2324 $curdate = $1.$2.$3; | |
2325 if ($method eq 'wget') { | |
2326 $sc = "wget -t 1 -T 10 -q -O .annovar_date http://www.openbioinformatics.org/annovar/download/annovar_date"; | |
2327 if (system ($sc)) { | |
2328 printerr "Failed\n"; | |
2329 return; | |
2330 } else { | |
2331 if (not open (AVDATE, ".annovar_date")) { | |
2332 printerr "Cannot access version information\n"; | |
2333 } else { | |
2334 printerr "Done\n"; | |
2335 @webcontent = <AVDATE>; #$LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $'; | |
2336 close (AVDATE); | |
2337 unlink (".annovar_date"); | |
2338 } | |
2339 } | |
2340 } elsif ($method eq 'lwp') { | |
2341 my $http = LWP::UserAgent->new (timeout=>10); | |
2342 $http->env_proxy; | |
2343 my $response = $http->get("http://www.openbioinformatics.org/annovar/download/annovar_date"); | |
2344 if ($response->is_success) { | |
2345 printerr "Done\n"; | |
2346 $_ = $response->decoded_content; | |
2347 @webcontent = split (/\n/, $_); | |
2348 } else { | |
2349 printerr "Failed\n"; | |
2350 return; | |
2351 } | |
2352 } | |
2353 | |
2354 $webdate = $webcontent[0]; | |
2355 $webdate =~ s/[\r\n]+$//; | |
2356 $webdate1 = $webdate; | |
2357 $webdate1 =~ s/\-//g; #remove the - sign in webdate | |
2358 if ($curdate < $webdate1) { | |
2359 printerr "----------------------------UPDATE AVAILABLE------------------------------\n"; | |
2360 printerr "--------------------------------------------------------------------------\n"; | |
2361 printerr "WARNING: A new version of ANNOVAR (dated $webdate) is available!\n"; | |
2362 printerr " Download from http://www.openbioinformatics.org/annovar/\n"; | |
2363 | |
2364 if (@webcontent >= 2) { | |
2365 printerr "Changes made in the $webdate version:\n"; | |
2366 for my $i (1 .. @webcontent-1) { | |
2367 if ($webcontent[$i] =~ m/^(\d{4})\-(\d{2})\-(\d{2})[\r\n]+$/) { | |
2368 $webdate = "$1-$2-$3"; | |
2369 $webdate1 = "$1$2$3"; | |
2370 if ($curdate >= $webdate1) { #the current version is more recent than this date | |
2371 last; | |
2372 } else { | |
2373 printerr "Changes made in the $webdate version:\n"; | |
2374 } | |
2375 } else { | |
2376 printerr " * $webcontent[$i]"; | |
2377 } | |
2378 } | |
2379 } | |
2380 printerr "--------------------------------------------------------------------------\n"; | |
2381 printerr "--------------------------------------------------------------------------\n"; | |
2382 } | |
2383 } | |
2384 | |
2385 =head1 SYNOPSIS | |
2386 | |
2387 annotate_variation.pl [arguments] <query-file|table-name> <database-location> | |
2388 | |
2389 Optional arguments: | |
2390 -h, --help print help message | |
2391 -m, --man print complete documentation | |
2392 -v, --verbose use verbose output | |
2393 | |
2394 Arguments to download databases or perform annotations | |
2395 --downdb download UCSC Genome Browser annotation database | |
2396 --geneanno annotate variants by functional consequences on genes | |
2397 --regionanno annotate variants by targetting specific genomics regions | |
2398 --filter filter variants based on a position list | |
2399 --webfrom <string> specify the source of database (default usually works fine) | |
2400 | |
2401 Arguments to control input and output files | |
2402 --outfile <file> output file prefix | |
2403 --zerostart input query file uses half-open zero-start coordinate | |
2404 --dbtype <string> database type | |
2405 --buildver <string> genome build version (default: hg18 for human) | |
2406 --gff3dbfile <file> specify the GFF3 DB file used in region-based annotation | |
2407 --genericdbfile <file> specify the generic DB file used in filter-based annotation | |
2408 --vcfdbfile <file> specify the DB file in VCF format in filter-based annotation | |
2409 --bedfile <file> specify a BED file in region-based annotation | |
2410 --time print out local time during program run | |
2411 --separate separately print out all function of a variant (default: one line per variant) | |
2412 --colsWanted <string> specify which columns to output in -regionanno by comma-delimited numbers | |
2413 --comment print out comment line (those starting with #) in output files | |
2414 --scorecolumn <int> the column with scores in database file (for region-based annotation) | |
2415 --exonsort sort the exon number in output line (for gene-based annotation) | |
2416 --transcript_function use transcript name rather than gene name in gene-based annotation output | |
2417 | |
2418 Arguments to fine-tune the annotation procedure | |
2419 --batchsize <int> batch size for processing variants per batch (default: 5m) | |
2420 --genomebinsize <int> bin size to speed up search (default: 100k for -geneanno, 10k for -regionanno) | |
2421 --expandbin <int> check nearby bin to find neighboring genes (default: 2m/genomebinsize) | |
2422 --neargene <int> distance threshold to define upstream/downstream of a gene | |
2423 --score_threshold <float> minimum score of DB regions to use in annotation | |
2424 --normscore_threshold <float> minimum normalized score of DB regions to use in annotation | |
2425 --rawscore output includes the raw score (not normalized score) in UCSC Browser Track | |
2426 --minqueryfrac <float> minimum percentage of query overlap to define match to DB (default: 0) | |
2427 --splicing_threshold <int> distance between splicing variants and exon/intron boundary (default: 2) | |
2428 --maf_threshold <float> filter 1000G variants with MAF above this threshold (default: 0) | |
2429 --sift_threshold <float> SIFT threshold for deleterious prediction (default: 0.05) | |
2430 --precedence <string> comma-delimited to specify precedence of variant function (default: exonic>intronic...) | |
2431 | |
2432 Arguments to control memory usage | |
2433 --memfree <int> ensure minimum amount of free system memory (default: 100000, in the order of kb) | |
2434 --memtotal <int> limit total amount of memory used by ANNOVAR (default: 0, unlimited, in the order of kb) | |
2435 --chromosome <string> examine these specific chromosomes in database file | |
2436 | |
2437 | |
2438 Function: annotate a list of genetic variants against genome annotation | |
2439 databases saved at local disk. | |
2440 | |
2441 Example: #download gene annotation database (for hg18 build) and save to humandb/ directory | |
2442 annotate_variation.pl -downdb gene humandb/ | |
2443 annotate_variation.pl -buildver mm9 -downdb mce30way mousedb/ | |
2444 annotate_variation.pl -downdb snp130 humandb/ | |
2445 | |
2446 #gene-based annotation of variants in the varlist file (by default --geneanno is ON) | |
2447 annotate_variation.pl ex1.human humandb/ | |
2448 | |
2449 #region-based annotate variants | |
2450 annotate_variation.pl -regionanno -dbtype mce44way ex1.human humandb/ | |
2451 annotate_variation.pl -regionanno -dbtype gff3 -gff3dbfile tfbs.gff3 ex1.human humandb/ | |
2452 | |
2453 #filter rare or unreported variants (in 1000G/dbSNP) or predicted deleterious variants | |
2454 annotate_variation.pl -filter -dbtype 1000g_ceu -maf 0.01 ex1.human humandb/ | |
2455 annotate_variation.pl -filter -dbtype snp130 ex1.human humandb/ | |
2456 annotate_variation.pl -filter -dbtype avsift ex1.human humandb/ | |
2457 | |
2458 Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $ | |
2459 | |
2460 =head1 OPTIONS | |
2461 | |
2462 =over 8 | |
2463 | |
2464 =item B<--help> | |
2465 | |
2466 print a brief usage message and detailed explanation of options. | |
2467 | |
2468 =item B<--man> | |
2469 | |
2470 print the complete manual of the program. | |
2471 | |
2472 =item B<--verbose> | |
2473 | |
2474 use verbose output. | |
2475 | |
2476 =item B<--downdb> | |
2477 | |
2478 download annotation databases from UCSC Genome Browser, Ensembl, 1000 Genomes | |
2479 Project or other resources. The annotation files in this database are required | |
2480 for the functional annotation of variants. | |
2481 | |
2482 =item B<--geneanno> | |
2483 | |
2484 perform gene-based annotation. For each variant, examine whether it hit exon, | |
2485 intron, intergenic region, or close to a transcript, or hit a non-coding RNA | |
2486 gene, or is located in a untranslated region. In addition, for an exonic variant, | |
2487 determine whether it causes splicing change, non-synonymous amino acid change, | |
2488 synonymous amino acid change or frameshift changes. | |
2489 | |
2490 =item B<--regionanno> | |
2491 | |
2492 perform region-based annotation. For each variant, examine whether it overlaps | |
2493 with a specific genomic region, such as the most conserved elements, the | |
2494 predicted transcription factor binding sites, the specific cytogeneic bands, the | |
2495 evolutionarily conserved RNA secondary structures and so on. | |
2496 | |
2497 =item B<--filter> | |
2498 | |
2499 perform filter-based annotation. For each variants, filter it against a | |
2500 variation database, such as the 1000 Genomes Project database and the dbSNP | |
2501 database, and identify a subset that have not been reported in these databases | |
2502 as novel variants. | |
2503 | |
2504 =item B<--outfile> | |
2505 | |
2506 specify the output file prefix. Several output files will be generated using | |
2507 this prefix and different suffixes. A directory name can also be specified as | |
2508 part of the argument, so that the output files can be written to a different | |
2509 directory than the current directory. | |
2510 | |
2511 =item B<--zerostart> | |
2512 | |
2513 utilize the half-open zero-start coordinate system that is used by many UCSC | |
2514 Genome Browser annotation tables. By default, the 1-based coordinate system will | |
2515 be used. | |
2516 | |
2517 =item B<--dbtype> | |
2518 | |
2519 specify the database type to be used in gene-based, region-based or filter-based | |
2520 annotations. For gene-based annotation, by default refGene annotations from the | |
2521 UCSC Genome Browser will be used for annotating variants. However, users can | |
2522 switch to utilize Ensembl annotations instead, or use the UCSC Gene annotations. | |
2523 In general, RefSeq gene annotations are more conservative, and UCSC Gene | |
2524 annotations are most liberal with many predicted genes and non-coding RNAs. For | |
2525 region-based annotations, users can select any UCSC annotation databases (by | |
2526 providing the database name), or alternatively select a Generic Feature Format | |
2527 version 3 (GFF3) formatted file for annotation (by providing 'gff3' as the -- | |
2528 dbtype and providing the --gff3dbfile argument). For filter-based annotations, | |
2529 users can select a dbSNP file, a 1000G file, a generic format file (with simple | |
2530 columns including chr, start, end, reference, observed, score), a VCF format | |
2531 (which is the current popular format for variants exchange), or a avsift format | |
2532 (which is identital to the generic format but is provided for convenience). | |
2533 | |
2534 =item B<--buildver> | |
2535 | |
2536 genome build version to use. By default, the hg18 build for human genome is | |
2537 used. The build version will be used by ANNOVAR to identify corresponding database files | |
2538 automatically, for example, when gene-based annotation is used for hg18 build, | |
2539 ANNOVAR will search for the hg18_refGene.txt file, but if the hg19 is used as -- | |
2540 buildver, ANNOVAR will examine hg19_refGene.txt instead. | |
2541 | |
2542 =item B<--gff3dbfile> | |
2543 | |
2544 specify the GFF3-formatted database file used in the region-based annotation. | |
2545 | |
2546 =item B<--genericdbfile> | |
2547 | |
2548 specify the generic format database file used in the filter-based annotation. | |
2549 | |
2550 =item B<--vcfdbfile> | |
2551 | |
2552 specify the database file in VCF format in the filter-based annotation. VCF has | |
2553 been a popular format for summarizing SNP and indel calls in a population of | |
2554 samples, and has been adopted by 1000 Genomes Project in their most recent data | |
2555 release. | |
2556 | |
2557 =item B<--time> | |
2558 | |
2559 print out the local time during execution of the program | |
2560 | |
2561 =item B<--separate> | |
2562 | |
2563 for gene-based annotation, separate the effects of each variant, so that each | |
2564 effect (intronic, exonic, splicing) is printed in one output line. By default, | |
2565 all effects are printed in the same line, in the comma-separated form of | |
2566 'UTR3,UTR5' or 'exonic,splicing'. | |
2567 | |
2568 =item B<--colsWanted> | |
2569 | |
2570 specify which columns are desired in the output for -regionanno. By default, | |
2571 ANNOVAR inteligently selects the columns based on the DB type. However, users | |
2572 can use a list of comma-delimited numbers, or use 'all', or use 'none', to | |
2573 request custom output columns. | |
2574 | |
2575 =item B<--comment> | |
2576 | |
2577 specify that the program should include comment lines in the output files. | |
2578 Comment lines are defined as any line starting with #. By default, these lines | |
2579 are not recognized as valid ANNOVAR input and are therefore written to the | |
2580 INVALID_INPUT file. This argument can be very useful to keep columns headers in | |
2581 the output file, if the input file use comment line to flag the column headers | |
2582 (usually the first line in the input file). | |
2583 | |
2584 =item B<--scorecolumn> | |
2585 | |
2586 specify the the column with desired output scores in UCSC database file (for | |
2587 region-based annotation). The default usually works okay. | |
2588 | |
2589 =item B<--exonsort> | |
2590 | |
2591 sort the exon number in output line in the exonic_variant_function file during | |
2592 gene-based annotation. If a mutation affects multiple transcripts, the ones with | |
2593 the smaller exon number will be printed before the transcript with larger exon | |
2594 number in the output. | |
2595 | |
2596 =item B<--batchsize> | |
2597 | |
2598 this argument specifies the batch size for processing variants by gene-based | |
2599 annotation. Normally 5 million variants (usually one human genome will have | |
2600 about 3-5 million variants depending on ethnicity) are annotated as a batch, to | |
2601 reduce the amounts of memory. The users can adjust the parameters: larger values | |
2602 make the program slightly faster, at the expense of slightly larger memory | |
2603 requirements. In a 64bit computer, the default settings usually take 1GB memory | |
2604 for gene-based annotation for human genome for a typical query file, but this | |
2605 depends on the complexity of the query (note that the query has a few required | |
2606 fields, but may have many optional fields and those fields need to be read and | |
2607 kept in memory). | |
2608 | |
2609 =item B<--genomebinsize> | |
2610 | |
2611 the bin size of genome to speed up search. By default 100kb is used for gene- | |
2612 based annotation, so that variant annotation focused on specific bins only | |
2613 (based on the start-end site of a given variant), rather than searching the | |
2614 entire chromosomes for each variant. By default 10kb is used for region-based | |
2615 annotation. The filter-based annotations look for variants directly so no bin is | |
2616 used. | |
2617 | |
2618 =item B<--expandbin> | |
2619 | |
2620 expand bin to both sides to find neighboring genes/regions. For gene-based | |
2621 annotation, ANNOVAR tries to find nearby genes for any intergenic variant, with | |
2622 a maximum number of nearby bins to search. By default, ANNOVAR will | |
2623 automatically set this argument to search 2 megabases to the left and right of | |
2624 the variant in genome. | |
2625 | |
2626 =item B<--neargene> | |
2627 | |
2628 the distance threshold to define whether a variant is in the upstream or | |
2629 downstream region of a gene. By default 1 kilobase from the start or end site of | |
2630 a transcript is defined as upstream or downstream, respectively. This is useful, | |
2631 for example, when one wants to identify variants that are located in the | |
2632 promoter regions of genes across the genome. | |
2633 | |
2634 =item B<--score_threshold> | |
2635 | |
2636 the minimum score to consider when examining region-based annotations on UCSC | |
2637 Genome Browser tables. Some tables do not have such scores and this argument | |
2638 will not be effective. | |
2639 | |
2640 =item B<--normscore_threshold> | |
2641 | |
2642 the minimum normalized score to consider when examining region-based annotations | |
2643 on UCSC Genome Browser tables. The normalized score is calculated by UCSC, | |
2644 ranging from 0 to 1000, to make visualization easier. Some tables do not have | |
2645 such scores and this argument will not be effective. | |
2646 | |
2647 =item B<--rawscore> | |
2648 | |
2649 for region-based annotation, print out raw scores from UCSC Genome Browser | |
2650 tables, rather than normalized scores. By default, normalized scores are printed | |
2651 in the output files. Normalized scores are compiled by UCSC Genome Browser for | |
2652 each track, and they usually range from 0 to 1000, but there are some | |
2653 exceptions. | |
2654 | |
2655 =item B<--minqueryfrac> | |
2656 | |
2657 The minimum fraction of overlap between a query and a database record to decide | |
2658 on their match. By default, any overlap is regarded as a match, but this may not | |
2659 work best when query consist of large copy number variants. | |
2660 | |
2661 =item B<--splicing_threshold> | |
2662 | |
2663 distance between splicing variants and exon/intron boundary, to claim that a | |
2664 variant is a splicing variant. By default, 2bp is used. ANNOVAR is relatively | |
2665 more stringent than some other software to claim variant as regulating splicing. | |
2666 In addition, if a variant is an exonic variant, it will not be reported as | |
2667 splicing variant even if it is within 2bp to an exon/intron boundary. | |
2668 | |
2669 =item B<--maf_threshold> | |
2670 | |
2671 the minor allele frequency (MAF) threshold to be used in the filter-based | |
2672 annotation for the 1000 Genomes Project databases. By default, any variant | |
2673 annotated in the 1000G will be used in filtering. | |
2674 | |
2675 =item B<--memfree> | |
2676 | |
2677 the minimum amount of free system memory that ANNOVAR should ensure to have. By | |
2678 default, if ANNOVAR takes too much memory such that only 100Mb system memory is | |
2679 available, ANNOVAR will stop reading annotation database into memory, and will | |
2680 start annotation procedure, and then clear the memory, and then read the next | |
2681 block of annotation database again. This argument ensures that ANNOVAR will not | |
2682 attempt to use virtual memory in the system (which makes ANNOVAR extremely | |
2683 slow). | |
2684 | |
2685 =item B<--memtotal> | |
2686 | |
2687 the total amount of memory that ANNOVAR should use at most. By default, this | |
2688 value is zero, meaning that there is no limit on that. Decreasing this threshold | |
2689 reduce the memory requirement by ANNOVAR, but may increase the execution time. | |
2690 | |
2691 =item B<--chromosome> | |
2692 | |
2693 examine these specific chromosomes in database file. The argument takes comma- | |
2694 delimited values, and the dash can be correctly recognized. For example, 5-10,X | |
2695 represent chromosome 5 through chromosome 10 plus chromosome X. | |
2696 | |
2697 =back | |
2698 | |
2699 =head1 DESCRIPTION | |
2700 | |
2701 ANNOVAR is a software tool that can be used to functionally annotate a list of genetic variants, | |
2702 possibly generated from next-generation sequencing experiments. For example, | |
2703 given a whole-genome resequencing data set for a human with specific diseases, | |
2704 typically around 3 million SNPs and around half million insertions/deletions | |
2705 will be identified. Given this massive amounts of data (and candidate disease- | |
2706 causing variants), it is necessary to have a fast algorithm that scans the data | |
2707 and identify a prioritized subset of variants that are most likely functional | |
2708 for follow-up Sanger sequencing studies and functional assays. | |
2709 | |
2710 Currently, these various types of functional annotations produced by ANNOVAR can | |
2711 be (1) gene-based annotations (the default behavior), such as exonic variants, | |
2712 intronic variants, intergenic variants, downstream variants, UTR variants, | |
2713 splicing site variants, stc. For exonic variants, ANNOVAR will try to predict | |
2714 whether each of the variants is non-synonymous SNV, synonymous SNV, | |
2715 frameshifting change, nonframeshifting change. (2) region-based annotation, to | |
2716 identify whether a given variant overlaps with a specific type of genomic | |
2717 region, for example, predicted transcription factor binding site or predicted | |
2718 microRNAs.(3) filter-based annotation, to filter a list of variants so that only | |
2719 those not observed in variation databases (such as 1000 Genomes Project and | |
2720 dbSNP) are printed out. | |
2721 | |
2722 Currently, I am expanding the functionality of ANNOVAR on (1) Fusion gene | |
2723 detection from large deletions, where a deletion joins the reading frame of two | |
2724 genes (same orientation of transcription) together to create a new gene. (2) | |
2725 Assignment of functional importance score to each observed mutation in the | |
2726 genome. This will be extremely important for the development of association | |
2727 tests for rare variants, and for prioritization of variants in downstream | |
2728 functional studies after a successful genome-wide association studies (GWAS). | |
2729 | |
2730 =over 8 | |
2731 | |
2732 =item * B<variant file format> | |
2733 | |
2734 A sample variant file contains one variant per line, with the fields being chr, | |
2735 start, end, reference allele, observed allele, other information. The other | |
2736 information can be anything (for example, it may contain sample identifiers for | |
2737 the corresponding variant.) An example is shown below: | |
2738 | |
2739 16 49303427 49303427 C T rs2066844 R702W (NOD2) | |
2740 16 49314041 49314041 G C rs2066845 G908R (NOD2) | |
2741 16 49321279 49321279 - C rs2066847 c.3016_3017insC (NOD2) | |
2742 16 49290897 49290897 C T rs9999999 intronic (NOD2) | |
2743 16 49288500 49288500 A T rs8888888 intergenic (NOD2) | |
2744 16 49288552 49288552 T - rs7777777 UTR5 (NOD2) | |
2745 18 56190256 56190256 C T rs2229616 V103I (MC4R) | |
2746 | |
2747 =item * B<database file format: UCSC Genome Browser annotation database> | |
2748 | |
2749 Most but not all of the gene annotation databases are directly downloaded from | |
2750 UCSC Genome Browser, so the file format is identical to what was used by the | |
2751 genome browser. The users can check Table Browser (for example, human hg18 table | |
2752 browser is at http://www.genome.ucsc.edu/cgi-bin/hgTables?org=Human&db=hg18) to | |
2753 see what fields are available in the annotation file. Note that even for the | |
2754 same species (such as humans), the file format might be different between | |
2755 different genome builds (such as between hg16, hg17 and hg18). ANNOVAR will try | |
2756 to be smart about guessing file format, based on the combination of the -- | |
2757 buildver argument and the number of columns in the input file. In general, the | |
2758 database file format should not be something that users need to worry about. | |
2759 | |
2760 =item * B<database file format: GFF3 format for gene-based annotations)> | |
2761 | |
2762 As of June 2010, ANNOVAR cannot perform gene-based annotations using GFF3 input | |
2763 files, and any annotations on GFF3 is region-based. However, this is expected to | |
2764 be changed in the future. | |
2765 | |
2766 =item * B<database file format: GFF3 format for region-based annotations)> | |
2767 | |
2768 Currently, region-based annotations can support the Generic Feature Format | |
2769 version 3 (GFF3) formatted files. The GFF3 has become the de facto golden | |
2770 standards for many model organism databases, such that many users may want to | |
2771 take a custom annotation database and run ANNOVAR on them, and it would be the | |
2772 most convenient if the custom file is made with GFF3 format. | |
2773 | |
2774 =item * B<database file format: generic format for filter-based annotations)> | |
2775 | |
2776 The 'generic' format is designed for filter-based annotation that looks for | |
2777 exact variants. The format is almost identical to the ANNOVAR input format, with | |
2778 chr, start, end, reference allele, observed allele and scores (higher scores are | |
2779 regarded as better). | |
2780 | |
2781 =item * B<database file format: VCF format for filter-based annotations)> | |
2782 | |
2783 The 1000 Genomes Project now provide their variant annotations in VCF format, so | |
2784 I implemented the functionality to directly interrogate VCF files. A VCF file | |
2785 may contain summary information for variants (for example, this variant has MAF | |
2786 of 5% in this population), or it may contain the actual variant calls for each | |
2787 individual in a specific population. As of March 2010, the files from 1000G website | |
2788 only contains the first type of information (that is, alleles and their | |
2789 frequencies in population). For the purpose of simplicity, ANNOVAR only | |
2790 interrogates the first type of information. | |
2791 | |
2792 =item * B<database file format: avsift for filter-based annotations)> | |
2793 | |
2794 avsift refers to a file that ANNOVAR developers compiled for fast annotation of | |
2795 SIFT scores for non-synonymous variants in the human genome. It conforms to the | |
2796 generic format described above. However, users can directly specify '--dbtype | |
2797 avsift' in command line to perform avsift annotations, making it more convenient | |
2798 for users. Alternatively, users can use '--dbtype generic -genericdbfile | |
2799 hg18_avsift.txt' for the annotation, and the effects are usually the same. | |
2800 | |
2801 =item * B<sequence file format> | |
2802 | |
2803 ANNOVAR can directly examine FASTA-formatted sequence files. For mRNA sequences, | |
2804 the name of the sequences are the mRNA identifier. For genomic sequences, the | |
2805 name of the sequences in the files are usually chr1, chr2, chr3, etc, so that | |
2806 ANNOVAR knows which sequence corresponds to which chromosome. Unfortunately, | |
2807 UCSC uses things like chr6_random to annotate un-assembled sequences, as opposed | |
2808 to using the actual contig identifiers. This causes some issues (depending on | |
2809 how reads alignment algorithms works), but in general should not be something | |
2810 that user need to worry about. If the users absolutely care about the exact | |
2811 contigs rather than chr*_random, then they will need to re-align the short reads | |
2812 at chr*_random to a different FASTA file that contains the contigs, and then | |
2813 execute ANNOVAR on the newly identified variants. | |
2814 | |
2815 =item * B<invalid input> | |
2816 | |
2817 If the query file contains input lines with invalid format, ANNOVAR will skip | |
2818 such line and continue with the annotation on next lines. These invalid input | |
2819 lines will be written to a file with suffix invalid_input. Users should manually | |
2820 examine this file and identify sources of error. | |
2821 | |
2822 =back | |
2823 | |
2824 ANNOVAR is freely available to the academic community for non-commercial use. For | |
2825 questions or comments, please contact kai@openbioinformatics.org. | |
2826 | |
2827 =cut |