annotate gfapts/inc/annovar/annotate_variation.pl @ 0:f753b30013e6 draft

Uploaded
author rdaveau
date Fri, 29 Jun 2012 10:20:55 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1 #!/usr/bin/perl
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2 use warnings;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
3 use strict;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
4 use Pod::Usage;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
5 use Getopt::Long;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
6 use File::Spec;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
7 use Cwd;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
8
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Version stamps (SVN keyword strings, kept verbatim).
our $VERSION           = '$Revision: 466 $';
our $LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';

# Generic switches: --verbose, --help, --man.
our ($verbose, $help, $man);

# The two positional command-line arguments.
our ($queryfile, $dbloc);

# Operation selectors; processArguments() allows at most one of them
# and falls back to --geneanno when none is given.
our ($geneanno, $regionanno, $filter, $downdb);

# Database selection and database files.
our ($dbtype, $buildver, $gff3dbfile, $genericdbfile, $vcfdbfile, $bedfile);

# Output and input layout.
our ($outfile,      # prefix of output file names (defaults to $queryfile)
     $separate, $comment, $zerostart, $rawscore, $exonsort,
     $transfun,     # set by --transcript_function
     $colsWanted,   # raw --colsWanted string; parsed form is @colsWanted
     $avcolumn,     # raw --avcolumn string;   parsed form is @avcolumn
     $scorecolumn);

# Thresholds and distances.
our ($neargene,             # variant-to-gene distance for upstream/downstream calls
     $splicing_threshold,   # variant-to-exon/intron-boundary distance for splicing calls
     $score_threshold, $normscore_threshold,
     $minqueryfrac,         # minimum query overlap to declare a match with a database record
     $maf_threshold,        # MAF threshold used when filtering on 1000 Genomes Project data
     $sift_threshold);

# Scanning and resource controls.
our ($batchsize, $genomebinsize,
     $expandbin,            # number of nearby bins searched for the closest gene
     $chromosome, $memfree, $memtotal, $time);

# Download controls (--wget / --nowget, --webfrom).
our ($wget, $webfrom);

# Raw --precedence string; parsed form is @precedence.
our ($precedence);

# Derived state filled in by processArguments():
#   %valichr - chromosomes selected via --chromosome
#   $dbtype1 - database name resolved from $dbtype (or from $queryfile for --downdb)
our (%valichr, $dbtype1);

# Comma-delimited options after splitting.
our (@precedence, @colsWanted, @avcolumn);

# Forward declaration so printerr can be called without parentheses
# before its definition is reached.
sub printerr;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
20
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Genetic-code lookup tables.
#   %codon1 / %codon3 / %codonfull    : DNA codon => 1-letter / 3-letter / full residue name
#   %codonr1 / %codonr3 / %codonrfull : the same three tables keyed by RNA codon (U instead of T)
# Stop codons map to '*' in the 1- and 3-letter tables and to 'Stop' in the full-name tables.
our (%codon1, %codon3, %codonfull);
our (%codonr1, %codonr3, %codonrfull);
{
    # The only hand-maintained table: DNA codon => 1-letter code,
    # laid out one row per pair of leading bases.
    %codon1 = (
        TTT => 'F', TTC => 'F', TTA => 'L', TTG => 'L',
        TCT => 'S', TCC => 'S', TCA => 'S', TCG => 'S',
        TAT => 'Y', TAC => 'Y', TAA => '*', TAG => '*',
        TGT => 'C', TGC => 'C', TGA => '*', TGG => 'W',
        CTT => 'L', CTC => 'L', CTA => 'L', CTG => 'L',
        CCT => 'P', CCC => 'P', CCA => 'P', CCG => 'P',
        CAT => 'H', CAC => 'H', CAA => 'Q', CAG => 'Q',
        CGT => 'R', CGC => 'R', CGA => 'R', CGG => 'R',
        ATT => 'I', ATC => 'I', ATA => 'I', ATG => 'M',
        ACT => 'T', ACC => 'T', ACA => 'T', ACG => 'T',
        AAT => 'N', AAC => 'N', AAA => 'K', AAG => 'K',
        AGT => 'S', AGC => 'S', AGA => 'R', AGG => 'R',
        GTT => 'V', GTC => 'V', GTA => 'V', GTG => 'V',
        GCT => 'A', GCC => 'A', GCA => 'A', GCG => 'A',
        GAT => 'D', GAC => 'D', GAA => 'E', GAG => 'E',
        GGT => 'G', GGC => 'G', GGA => 'G', GGG => 'G',
    );

    # 1-letter code => [3-letter code, full name].
    my %residue_names = (
        A   => ['Ala', 'Alanine'],
        C   => ['Cys', 'Cysteine'],
        D   => ['Asp', 'Aspartic acid'],
        E   => ['Glu', 'Glutamic acid'],
        F   => ['Phe', 'Phenylalanine'],
        G   => ['Gly', 'Glycine'],
        H   => ['His', 'Histidine'],
        I   => ['Ile', 'Isoleucine'],
        K   => ['Lys', 'Lysine'],
        L   => ['Leu', 'Leucine'],
        M   => ['Met', 'Methionine'],
        N   => ['Asn', 'Asparagine'],
        P   => ['Pro', 'Proline'],
        Q   => ['Gln', 'Glutamine'],
        R   => ['Arg', 'Arginine'],
        S   => ['Ser', 'Serine'],
        T   => ['Thr', 'Threonine'],
        V   => ['Val', 'Valine'],
        W   => ['Trp', 'Tryptophan'],
        Y   => ['Tyr', 'Tyrosine'],
        '*' => ['*',   'Stop'],
    );

    # Derive the other five tables so they can never drift apart.
    for my $dna_codon (keys %codon1) {
        my $one_letter = $codon1{$dna_codon};
        my ($three_letter, $full_name) = @{ $residue_names{$one_letter} };

        (my $rna_codon = $dna_codon) =~ tr/T/U/;    # RNA spelling of the same codon

        $codon3{$dna_codon}     = $three_letter;
        $codonfull{$dna_codon}  = $full_name;
        $codonr1{$rna_codon}    = $one_letter;
        $codonr3{$rna_codon}    = $three_letter;
        $codonrfull{$rna_codon} = $full_name;
    }
}

# IUPAC nucleotide code => the bases it stands for. Unambiguous bases are
# written doubled (A => 'AA'); '.' and '-' both map to '-'.
our %iupac = (
    R => 'AG',  Y => 'CT',  S => 'GC',  W => 'AT',  K => 'GT',  M => 'AC',
    A => 'AA',  C => 'CC',  G => 'GG',  T => 'TT',
    B => 'CGT', D => 'AGT', H => 'ACT', V => 'ACG',
    N => 'ACGT',
    '.' => '-', '-' => '-',
);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
28
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Main program: parse the command line, then run the single selected operation.
# processArguments() rejects more than one operation flag and turns on
# --geneanno when none is given, so exactly one branch below runs.
processArguments ();            # process program arguments, set up default values, check for errors

if ($geneanno) {
    # gene-based annotations (classify variants into intergenic, intronic,
    # non-synonymous, synonymous, UTR, frameshift, etc)
    annotateQueryByGene ();
} elsif ($regionanno) {
    # region-based annotations (most conserved elements, transcription
    # factor binding sites, etc)
    annotateQueryByRegion ();
} elsif ($filter) {
    # filter-based annotations (identify variants not reported in
    # variation databases)
    filterQuery ();
} elsif ($downdb) {
    # download annotation databases from Internet
    downloadDB ();
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
39
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
40 sub processArguments {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
41 my @command_line = @ARGV; #command line argument
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
42 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'outfile=s'=>\$outfile, 'separate'=>\$separate,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
43 'batchsize=s'=>\$batchsize, 'dbtype=s'=>\$dbtype, 'neargene=i'=>\$neargene, 'genomebinsize=s'=>\$genomebinsize,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
44 'geneanno'=>\$geneanno, 'regionanno'=>\$regionanno, , 'filter'=>\$filter, 'downdb'=>\$downdb, 'buildver=s'=>\$buildver, 'score_threshold=f'=>\$score_threshold,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
45 'normscore_threshold=i'=>\$normscore_threshold, 'minqueryfrac=f'=>\$minqueryfrac, 'expandbin=i'=>\$expandbin, 'splicing_threshold=i'=>\$splicing_threshold,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
46 'maf_threshold=f'=>\$maf_threshold, 'chromosome=s'=>\$chromosome, 'zerostart'=>\$zerostart, 'rawscore'=>\$rawscore, 'memfree=i'=>\$memfree,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
47 'memtotal=i'=>\$memtotal, 'sift_threshold=f'=>\$sift_threshold, 'gff3dbfile=s'=>\$gff3dbfile, 'genericdbfile=s'=>\$genericdbfile, 'vcfdbfile=s'=>\$vcfdbfile,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
48 'time'=>\$time, 'wget!'=>\$wget, 'precedence=s'=>\$precedence, 'webfrom=s'=>\$webfrom, 'colsWanted=s'=>\$colsWanted, 'comment'=>\$comment,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
49 'scorecolumn=i'=>\$scorecolumn, 'transcript_function'=>\$transfun, 'exonsort'=>\$exonsort, 'avcolumn=s'=>\$avcolumn, 'bedfile=s'=>\$bedfile) or pod2usage ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
50
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
51 $help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
52 $man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
53 @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
54 @ARGV == 2 or pod2usage ("Syntax error");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
55
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
56 ($queryfile, $dbloc) = @ARGV;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
57
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
58 $dbloc =~ s/[\\\/]$//; #delete the trailing / or \ sign as part of the directory name
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
59 if (defined $batchsize) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
60 $batchsize =~ s/k$/000/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
61 $batchsize =~ s/m$/000000/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
62 $batchsize =~ m/^\d+$/ or pod2usage ("Error: the --batchsize argument must be a positive integer (suffix of k or m is okay)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
63 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
64 $batchsize = 5_000_000;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
65 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
66 if (defined $genomebinsize) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
67 $genomebinsize =~ s/k$/000/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
68 $genomebinsize =~ s/m$/000000/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
69 $genomebinsize =~ m/^\d+$/ or pod2usage ("Error: the --genomebinsize argument must be a positive integer (suffix of k or m is okay)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
70 $genomebinsize > 1000 or pod2suage ("Error: the --genomebinsize argument must be larger than 1000");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
71 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
72 if ($geneanno) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
73 $genomebinsize = 100_000; #gene usually span large genomic regions
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
74 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
75 $genomebinsize = 10_000; #MCE, TFBS, miRNA, etc are small genomic regions
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
76 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
77 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
78
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
79 $verbose ||= 0; #when it is not specified, it is zero
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
80 $neargene ||= 1_000; #for upstream/downstream annotation of variants, specify the distance threshold between variants and genes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
81 $expandbin ||= int(2_000_000/$genomebinsize); #for gene-based annotations, when intergenic variants are found, expand to specified number of nearby bins to find closest genes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
82 $outfile ||= $queryfile; #specify the prefix of output file names
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
83
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
84 #set up log file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
85 if ($downdb) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
86 if (not -d $dbloc) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
87 mkdir ($dbloc) or die "Error: the directory $dbloc does not exist and cannot be created\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
88 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
89 my $errfile = File::Spec->catfile ($dbloc, "annovar_downdb.log");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
90 open (LOG, ">$errfile") or die "Error: cannot write LOG information to log file $errfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
91 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
92 open (LOG, ">$outfile.log") or die "Error: cannot write LOG information to log file $outfile.log: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
93 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
94 print LOG "ANNOVAR Version:\n\t", q/$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $/, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
95 print LOG "ANNOVAR Information:\n\tFor questions, comments, documentation, bug reports and program update, please visit http://www.openbioinformatics.org/annovar/\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
96 print LOG "ANNOVAR Command:\n\t$0 @command_line\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
97 print LOG "ANNOVAR Started:\n\t", scalar (localtime), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
98
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
99 my $num = 0;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
100 $geneanno and $num++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
101 $downdb and $num++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
102 $filter and $num++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
103 $regionanno and $num++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
104 $num <= 1 or pod2usage ("Error in argument: please specify only one of --geneanno, -regionanno, --downdb, --filter");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
105 if (not $num) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
106 $geneanno++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
107 printerr "NOTICE: The --geneanno operation is set to ON by default\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
108 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
109
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
110 my %dbtype1 = ('gene'=>'refGene', 'refgene'=>'refGene', 'knowngene'=>'knownGene', 'ensgene'=>'ensGene', 'band'=>'cytoBand', 'cytoband'=>'cytoBand', 'tfbs'=>'tfbsConsSites', 'mirna'=>'wgRna',
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
111 'mirnatarget'=>'targetScanS', 'segdup'=>'genomicSuperDups', 'omimgene'=>'omimGene', 'gwascatalog'=>'gwasCatalog',
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
112 '1000g_ceu'=>'CEU.sites.2009_04', '1000g_yri'=>'YRI.sites.2009_04', '1000g_jptchb'=>'JPTCHB.sites.2009_04',
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
113 '1000g2010_ceu'=>'CEU.sites.2010_03', '1000g2010_yri'=>'YRI.sites.2010_03', '1000g2010_jptchb'=>'JPTCHB.sites.2010_03',
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
114 '1000g2010jul_ceu'=>'CEU.sites.2010_07', '1000g2010jul_yri'=>'YRI.sites.2010_07', '1000g2010jul_jptchb'=>'JPTCHB.sites.2010_07',
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
115 '1000g2010nov_all'=>'ALL.sites.2010_11',
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
116 );
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
117
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
118 if ($geneanno) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
119 $dbtype ||= 'refGene';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
120 $dbtype1 = $dbtype1{$dbtype} || $dbtype;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
121 #$dbtype1 =~ m/^(refGene|knownGene|ensGene)$/ or pod2usage ("Error: the gene-based annotation procedure currently only support -dbtype of refGene, knownGene and ensGene"); #commented 2011feb18
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
122 } elsif ($regionanno) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
123 defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --regionanno operation)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
124 $dbtype1 = $dbtype1{$dbtype} || $dbtype;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
125 if ($dbtype =~ m/^mce(\d+)way/) { #added 2010Feb16
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
126 $dbtype1 = "phastConsElements$1way";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
127 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
128 if ($dbtype1 eq 'gff3') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
129 defined $gff3dbfile or pod2usage ("Error in argument: please specify --gff3dbfile for the --dbtype of 'gff3'");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
130 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
131 } elsif ($filter) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
132 defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --filter operation)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
133 $dbtype =~ m/^avsift|generic|1000g_(ceu|yri|jptchb)|1000g2010_(ceu|yri|jptchb)|1000g20\d\d[a-z]{3}_[a-z]+|snp\d+|vcf|(ljb_\w+)$/ or pod2usage ("Error in argument: the specified --dbtype $dbtype is not valid for --filter operation (valid ones are '1000g_ceu', '1000g2010_yri', 'snp129', 'avsift', 'vcf', 'generic', etc)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
134 $dbtype1 = $dbtype1{$dbtype} || $dbtype;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
135 if ($dbtype1 eq 'generic') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
136 defined $genericdbfile or pod2usage ("Error in argument: please specify --genericdbfile for the --dbtype of 'generic'");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
137 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
138 if ($dbtype eq 'vcf') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
139 defined $vcfdbfile or pod2usage ("Error in argument: please specify --vcfdbfile for the --dbtype of 'vcf'");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
140 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
141 } elsif ($downdb) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
142 defined $dbtype and pod2usage ("Error in argument: please do not specify --dbtype for the --downdb operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
143 $dbtype1 = $dbtype1{$queryfile} || $queryfile;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
144 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
145
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
146 if (not $buildver) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
147 $buildver = 'hg18';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
148 printerr "NOTICE: The --buildver is set as 'hg18' by default\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
149 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
150
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
151 if ($score_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
152 $score_threshold > 0 or pod2usage ("Error in argument: the --score_threshold must be a positive number (you specified $score_threshold)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
153 $geneanno || $downdb and pod2usage ("Error in argument: the --score_threshold is not useful for --geneanno or --downdb operations");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
154 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
155 if ($normscore_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
156 $normscore_threshold <= 1000 or pod2usage ("Error in argument: the --normscore_threshold must be between 0 and 1000 (you specified $normscore_threshold)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
157 $regionanno or pod2usage ("Error in argument: the --score_threshold is supported only for the --regionanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
158 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
159
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
160
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
161 if (defined $sift_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
162 $filter or pod2usage ("Error in argument: the --sift_threshold is supported only for the --filter operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
163 $dbtype1 eq 'avsift' or pod2usage ("Error in argument: the --sift_threshold argument can be used only if '--dbtype avsift' is used");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
164 $sift_threshold >= 0 and $sift_threshold <= 1 or pod2usage ("Error in argument: the --sift_threshold must be between 0 and 1 inclusive");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
165 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
166 $sift_threshold = 0.05;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
167 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
168
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
169 #operation-specific argument
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
170 if (defined $splicing_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
171 $geneanno or pod2usage ("Error in argument: the --splicing_threshold is supported only for the --geneanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
172 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
173 $splicing_threshold = 2; #for splicing annotation, specify the distance threshold between variants and exon/intron boundaries
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
174 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
175 if (defined $maf_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
176 $filter or pod2usage ("Error in argument: the --maf_threshold is supported only for the --filter operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
177 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
178 $maf_threshold = 0; #for filter-based annotations on 1000 Genomes Project data, specify the MAF threshold to be used in filtering
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
179 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
180 if (defined $minqueryfrac) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
181 $regionanno or pod2usage ("Error in argument: the --minqueryfrac is supported only for the --regionanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
182 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
183 $minqueryfrac = 0; #minimum query overlap to declare a "match" with database records
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
184 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
185 if (defined $gff3dbfile) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
186 $dbtype eq 'gff3' or pod2usage ("Error in argument: the --gff3dbfile argument can be used only if '--dbtype gff3' is used");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
187 $geneanno or $regionanno or pod2usage ("Error in argument: the --gff3dbfile argument is supported only for the --geneanno or --regionanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
188 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
189 if (defined $bedfile) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
190 $dbtype eq 'bed' or pod2usage ("Error in argument: the --bedfile argument can be used only if '--dbtype bed' is used");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
191 $regionanno or pod2usage ("Error in argument: the --bedfile argument is supported only for the --regionanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
192 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
193 if (defined $genericdbfile) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
194 $filter or pod2usage ("Error in argument: the --genericdbfile argument is supported only for the --filter operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
195 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
196 if (defined $wget) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
197 $downdb or pod2usage ("Error in argument: the --wget argument is supported only for the --downdb operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
198 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
199 $wget = 1; #by default, use wget for downloading files from Internet
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
200 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
201 if (defined $precedence) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
202 $geneanno or pod2usage ("Error in argument: the --precedence argument is supported only for the --geneanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
203 @precedence = split (/,/, $precedence);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
204 @precedence >= 2 or pod2usage ("Error in argument: the --precedence argument should be comma delimited");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
205 for my $i (0 .. @precedence-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
206 $precedence[$i] =~ m/^(exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing|ncrna)$/ or pod2usage ("Error in argument: the --precedence argument contains invalid keywords (valid ones are exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
207 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
208 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
209
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
210 if (defined $colsWanted) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
211 $regionanno or pod2usage ("Error in argument: the --colWanted argument is supported only for the --geneanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
212 if (lc $colsWanted eq 'all') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
213 @colsWanted = ('all');
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
214 } elsif (lc $colsWanted eq 'none') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
215 @colsWanted = ('none');
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
216 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
217 @colsWanted = split (/,/, $colsWanted);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
218 for my $i (0 .. @colsWanted-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
219 $colsWanted[$i]=~m/^\d+$/ or pod2usage ("Error in argument: the --colsWanted argument ($colsWanted) must be a list of comma delimited numbers or be 'all' or be 'none'");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
220 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
221 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
222 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
223
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
224 if (defined $scorecolumn) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
225 $regionanno or pod2usage ("Error in argument: the --scorecolumn argument is supported only for the --regionanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
226 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
227
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
228 if ($exonsort) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
229 $geneanno or pod2usage ("Error in argument: the --exonsort argument is supported only for the --geneanno operation");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
230 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
231
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
232 if (defined $avcolumn) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
233 $avcolumn =~ m/^\d+,\d+,\d+,\d+,\d+$/ or pod2usage ("Error in argument: the --avcolumn argument must be five integer numbers separated by comma");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
234 @avcolumn = split (/,/, $avcolumn);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
235 @avcolumn = map {$_-1} @avcolumn;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
236 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
237 @avcolumn = (0..4); #by default, the first five columns are the required AVINPUT information
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
238 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
239
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
240 if (defined $webfrom) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
241 if ($webfrom ne 'ucsc' and $webfrom ne 'annovar') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
242 $webfrom =~ m#^(http://|ftp://)# or pod2usage ("Error: the --webfrom argument needs to be 'ucsc', 'annovar', or a URL");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
243 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
244 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
245
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
246 $maf_threshold >= 0 and $maf_threshold <= 0.5 or pod2usage ("Error in argument: the --maf_threshold must be between 0 and 0.5 (you specified $maf_threshold)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
247 $minqueryfrac >= 0 and $minqueryfrac <= 1 or pod2usage ("Error in argument: the --minqueryfrac must be between 0 and 1 (you specified $minqueryfrac)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
248 $memfree and $memfree >= 100_000 || pod2usage ("Error in argument: the --memfree argument must be at least 100000 (in the order of kilobytes)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
249 $memtotal and $memtotal >= 100_000 || pod2usage ("Error in argument: the --memtotal argument must be at least 100000 (in the order of kilobytes)");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
250
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
251 if ($chromosome) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
252 my @chr = split (/,/, $chromosome);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
253 for my $i (0 .. @chr-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
254 if ($chr[$i] =~ m/^(\d+)-(\d+)$/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
255 for my $j ($1 .. $2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
256 $valichr{$j}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
257 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
258 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
259 $valichr{$chr[$i]}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
260 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
261 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
262 printerr "NOTICE: These chromosomes in database will be examined: ", join (",", sort keys %valichr), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
263 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
264 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
265
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
266
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Run gene-based annotation over every variant in the query file.
#
# Takes no arguments; works from the file-level settings $queryfile, $outfile,
# $dbloc, $batchsize and $time. It opens the query file and three outputs
# derived from $outfile (.variant_function, .exonic_variant_function and
# .invalid_input) on the package-wide handles OUT, EXONIC and INVALID, loads
# the gene annotation through readUCSCGeneAnnotation, and then hands the query
# to newprocessNextQueryBatchByGene in batches of up to $batchsize lines.
# OUT and INVALID are written by that sub; EXONIC is presumably written there
# as well (not visible from here -- confirm).
#
# The .invalid_input file is deleted when no invalid record was counted.
# Dies when a file cannot be opened or an output file cannot be closed.
sub annotateQueryByGene {
	my ($queryfh);						#query file handle
	my ($totalquerycount, $totalinvalidcount, $batchcount) = qw/0 0 1/;

	#the query file is deliberately left in 2-argument form so that invocations relying on its filename magic (such as '-' for STDIN) keep working
	open ($queryfh, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n";

	#3-argument open: the output name is taken literally, it can never be interpreted as part of the open mode
	open (OUT, '>', "$outfile.variant_function") or die "Error: cannot write to output file $outfile.variant_function: $!\n";
	open (EXONIC, '>', "$outfile.exonic_variant_function") or die "Error: cannot write to output file $outfile.exonic_variant_function: $!\n";
	open (INVALID, '>', "$outfile.invalid_input") or die "Error: cannot write to output file $outfile.invalid_input: $!\n";

	my ($genedb, $geneidmap, $cdslen, $mrnalen) = readUCSCGeneAnnotation ($dbloc);

	$time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n";
	while (1) {
		my ($linecount, $invalidcount) = newprocessNextQueryBatchByGene ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen);
		$totalquerycount += $linecount;
		$totalinvalidcount += $invalidcount;

		#stop at end of file, NOT on a short batch: with --comment the batch reader consumes '#' lines without counting them,
		#so a full batch holding a comment line reports fewer than $batchsize variants although more input remains
		eof ($queryfh) and last;

		$batchcount++;
		printerr "NOTICE: Begin processing batch $batchcount (each batch contains $batchsize variants)\n";
	}

	#buffered write errors (for example a full disk) only surface at close, so they must not be ignored
	close (INVALID) or die "Error: cannot close output file $outfile.invalid_input: $!\n";
	close (EXONIC) or die "Error: cannot close output file $outfile.exonic_variant_function: $!\n";
	close (OUT) or die "Error: cannot close output file $outfile.variant_function: $!\n";
	close ($queryfh);
	$time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n";

	$totalinvalidcount or unlink ("$outfile.invalid_input");	#delete the file as it is empty
	printerr "NOTICE: Finished gene-based annotation on $totalquerycount genetic variants in $queryfile";
	$totalinvalidcount and printerr " (including $totalinvalidcount with invalid format written to $outfile.invalid_input)";
	printerr "\n";
	printerr "NOTICE: Output files were written to $outfile.variant_function, $outfile.exonic_variant_function\n";
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
299
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
300 sub newprocessNextQueryBatchByGene {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
301 my ($queryfh, $batchsize, $genedb, $geneidmap, $cdslen, $mrnalen) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
302 my (%refseqvar);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
303
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
304 my ($chr, $start, $end, $ref, $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
305 my ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
306 my ($invalid);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
307 my ($linecount, $invalidcount) = qw/0 0/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
308
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
309 for my $i (1 .. $batchsize) { #process up to batchsize variants
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
310 my $nextline = <$queryfh>; #read the next line in variant file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
311 defined $nextline or last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
312 $nextline =~ s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
313
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
314 if ($nextline =~ m/^#/ and $comment) { #comment line start with #, do not include this is $linecount
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
315 print OUT "#comment\t$nextline\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
316 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
317 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
318
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
319 $linecount++; #linecount does not include the comment line
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
320 $invalid = 0;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
321
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
322 my @nextline = split (/\s+/, $nextline);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
323 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
324 if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
325 $invalid++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
326 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
327 ($ref, $obs) = (uc $ref, uc $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
328 $zerostart and $start++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
329 $chr =~ s/^chr//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
330 if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
331 $invalid++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
332 } elsif ($ref eq '-' and $obs eq '-' #both are empty allele
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
333 or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
334 or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
335 or $start =~ m/[^\d]/ #start is not a number
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
336 or $end =~ m/[^\d]/ #end is not a number
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
337 or $start > $end #start is more than end
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
338 or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
339 or $ref eq '-' and $start != $end #length mismatch for insertion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
340 ) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
341 $invalid++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
342 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
343 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
344
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
345
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
346
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
347 if ($invalid) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
348 print INVALID $nextline, "\n"; #invalid record found
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
349 $invalidcount++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
350 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
351 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
352
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
353 my (%intronic, %utr5, %utr3, %exonic, %upstream, %downstream, %ncrna, %intergenic, %splicing);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
354 my $foundgenic; #variant found in genic region (between start and end position of a gene in genome)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
355 my ($distl, $distr, $genel, $gener); #for intergenic variant, the distance and gene name to the left and right side of gene
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
356 my $bin1 = int ($start/$genomebinsize)-1; #start bin
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
357 $bin1 < 0 and $bin1=0;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
358 my $bin2 = int ($end/$genomebinsize)+1; #end bin (usually same as start bin, unless the query is really big that spans multiple megabases)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
359
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
360 while (not exists $genedb->{$chr, $bin1} and $bin1 > int ($start/$genomebinsize)-$expandbin) { #examine at least 5 bins (by default 5Mb) to the left to make sure that a gene is found in the bin
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
361 $bin1 > 0 or last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
362 $bin1--;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
363 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
364
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
365 while (not exists $genedb->{$chr, $bin2} and $bin2 < int ($end/$genomebinsize)+$expandbin) { #examine at least 5 bins (by default 5Mb) to the right to make sure that a gene is found in the bin
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
366 $bin2++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
367 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
368
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
369 my (%seen);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
370 for my $nextbin ($bin1 .. $bin2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
371 exists $genedb->{$chr, $nextbin} or next; #this genome bin has no annotated gene (a complete intergenic region)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
372 for my $nextgene (@{$genedb->{$chr, $nextbin}}) { #when $genedb->{$chr, $nextbin} is undefined, this automatically create an array!!!
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
373 ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2) = @$nextgene;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
374 defined $name2 or printerr "WARNING: name2 field is not provided for transcript $name (start=$txstart end=$txend)\n" and $name2='';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
375 $seen{$name, $txstart} and next; #name and txstart uniquely identify a transcript and chromosome position (sometimes same transcript may map to two nearby positions, such as nearby segmental duplications)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
376 $seen{$name, $txstart}++; #a transcript may be in two adjacent bins, so if one is already scanned, there is no need to work on it again
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
377
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
378 if ($transfun) { #variant_function output contains transcript name, rather than gene name
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
379 $name2 = $name;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
380 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
381
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
382 if (not $foundgenic) { #this variant has not hit a genic region yet
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
383 if ($start > $txend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
384 defined $distl or $distl = $start-$txend and $genel=$name2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
385 $distl > $start-$txend and $distl = $start-$txend and $genel=$name2; #identify left closest gene
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
386 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
387
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
388 if ($end < $txstart) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
389 defined $distr or $distr = $txstart-$end and $gener=$name2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
390 $distr > $txstart-$end and $distr = $txstart-$end and $gener=$name2; #identify right closest gene
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
391 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
392 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
393
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
394 if ($end < $txstart) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
395 #query ---
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
396 #gene <-*----*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
397 $foundgenic and last; #if found a genic annotation already, end the search of the bins
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
398 if ($end > $txstart - $neargene) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
399 if ($dbstrand eq '+') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
400 $upstream{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
401 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
402 $downstream{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
403 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
404 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
405 last; #if transcript is too far away from end, end the search of the bins
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
406 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
407 } elsif ($start > $txend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
408 #query ---
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
409 #gene <-*----*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
410 if (not $foundgenic and $start < $txend + $neargene) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
411 if ($dbstrand eq '+') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
412 $downstream{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
413 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
414 $upstream{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
415 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
416 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
417 } elsif ($cdsstart == $cdsend+1) { #non-coding RNA (could be microRNA, or could be due to lack of CDS annotation for mRNA such as NR_026730 or BC039000). Previously we already did cdsstart++ so here the cdsstart is more than cdsend
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
418 if ($start >= $txstart and $start <= $txend or $end >= $txstart and $end <= $txend or $start <= $txstart and $end >= $txend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
419 $ncrna{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
420 $foundgenic++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
421 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
422 } else { #query overlaps with coding region of gene
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
423 my ($lenintron) = (0); #cumulative intron length at a given exon
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
424 my ($rcdsstart, $rvarstart, $rvarend); #start of coding and variant in reference mRNA sequence
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
425 my @exonstart = @$exonstart;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
426 my @exonend = @$exonend;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
427 my $foundexonic;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
428 if ($dbstrand eq '+') { #forward strand, search from left to right (first exon to last exon)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
429 for my $k (0 .. @exonstart-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
430 $k and $lenintron += ($exonstart[$k]-$exonend[$k-1]-1); #calculate cumulative intron length
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
431 if ($cdsstart >= $exonstart[$k]) { #calculate CDS start accurately by considering intron length
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
432 $rcdsstart = $cdsstart-$txstart-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
433 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
434
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
435 #splicing calculation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
436 if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
437 $splicing{$name2}++; #when query start site is close to exon start or exon end
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
438 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
439 if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
440 $splicing{$name2}++; #when query end site is close to exon start or exon end
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
441 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
442 if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
443 $splicing{$name2}++; #when query encompass the exon/intron boundary
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
444 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
445
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
446 if ($start < $exonstart[$k]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
447 if ($end >= $exonstart[$k]) { #exonic
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
448 $rvarstart = $exonstart[$k]-$txstart-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
449
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
450 for my $m ($k .. @exonstart-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
451 $m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
452 if ($end < $exonstart[$m]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
453 #query --------
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
454 #gene <--**---******---****---->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
455 $rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
456 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
457 } elsif ($end <= $exonend[$m]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
458 #query -----------
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
459 #gene <--**---******---****---->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
460 $rvarend = $end-$txstart-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
461 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
462 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
463 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
464 if (not defined $rvarend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
465 $rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
466 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
467
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
468 #here the trick begins to differentiate UTR versus coding exonic
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
469 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
470 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
471 #gene <--*---*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
472 $utr5{$name2}++; #positive strand for UTR5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
473 } elsif ($start > $cdsend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
474 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
475 #gene <--*---*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
476 $utr3{$name2}++; #positive strand for UTR3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
477 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
478 $exonic{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
479 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline]; #refseq CDS start, refseq variant start
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
480 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
481 $foundgenic++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
482 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
483 } elsif ($k and $start > $exonend[$k-1]) { #intronic
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
484 $intronic{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
485 $foundgenic++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
486 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
487 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
488 } elsif ($start <= $exonend[$k]) { #exonic
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
489 $rvarstart = $start-$txstart-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
490
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
491 for my $m ($k .. @exonstart-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
492 $m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
493 if ($end < $exonstart[$m]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
494 #query ------
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
495 #gene <--**---******---****---->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
496 $rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
497 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
498 } elsif ($end <= $exonend[$m]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
499 #query -----------
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
500 #gene <--**---******---****---->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
501 $rvarend = $end-$txstart-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
502 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
503 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
504 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
505 if (not defined $rvarend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
506 $rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
507 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
508
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
509 #here is the trick begins to differentiate UTR versus coding exonic
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
510 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
511 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
512 #gene <--*---*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
513 $utr5{$name2}++; #positive strand for UTR5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
514 } elsif ($start > $cdsend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
515 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
516 #gene <--*---*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
517 $utr3{$name2}++; #positive strand for UTR3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
518 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
519 $exonic{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
520 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $i, $k+1, $nextline]; #queryindex, refseq CDS start, refseq variant start
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
521 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
522 $foundgenic++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
523 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
524 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
525 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
526 } elsif ($dbstrand eq '-') { #process negative strand (in the future, this should be fused to the paragraph above for positive strands; for now, I keep them separate for easier debugging)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
527 for (my $k = @exonstart-1; $k>=0; $k--) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
528 $k < @exonstart-1 and $lenintron += ($exonstart[$k+1]-$exonend[$k]-1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
529 if ($cdsend <= $exonend[$k]) { #calculate CDS start accurately by considering intron length
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
530 $rcdsstart = $txend-$cdsend-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
531 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
532
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
533 #splicing calculation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
534 if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
535 $splicing{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
536 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
537 if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
538 $splicing{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
539 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
540 if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
541 $splicing{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
542 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
543
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
544 if ($end > $exonend[$k]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
545 if ($start <= $exonend[$k]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
546 $rvarstart = $txend-$exonend[$k]-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
547
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
548 for (my $m = $k; $m >= 0; $m--) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
549 $m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
550 if ($start > $exonend[$m]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
551 #query --------
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
552 #gene <--**---******---****---->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
553 #$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1); #commented out 2011feb18
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
554 $rvarend = $txend-$exonstart[$m+1]+1-$lenintron + ($exonstart[$m+1]-$exonend[$m]-1); #fixed this 2011feb18
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
555 last; #finsih the cycle!!!!!!!!!!!!!!!!!!!
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
556 } elsif ($start >= $exonstart[$m]) { #start within exons
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
557 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
558 #gene <--**---******---****---->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
559 $rvarend = $txend-$start-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
560 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
561 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
562 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
563 if (not defined $rvarend) { #if rvarend is not found, then the whole tail of gene is covered
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
564 $rvarend = $txend-$txstart-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
565 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
566
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
567 #here the trick begins to differentiate UTR versus coding exonic
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
568 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
569 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
570 #gene <--*---*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
571 $utr3{$name2}++; #negative strand for UTR5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
572 } elsif ($start > $cdsend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
573 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
574 #gene <--*---*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
575 $utr5{$name2}++; #negative strand for UTR3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
576 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
577 $exonic{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
578 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
579 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
580 $foundgenic++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
581 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
582 } elsif ($k < @exonstart-1 and $end < $exonstart[$k+1]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
583 $intronic{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
584 $foundgenic++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
585 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
586 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
587 } elsif ($end >= $exonstart[$k]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
588 $rvarstart = $txend-$end-$lenintron+1; #all the rvarstart, rvarend are with respect to the cDNA sequence (so rvarstart corresponds to end of variants)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
589
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
590 for (my $m = $k; $m >= 0; $m--) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
591 $m < $k and $lenintron += ($exonstart[$m+1]-$exonend[$m]-1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
592 if ($start > $exonend[$m]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
593 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
594 #gene <--**---******---****---->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
595 #$rvarend = $txend-$exonstart[$m]-$lenintron+1 - ($exonstart[$m+1]-$exonend[$m]-1); #commented out 2011feb18 due to bug (10 42244567 42244600 CACCTTTGCTTGATATGATAATATAGTGCCAAGG - hetero)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
596 $rvarend = $txend-$exonstart[$m+1]+1 - $lenintron + ($exonstart[$m+1]-$exonend[$m]-1); #fixed this 2011feb18
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
597 last; #finish the circle of counting exons!!!!!
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
598 } elsif ($start >= $exonstart[$m]) { #the start is right located within exon
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
599 #query -------
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
600 #gene <--**---******---****---->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
601 $rvarend = $txend-$start-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
602 last; #finish the cycle
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
603 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
604 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
605 if (not defined $rvarend) { #if rvarend is not found, then the whole tail of gene is covered
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
606 $rvarend = $txend-$txstart-$lenintron+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
607 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
608
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
609 #here the trick begins to differentiate UTR versus coding exonic
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
610 if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
611 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
612 #gene <--*---*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
613 $utr3{$name2}++; #negative strand for UTR5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
614 } elsif ($start > $cdsend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
615 #query ----
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
616 #gene <--*---*->
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
617 $utr5{$name2}++; #negative strand for UTR3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
618 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
619 $exonic{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
620 $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '-', $i, @exonstart-$k, $nextline];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
621 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
622 $foundgenic++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
623 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
624 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
625 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
626 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
627 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
628 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
629 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
630 $foundgenic or $intergenic{$name2}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
631 $i =~ m/000000$/ and printerr "NOTICE: Finished analyzing $i query variants\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
632
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
633
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
634 my (@txname, %genename);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
635
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
636 if ($separate) { #separately print out each effect on one line
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
637 if (%exonic or %splicing or %intronic or %utr5 or %utr3 or %ncrna or %upstream or %downstream) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
638 %exonic and print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
639 %splicing and $end-$start+1<=$splicing_threshold and print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
640 %intronic and print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
641 %utr5 and print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
642 %utr3 and print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
643 %ncrna and print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
644 %upstream and print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
645 %downstream and print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
646 } elsif (%intergenic) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
647 $genel ||= "NONE";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
648 $gener ||= "NONE";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
649 $distl ||= "NONE";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
650 $distr ||= "NONE";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
651 print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
652 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
653 die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
654 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
655 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
656 if (@precedence) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
657 my $foundmatch;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
658 for my $i (0 .. @precedence-2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
659 $precedence[$i] eq 'exonic' and %exonic and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
660 $precedence[$i] eq 'splicing' and %splicing and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
661 $precedence[$i] eq 'intronic' and %intronic and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
662 $precedence[$i] eq 'utr5' and %utr5 and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
663 $precedence[$i] eq 'utr3' and %utr3 and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
664 $precedence[$i] eq 'ncrna' and %ncrna and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
665 $precedence[$i] eq 'upstream' and %upstream and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
666 $precedence[$i] eq 'downstream' and %downstream and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
667 $precedence[$i] eq 'intergenic' and %intergenic and $foundmatch++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
668 if ($foundmatch) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
669 for my $j ($i+1 .. @precedence-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
670 $precedence[$j] eq 'exonic' and %exonic = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
671 $precedence[$j] eq 'splicing' and %splicing = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
672 $precedence[$j] eq 'intronic' and %intronic = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
673 $precedence[$j] eq 'utr5' and %utr5 = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
674 $precedence[$j] eq 'utr3' and %utr3 = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
675 $precedence[$j] eq 'ncrna' and %ncrna = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
676 $precedence[$j] eq 'upstream' and %upstream = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
677 $precedence[$j] eq 'downstream' and %downstream = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
678 $precedence[$j] eq 'intergenic' and %intergenic = ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
679 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
680 last;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
681 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
682 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
683 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
684
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
685
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
686 if (%exonic) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
687 if (%splicing and $end-$start+1<=$splicing_threshold) { #a big deletion spanning splicing site is not really a "splicing" mutation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
688 print OUT "exonic;splicing\t", join(",", sort keys %exonic), ";", join (",", sort keys %splicing), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
689 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
690 print OUT "exonic\t", join(",", sort keys %exonic), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
691 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
692 } elsif (%splicing) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
693 print OUT "splicing\t", join (",", sort keys %splicing), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
694 } elsif (%ncrna) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
695 print OUT "ncRNA\t", join(",", sort keys %ncrna), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
696 } elsif (%utr5 or %utr3) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
697 if (%utr5 and %utr3) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
698 print OUT "UTR5;UTR3\t", join(",", sort keys %utr5), ";", join(",", sort keys %utr3), "\t", $nextline, "\n"; #use ";" to separate UTR5 and UTR3 genes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
699 } elsif (%utr5) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
700 print OUT "UTR5\t", join(",", sort keys %utr5), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
701 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
702 print OUT "UTR3\t", join(",", sort keys %utr3), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
703 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
704 } elsif (%intronic) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
705 print OUT "intronic\t", join(",", sort keys %intronic), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
706 } elsif (%upstream or %downstream) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
707 if (%upstream and %downstream) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
708 print OUT "upstream;downstream\t", join(",", sort keys %upstream), ";", join(",", sort keys %downstream), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
709 } elsif (%upstream) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
710 print OUT "upstream\t", join(",", sort keys %upstream), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
711 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
712 print OUT "downstream\t", join(",", sort keys %downstream), "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
713 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
714 } elsif (%intergenic) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
715 $genel ||= "NONE";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
716 $gener ||= "NONE";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
717 $distl ||= "NONE";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
718 $distr ||= "NONE";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
719 print OUT "intergenic\t", "$genel(dist=$distl),$gener(dist=$distr)", "\t", $nextline, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
720 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
721 die "FATAL ERROR: please report bug to ANNOVAR author with your input file\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
722 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
723 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
724 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
725 %refseqvar and annotateExonicVariants (\%refseqvar, $geneidmap, $cdslen, $mrnalen);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
726
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
727 return ($linecount, $invalidcount);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
728 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
729
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
730 sub annotateExonicVariants {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
731 my ($refseqvar, $geneidmap, $cdslen, $mrnalen) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
732 my $refseqhash;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
733 my $function = {};
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
734 my %varinfo; #variants information (same as input line)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
735
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
736 $refseqhash = readSeqFromFASTADB ($refseqvar);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
737
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
738 for my $seqid (keys %$refseqvar) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
739 for my $i (0 .. @{$refseqvar->{$seqid}}-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
740 my ($refcdsstart, $refvarstart, $refvarend, $refstrand, $index, $exonpos, $nextline) = @{$refseqvar->{$seqid}->[$i]};
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
741 my ($wtnt3, $wtnt3_after, @wtnt3, $varnt3, $wtaa, $wtaa_after, $varaa, $varpos); #wtaa_after is the aa after the wtaa
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
742 my ($chr, $start, $end, $ref, $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
743
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
744 my @nextline = split (/\s+/, $nextline);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
745 ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
746 ($ref, $obs) = (uc $ref, uc $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
747 $zerostart and $start++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
748 $chr =~ s/^chr//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
749
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
750 $varinfo{$index} = $nextline;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
751
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
752 if (not $refseqhash->{$seqid}) { #this refseq does not have a FASTA sequence, so it cannot be interrogated
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
753 $function->{$index}{unknown} = "UNKNOWN";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
754 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
755 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
756
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
757 my $fs = (($refvarstart-$refcdsstart) % 3);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
758 if ($refvarstart-$fs-1 > length($refseqhash->{$seqid})) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
759 printerr "WARNING: Potential database annotation error seqid=$seqid, refvarstart=$refvarstart, fs=$fs, seqlength=", length($refseqhash->{$seqid}), " refcdsstart=$refcdsstart, with inputline=$nextline\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
760 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
761 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
762
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
763 $wtnt3 = substr ($refseqhash->{$seqid}, $refvarstart-$fs-1, 3);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
764 if (length ($refseqhash->{$seqid}) >= $refvarstart-$fs+3) { #going into UTR
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
765 $wtnt3_after = substr ($refseqhash->{$seqid}, $refvarstart-$fs+2, 3);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
766 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
767 $wtnt3_after = ''; #last amino acid in the sequence without UTR (extremely rare situation) (example: 17 53588444 53588444 - T 414 hetero)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
768 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
769 @wtnt3 = split (//, $wtnt3);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
770 if (@wtnt3 != 3 and $refvarstart-$fs-1>=0) { #some times there are database annotation errors (example: chr17:3,141,674-3,141,683), so the last coding frame is not complete and as a result, the cDNA sequence is not complete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
771 $function->{$index}{unknown} = "UNKNOWN";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
772 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
773 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
774
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
775 if ($refstrand eq '-') { #change the observed nucleotide to the reverse strand
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
776 $obs = revcom ($obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
777 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
778
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
779 if ($start == $end) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
780 if ($ref eq '-') { #insertion variant
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
781 #the insertion coordinate system in ANNOVAR always uses "position after the current site"
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
782 #in positive strand, this is okay
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
783 #in negative strand, the "after current site" becomes "before current site" during transcription
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
784 #therefore, appropriate handling is necessary to take this into account
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
785 #for example, for a trinucleotide GCC with frameshift of 1 and insertion of CCT
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
786 #in positive strand, it is G-CTT-CC
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
787 #but if the transcript is in negative strand, the genomic sequence should be GC-CCT-C, and transcript is G-AGG-GC
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
788 if ($refstrand eq '+') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
789 if ($fs == 1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
790 $varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
791 } elsif ($fs == 2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
792 $varnt3 = $wtnt3[0] . $wtnt3[1] . $wtnt3[2] . $obs;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
793 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
794 $varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
795 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
796 } elsif ($refstrand eq '-') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
797 if ($fs == 1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
798 $varnt3 = $wtnt3[0] . $obs . $wtnt3[1] . $wtnt3[2];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
799 } elsif ($fs == 2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
800 $varnt3 = $wtnt3[0] . $wtnt3[1] . $obs . $wtnt3[2];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
801 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
802 $varnt3 = $obs . $wtnt3[0] . $wtnt3[1] . $wtnt3[2];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
803 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
804 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
805 ($wtaa, $wtaa_after, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($wtnt3_after), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
806 $wtaa_after and $wtaa_after eq '*' and $wtaa_after = 'X'; #wtaa_after could be undefined, if the current aa is the stop codon (X) (example: 17 53588444 53588444 - T)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
807
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
808 my $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarstart-$refcdsstart+2) . "ins$obs"; #cDNA level annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
809 if (length ($obs) % 3 == 0) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
810 if ($wtaa eq '*') { #mutation on stop codon
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
811 if ($varaa =~ m/\*/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
812 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
813 $function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; #stop codon is still present
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
814 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
815 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,"; #stop codon is lost
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
816 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
817 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
818 if ($varaa =~ m/\*/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
819 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
820 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
821 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
822 $function->{$index}{nfsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "delins$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
823 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
824 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
825 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
826 if ($wtaa eq '*') { #mutation on stop codon
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
827 if ($varaa =~ m/\*/) { #in reality, this cannot be differentiated from non-frameshift insertion, but we'll still call it frameshift
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
828 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
829 $function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
830 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
831 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "delins$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
832 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
833 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
834 if ($varaa =~ m/\*/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
835 $varaa =~ s/\*.*/X/; #delete all aa after stop codon, but keep the aa before
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
836 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "_$wtaa_after" . ($varpos+1) . "delins$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
837 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
838 $function->{$index}{fsins} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
839 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
840 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
841 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
842 } elsif ($obs eq '-') { #single nucleotide deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
843 my $deletent;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
844 if ($fs == 1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
845 $deletent = $wtnt3[1];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
846 $varnt3 = $wtnt3[0].$wtnt3[2].$wtnt3_after;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
847 } elsif ($fs == 2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
848 $deletent = $wtnt3[2];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
849 $varnt3 = $wtnt3[0].$wtnt3[1].$wtnt3_after;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
850 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
851 $deletent = $wtnt3[0];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
852 $varnt3 = $wtnt3[1].$wtnt3[2].$wtnt3_after;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
853 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
854 ($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
855 my $canno = "c." . ($refvarstart-$refcdsstart+1) . "del$deletent";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
856 if ($wtaa eq '*') { #mutation on stop codon
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
857 if ($varaa =~ m/\*/) { #stop codon is still stop codon
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
858 $function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "X,"; #changed fsdel to nfsdel on 2011feb19
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
859 } else { #stop codon is lost
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
860 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos" . "$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
861 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
862 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
863 if ($varaa =~ m/\*/) { #new stop codon created
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
864 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "X,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
865 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
866 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos" . "fs,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
867 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
868 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
869 } elsif (length ($obs) > 1) { #block substitution (since start==end, this changed from 1nt to several nt)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
870 if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
871 $function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
872 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
873 $function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "delins$obs,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
874 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
875 } else { #single nucleotide substitution variant
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
876 my $canno;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
877 if ($fs == 1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
878 $varnt3 = $wtnt3[0] . $obs . $wtnt3[2];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
879 $canno = "c.$wtnt3[1]" . ($refvarstart-$refcdsstart+1) . $obs;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
880 } elsif ($fs == 2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
881 $varnt3 = $wtnt3[0] . $wtnt3[1]. $obs;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
882 $canno = "c.$wtnt3[2]" . ($refvarstart-$refcdsstart+1) . $obs;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
883 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
884 $varnt3 = $obs . $wtnt3[1] . $wtnt3[2];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
885 $canno = "c.$wtnt3[0]" . ($refvarstart-$refcdsstart+1) . $obs;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
886 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
887 ($wtaa, $varaa, $varpos) = (translateDNA ($wtnt3), translateDNA ($varnt3), int(($refvarstart-$refcdsstart)/3)+1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
888
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
889 if ($wtaa eq $varaa) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
890 $wtaa eq '*' and ($wtaa, $varaa) = qw/X X/; #change * to X in the output
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
891 $function->{$index}{ssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
892 } elsif ($varaa eq '*') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
893 $function->{$index}{stopgain} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa${varpos}X,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
894 } elsif ($wtaa eq '*') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
895 $function->{$index}{stoploss} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.X$varpos$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
896 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
897 $function->{$index}{nssnv} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.$wtaa$varpos$varaa,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
898 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
899 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
900 } elsif ($obs eq '-') { #deletion variant involving several nucleotides
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
901 ($wtaa, $varpos) = (translateDNA ($wtnt3), int(($refvarstart-$refcdsstart)/3)+1); #wildtype amino acid, position of amino acid
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
902 my ($varposend, $canno); #the position of the last amino acid in the deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
903 if ($refvarstart<=$refcdsstart) { #since the first amino acid is deleted, the whole gene is considered deleted
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
904 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:wholegene,"; #it is exonic variant, so the varend has to hit the first exon
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
905 } elsif ($refvarend >= $cdslen->{$seqid}+$refcdsstart) { #3' portion of the gene is deleted
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
906 $varposend = int ($cdslen->{$seqid}/3); #cdslen should be multiples of 3, but just in case of database mis-annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
907 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($cdslen->{$seqid}+$refcdsstart-1) . "del";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
908 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
909 } elsif (($refvarend-$refvarstart+1) % 3 == 0) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
910 $varposend = int (($refvarend-$refcdsstart)/3) + 1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
911 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
912 $function->{$index}{nfsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
913 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
914 $varposend = int (($refvarend-$refcdsstart)/3) + 1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
915 $canno = "c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "del";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
916 $function->{$index}{fsdel} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:$canno:p.${varpos}_${varposend}del,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
917 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
918 } else { #block substitution event
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
919 if (($refvarend-$refvarstart+1-length($obs)) % 3 == 0) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
920 $function->{$index}{nfssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
921 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
922 $function->{$index}{fssub} .= "$geneidmap->{$seqid}:$seqid:exon$exonpos:c." . ($refvarstart-$refcdsstart+1) . "_" . ($refvarend-$refcdsstart+1) . "$obs,";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
923 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
924 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
925 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
926 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
927
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
928 for my $index (sort {$a<=>$b} keys %$function) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
929 if ($separate) { #print out each type of exonic mutations separately (one effect in one line), rather than printing out only the most important function
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
930 if ($function->{$index}{fsins}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
931 print EXONIC "line$index\t", "frameshift insertion\t$function->{$index}{fsins}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
932 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
933 if ($function->{$index}{fsdel}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
934 print EXONIC "line$index\t", "frameshift deletion\t$function->{$index}{fsdel}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
935 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
936 if ($function->{$index}{fssub}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
937 print EXONIC "line$index\t", "frameshift substitution\t$function->{$index}{fssub}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
938 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
939 if ($function->{$index}{stopgain}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
940 print EXONIC "line$index\t", "stopgain SNV\t$function->{$index}{stopgain}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
941 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
942 if ($function->{$index}{stoploss}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
943 print EXONIC "line$index\t", "stoploss SNV\t$function->{$index}{stoploss}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
944 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
945 if ($function->{$index}{nfsins}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
946 print EXONIC "line$index\t", "nonframeshift insertion\t$function->{$index}{nfsins}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
947 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
948 if ($function->{$index}{nfsdel}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
949 print EXONIC "line$index\t", "nonframeshift deletion\t$function->{$index}{nfsdel}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
950 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
951 if ($function->{$index}{nfssub}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
952 print EXONIC "line$index\t", "nonframeshift substitution\t$function->{$index}{nfssub}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
953 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
954 if ($function->{$index}{nssnv}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
955 print EXONIC "line$index\t", "nonsynonymous SNV\t$function->{$index}{nssnv}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
956 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
957 if ($function->{$index}{ssnv}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
958 print EXONIC "line$index\t", "synonymous SNV\t$function->{$index}{ssnv}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
959 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
960 if ($function->{$index}{unknown}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
961 print EXONIC "line$index\t", "unknown\t$function->{$index}{unknown}\t", $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
962 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
963 } else { #print out only the most important functional changes (for example, chr3:9931279-9931279 G->A can be both non-synonymous and synonymous mutations based on UCSC gene model)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
964 print EXONIC "line$index\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
965 my $sortout;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
966 if ($sortout = $function->{$index}{fsins}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
967 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
968 print EXONIC "frameshift insertion\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
969 } elsif ($sortout = $function->{$index}{fsdel}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
970 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
971 print EXONIC "frameshift deletion\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
972 } elsif ($sortout = $function->{$index}{fssub}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
973 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
974 print EXONIC "frameshift substitution\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
975 } elsif ($sortout = $function->{$index}{stopgain}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
976 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
977 print EXONIC "stopgain SNV\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
978 } elsif ($sortout = $function->{$index}{stoploss}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
979 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
980 print EXONIC "stoploss SNV\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
981 } elsif ($sortout = $function->{$index}{nfsins}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
982 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
983 print EXONIC "nonframeshift insertion\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
984 } elsif ($sortout = $function->{$index}{nfsdel}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
985 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
986 print EXONIC "nonframeshift deletion\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
987 } elsif ($sortout = $function->{$index}{nfssub}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
988 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
989 print EXONIC "nonframeshift substitution\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
990 } elsif ($sortout = $function->{$index}{nssnv}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
991 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
992 print EXONIC "nonsynonymous SNV\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
993 } elsif ($sortout = $function->{$index}{ssnv}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
994 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
995 print EXONIC "synonymous SNV\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
996 } elsif ($sortout = $function->{$index}{unknown}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
997 $exonsort and $sortout = sortExonicAnnotation ($sortout);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
998 print EXONIC "unknown\t$sortout\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
999 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1000 print EXONIC $varinfo{$index}, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1001 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1002 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1003 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1004
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
#Sort a comma-separated string of exonic annotations.
#
#Argument:
#	$anno	annotation string; each comma-separated entry is colon-delimited, with
#		the transcript name in the 2nd field and "exonN" in the 3rd field
#		(for example "GENE:NM_0001:exon3:c.A5G:p.K2R"). Whole-gene deletions
#		are written as "GENE:NM_0001:wholegene" and have no exon number.
#Returns:
#	the same entries joined by ',' and ordered first by exon number (numeric),
#	then by transcript name (string). A trailing comma in the input is not
#	reproduced, because split() drops trailing empty fields.
#
#Entries without a numeric exon field ("wholegene", or entries with fewer than
#three fields) get exon number 0 and therefore sort first. This is the value the
#numeric comparison assigned them before, but it no longer triggers "isn't
#numeric" or "uninitialized value" warnings under 'use warnings'.
sub sortExonicAnnotation {
	my ($anno) = @_;
	defined $anno or return '';

	my @keyed;		#each element is [original entry, exon number, transcript name]
	for my $entry (split (/,/, $anno)) {
		my (undef, $transcript, $exonfield) = split (/:/, $entry);
		my $exonnum = (defined $exonfield and $exonfield =~ m/^(?:exon)?(\d+)/) ? $1 : 0;
		push @keyed, [$entry, $exonnum, defined $transcript ? $transcript : ''];
	}

	#first sort by exon number, then by transcript name
	my @sorted = sort {$a->[1] <=> $b->[1] or $a->[2] cmp $b->[2]} @keyed;
	return join (',', map {$_->[0]} @sorted);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1018
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
#Read the query file, validate every variant line and pass the variants, batch by
#batch, to filterNextBatch().
#
#Takes no arguments; it works on the file-level settings ($outfile, $buildver,
#$dbtype1, $queryfile, $batchsize, $comment, $zerostart, $memfree, $memtotal,
#$verbose, @avcolumn).
#
#Files written:
#	$outfile.${buildver}_${dbtype1}_filtered	opened as FIL
#	$outfile.${buildver}_${dbtype1}_dropped		opened as DROPPED
#	$outfile.invalid_input				opened as INVALID; removed again when no invalid line was seen
#
#A batch is handed to filterNextBatch() when $batchsize lines have been read,
#when the memory checks ask for it, or at end of file.
sub filterQuery {
	my $filteredfile = "$outfile.${buildver}_${dbtype1}_filtered";
	my $droppedfile = "$outfile.${buildver}_${dbtype1}_dropped";
	my $invalidfile = "$outfile.invalid_input";

	#FIL and DROPPED are kept as package-level bareword handles: this sub only writes comment lines to them,
	#so the variant records are presumably written by filterNextBatch() through the same handles (TODO confirm)
	open (FIL, '>', $filteredfile) or die "Error: cannot write to output file $filteredfile: $!\n";
	open (DROPPED, '>', $droppedfile) or die "Error: cannot write to output file $droppedfile: $!\n";
	open (INVALID, '>', $invalidfile) or die "Error: cannot write to output file $invalidfile: $!\n";

	printerr "NOTICE: Variants matching filtering criteria are written to $droppedfile, other variants are written to $filteredfile\n";

	#NOTE(review): 2-argument open is kept on purpose so that any special file name accepted today keeps working;
	#switch to open (QUERY, '<', $queryfile) if only plain file names need to be supported
	open (QUERY, $queryfile) or die "Error: cannot read from query file $queryfile: $!\n";

	my (%variant, $filedone, $batchdone);		#%variant: key is (chr, start, encoded obs), value is "ref\nline1\nline2..."
	my ($linecount, $batchlinecount, $invalid, $invalidcount) = (0, 0);
	my ($chr, $start, $end, $ref, $obs);
	while (1) {
		my $line = <QUERY>;
		if (not defined $line) {
			$filedone++;
		} else {
			$line =~ s/[\r\n]+$//;

			if ($line =~ m/^#/ and $comment) {		#comment line start with #, do not include this in $linecount
				print FIL "$line\n";
				print DROPPED "#comment\t#comment\t$line\n";
				next;
			}

			$linecount++;
			$batchlinecount++;
			if ($batchlinecount == $batchsize) {
				$batchdone++;
			}

			if ($memfree or $memtotal) {		#if these arguments are specified
				if ($linecount =~ m/00000$/) {		#about 40Mb memory per 10k lines for a typical input dataset
					my ($availmem, $allmem) = currentAvailMemory();
					$verbose and printerr "NOTICE: Current available system memory is $availmem kb (this program uses $allmem bytes memory), after reading $linecount query\n";
					if ($availmem and $availmem <= $memfree+50_000) {		#some subsequent steps may take ~50Mb memory, so here we try to allocate some more memory
						$batchdone++;
					}
					if ($memtotal and $allmem >= $memtotal-50_000) {	#when --memtotal is specified, ensure that program use less memory
						$batchdone++;
					}
				}
			}

			$invalid = 0;		#reset invalid status

			my @nextline = split (/\s+/, $line);
			($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
			if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
				$invalid++;
			} else {
				($ref, $obs) = (uc $ref, uc $obs);
				$chr =~ s/^chr//;
				if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
					$invalid++;
				} else {
					#the zero-start adjustment is applied only after $start is known to consist of digits;
					#incrementing first would numify a malformed value such as "9z" to 10 and let it pass the check above
					$zerostart and $start++;
					if ($ref eq '-' and $obs eq '-' 		#both are empty allele
						or $ref =~ m/[^ACTG0\-]/ 		#non-standard nucleotide code
						or $obs =~ m/[^ACGT0\-]/ 		#non-standard nucleotide code
						or $start > $end			#start is more than end
						or $ref ne '0' and $end-$start+1 != length ($ref)	#length mismatch with ref
						or $ref eq '-' and $start != $end	#length mismatch for insertion
						) {
						$invalid++;
					}
				}
			}

			if ($invalid) {
				print INVALID $line, "\n";	#invalid record found
				$invalidcount++;
				next;
			}

			#encode the observed allele so that insertions, deletions and block substitutions get distinct hash keys
			if ($start == $end and $ref eq '-') {	#insertion
				$obs = "0$obs";
			} elsif ($obs eq '-') {			#deletion
				$obs = $end-$start+1;
			} elsif ($end>$start or $start==$end and length($obs)>1) {	#block substitution	#fixed the bug here 2011feb19
				$obs = ($end-$start+1) . $obs;
			}

			if (exists $variant{$chr, $start, $obs}) {
				$variant{$chr, $start, $obs} .= "\n$line";
			} else {
				$variant{$chr, $start, $obs} = "$ref\n$line";
			}
		}

		if ($filedone or $batchdone) {
			printerr "NOTICE: Processing next batch with ${\(scalar keys %variant)} unique variants in $batchlinecount input lines\n";
			filterNextBatch (\%variant);
			%variant = ();
			$batchlinecount = 0;			#reset the line count for this batch
			$batchdone = 0;
		}
		if ($filedone) {
			last;
		}
	}

	close (QUERY);
	#buffered write errors (for example a full disk) only surface at close, so report them instead of ignoring them
	close (INVALID) or printerr "WARNING: cannot close output file $invalidfile: $!\n";
	close (DROPPED) or printerr "WARNING: cannot close output file $droppedfile: $!\n";
	close (FIL) or printerr "WARNING: cannot close output file $filteredfile: $!\n";

	if ($invalidcount) {
		printerr "NOTICE: Variants with invalid input format were written to $outfile.invalid_input\n";
	} else {
		unlink ($invalidfile);
	}
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1127
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1128 sub filterNextBatch {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1129 my ($variant) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1130 my $dbfile;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1131
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1132 if ($dbtype1 eq 'generic') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1133 $dbfile = File::Spec->catfile ($dbloc, $genericdbfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1134 } elsif ($dbtype1 eq 'vcf') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1135 $dbfile = File::Spec->catfile ($dbloc, $vcfdbfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1136 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1137 $dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1138 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1139
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1140 open (DB, $dbfile) or die "Error: cannot read from input database file $dbfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1141 printerr "NOTICE: Scanning filter database $dbfile...";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1142
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1143 my (@record, $chr, $start, $end, $ref, $obs, $score, $qual, $fil, $info);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1144 my ($rsid, $strand, $ucscallele, $twoallele, $class, $af, $attribute);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1145 my $count_invalid_dbline;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1146 while (<DB>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1147 my (@obs2, @score2); #for 1000G2010 data set in VCF format, some tri-allelic SNPs are present; in the future, some quad-allelic SNPs may be also present in VCF files
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1148 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1149 m/\S/ or next; #skip empty lines in the database file (sometimes this occurs)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1150 m/^#/ and next; #skip the comment line
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1151 if ($dbtype eq 'avsift') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1152 @record = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1153 @record == 8 or die "Error: invalid record found in DB file $dbfile (8 tab-delimited fields expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1154 ($chr, $start, $end, $ref, $obs, $score) = @record;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1155 if ($chromosome) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1156 $valichr{$chr} or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1157 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1158 if ($score < $sift_threshold) { #this is a deleterious mutation, skip it (equal sign should not be used, otherwise the score=0 will be skipped)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1159 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1160 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1161 } elsif ($dbtype =~ m/^ljb_/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1162 @record = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1163 @record >= 5 or die "Error: invalid record found in DB file $dbfile (at least 5 tab-delimited fields expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1164 ($chr, $start, $end, $ref, $obs, $score) = @record;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1165 if ($chromosome) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1166 $valichr{$chr} or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1167 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1168 if (defined $score and defined $score_threshold and $score < $score_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1169 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1170 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1171 } elsif ($dbtype =~ m/^snp\d+/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1172 @record = split (/\t/, $_, -1); #-1 is required before some dbSNP records have many empty tab fields in the end
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1173 @record == 18 or @record == 26 or die "Error: invalid record found in dbSNP database file $dbfile (18 or 26 fields expected but found ${\(scalar @record)}): <$_>\n" . join("\n",@record);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1174 $record[1] =~ s/^chr// or die "Error: invalid record found in DB file (2nd field should start with 'chr'): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1175 ($chr, $start, $end, $rsid, $strand, $ucscallele, $twoallele, $class) = @record[1,2,3,4,6,8,9,11];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1176 $start++; #UCSC use zero-start system
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1177 if ($chromosome) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1178 $valichr{$chr} or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1179 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1180 unless ($class eq 'single' or $class eq 'deletion' or $class eq 'in-del' or $class eq 'insertion') { #enum('unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion')
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1181 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1182 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1183
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1184 my @allele = split (/\//, $twoallele);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1185
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1186 #before Jan 2011, only di-allelic SNPs are handled in ANNOVAR
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1187 #@allele == 2 or next; #many entries have no allele information (for example, rs71010435)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1188 #in Jan 2011 version, I decided to handle tri-allelic and quad-allelic SNP as well
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1189
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1190 @allele >= 2 or next; #Jan 2011 modification
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1191 if ($strand eq '-') { #handle reverse strand annotation (the vast majority of records in dbSNP should be already in + strand)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1192 for my $i (0 .. @allele-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1193 $allele[$i] = revcom ($allele[$i]);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1194 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1195 #$ucscallele = revcom ($ucscallele); #added Jan 24, 2011 (per Kevin Ha) removed Feb 10, 2011 (per Eric Stawiski)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1196 #note that some SNPs (e.g., rs28434453) may have multiple location in diferent chromosome or strand; I may want to handle this by a special flag in the future
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1197 #585 chr1 13301 13302 rs28434453 0 - C C C/T genomic single etc...
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1198 #1367 chr15 102517867 102517868 rs28434453 0 + G G C/T genomic single etc...
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1199 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1200
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1201 #in-del is usually annotated below, so they require special treatment
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1202 #587 chr1 384538 384539 rs3971283 0 + T T -/ATT genomic in-del unknown 0 0 unknown exact 3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1203 if ($class eq 'in-del') { #indel are usually annotated as -/xxx, where xxx is the alternative allele
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1204 $obs = length ($ucscallele) . $allele[1]; #prefix a number before the alleles, indicating block substitution
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1205 defined $allele[1] or die "no allele 1 <$_>";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1206 } elsif ($class eq 'insertion') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1207 $start--;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1208 $obs = "0$allele[1]";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1209 } elsif ($class eq 'deletion') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1210 $obs = length ($ucscallele);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1211 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1212 for my $i (0 .. @allele-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1213 if ($ucscallele eq $allele[$i]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1214 @obs2 = @allele;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1215 splice (@obs2, $i, 1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1216 for my $j (0 .. @obs2-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1217 push @score2, $rsid;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1218 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1219 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1220 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1221 if (@obs2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1222 $obs = shift @obs2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1223 $score = shift @score2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1224 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1225 $verbose and printerr ("Database error: wildtype base $ucscallele is not part of the allele description in <$_>\n");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1226 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1227 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1228 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1229 $score = $rsid;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1230 } elsif ($dbtype =~ m/^1000g_(\w+)/ or $dbtype =~ m/^1000g2010_(\w+)/ or $dbtype =~ m/^1000g2010\w\w\w_(\w+)/) { #dbtype1 should NOT be used here
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1231 @record = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1232 @record == 5 or @record == 6 or die "Error: invalid record found in 1000G database file $dbfile (5 or 6 fields expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1233 ($chr, $start, $ref, $obs, $af) = @record; #there is no "END" in 1000G input file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1234 if ($chromosome) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1235 $valichr{$chr} or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1236 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1237 if ($maf_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1238 if ($af > 0.5) { #the frequency is the non-reference allele frequency, which could exceed 0.5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1239 1-$af >= $maf_threshold or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1240 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1241 $af >= $maf_threshold or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1242 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1243 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1244 $score = $af;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1245 } elsif ($dbtype eq 'generic') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1246 ($chr, $start, $end, $ref, $obs, $score) = split (/\t/, uc $_); #make sure to use upper case, as query is always in upper case
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1247 defined $obs or die "Error: the generic database file must contains at least five tab-delimited fields per line (but observed line: $_)\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1248 defined $score or $score = "NA";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1249 if ($chromosome) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1250 $valichr{$chr} or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1251 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1252 defined $obs or die "Error: invalid record found in DB file $dbfile (at least 5 fields expected for 'generic' dbtype): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1253 if ($start == $end and $ref eq '-') { #insertion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1254 $obs = "0$obs";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1255 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1256 if ($obs eq '-') { #deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1257 $obs = $end-$start+1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1258 } elsif ($start != $end) { #block substitution
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1259 $obs = ($end-$start+1) . $obs;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1260 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1261 if (defined $score and defined $score_threshold and $score < $score_threshold) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1262 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1263 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1264 } elsif ($dbtype eq 'vcf') { #vcf file is adopted by 1000 Genomes Project; it can describe both SNPs and indels, and it may contain both summary level statistics and individual level genotype calls
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1265 ($chr, $start, $rsid, $ref, $obs, $qual, $fil, $info) = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1266 if ($chromosome) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1267 $valichr{$chr} or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1268 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1269
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1270 my ($ac, $an);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1271
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1272 if ($info =~ m/AF=([^;]+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1273 $score = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1274 if ($obs =~ m/(\w),(\w)/) { #1000G November; this format is not really valid because it does not handle tri-allelic SNP
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1275 ($obs, @obs2) = ($1, $2);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1276 @score2 = ($score);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1277 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1278 } elsif ($info =~ m/AC=(\S+?);AN=(\d+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1279 my ($alleles, $count) = ($1, $2);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1280 if ($alleles =~ m/^(\d+),(.+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1281 $score = sprintf ("%.3f", $1/$count);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1282 @score2 = split (/,/, $2);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1283 @score2 = map {sprintf("%.3f", $_/$count)} @score2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1284 ($obs, @obs2) = split (/,/, $obs); #the obs is composed of two alleles
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1285 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1286 $af = sprintf ("%.3f", $alleles/$count);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1287 $score = $af;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1288 #this is an invalid record in 1000GJuly: 1 2266231 rs11589451 C T,A . PASS AA=c;AC=20;AN=120;DP=237
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1289 if ($obs =~ m/(\w),/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1290 $count_invalid_dbline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1291 $verbose and printerr "WARNING: Invalid input line found in $dbfile (more than one alleles are observed, but only one is annotated with allelic counts): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1292 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1293 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1294 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1295 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1296 printerr "WARNING: the VCF file does not contain allele frequency information. ANNOVAR cannot process this file\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1297 exit;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1298 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1299
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1300 if (length ($ref) == 1 and length ($obs) == 1) {#single base substitution
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1301 1; #the obs and obs2 is already handled
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1302 } elsif ($obs =~ m/^\-((\w)(\w*))$/) { #deletion (1000G March)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1303 $2 eq $ref or $ref eq 'N' or die "Error: mismatch of deleted allele and reference allele: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1304 $obs = length ($1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1305 } elsif ($obs =~ m/^\+(\w+)$/) { #insertion (1000G March)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1306 $obs = "0$1";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1307 } elsif ($ref =~ m/^[ACGTN]+$/ and $obs =~ m/^[ACGTN]+$/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1308 if (length ($obs) == 1) { #deletion (1000G July)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1309 substr ($ref, 0, 1) eq $obs or die "Error: mismatch of deleted allele and reference allele: ref=$ref obs=$obs in <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1310 $start++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1311 $obs = length ($ref)-1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1312 } elsif (length ($ref) == 1) { #duplication (1000G July)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1313 substr ($obs, 0, 1) eq $ref or die "Error: mismatch of duplicated allele and reference allele: ref=$ref obs=$obs in <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1314 $start++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1315 $obs = "0" . substr ($obs, 1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1316 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1317 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1318 die "Error: invalid record found in VCF file: ref=$ref obs=$obs <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1319 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1320 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1321 die "invalid dbtype: $dbtype\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1322 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1323
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1324 if ($variant->{$chr, $start, $obs}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1325 my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs}); #most likely, only one piece of information
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1326 for my $i (0 .. @info-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1327 print DROPPED join ("\t", $dbtype, $score), "\t", $info[$i], "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1328 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1329 delete $variant->{$chr, $start, $obs};
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1330 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1331 if (@obs2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1332 for my $j (0 .. @obs2-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1333 if ($variant->{$chr, $start, $obs2[$j]}) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1334 my ($ref, @info) = split (/\n/, $variant->{$chr, $start, $obs2[$j]}); #most likely, only one piece of information
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1335 for my $i (0 .. @info-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1336 print DROPPED join ("\t", $dbtype, $score2[$j]), "\t", $info[$i], "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1337 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1338 delete $variant->{$chr, $start, $obs2[$j]};
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1339 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1340 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1341 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1342 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1343 for my $key (keys %$variant) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1344 my ($chr, $start, $obs) = split ($;, $key); #hash key separator
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1345 my ($ref, @info) = split (/\n/, $variant->{$key});
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1346 my $len;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1347 if ($obs =~ m/^(\d+)(.*)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1348 ($len, $obs) = ($1, $2);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1349 $obs ||= '-'; #deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1350 if ($len) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1351 $end = $start+$len-1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1352 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1353 $end = $start;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1354 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1355 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1356 $end = $start;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1357 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1358 for my $i (0 .. @info-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1359 print FIL $info[$i], "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1360 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1361 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1362 printerr "Done\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1363 $count_invalid_dbline and printerr "WARNING: $count_invalid_dbline lines in dbfile $dbfile were ignored due to invalid formats\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1364 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1365
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
#Region-based annotation: for each variant in --queryfile, report the database region(s) overlapping it.
#
#Reads (file-level settings): $queryfile, $outfile, $buildver, $dbtype, $dbtype1, @avcolumn, $comment,
#	$zerostart, $genomebinsize, $minqueryfrac, $time
#Writes:
#	$outfile.${buildver}_$dbtype1	one line per query line that overlaps at least one region
#					(annotation columns, then the unmodified query line)
#	$outfile.invalid_input		query lines that failed validation (deleted again if none were found)
#Dies when a file cannot be opened or when an output file cannot be closed cleanly.
sub annotateQueryByRegion {
	my $annofile = "$outfile.${buildver}_$dbtype1";
	my $invalidfile = "$outfile.invalid_input";

	#NOTE(review): the query file is still opened with 2-arg open so that existing invocations
	#(for example '-' for STDIN) keep working; confirm no caller depends on this before switching to 3-arg
	open (QUERY, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n";
	open (OUT, '>', $annofile) or die "Error: cannot write to output file $annofile: $!\n";
	open (INVALID, '>', $invalidfile) or die "Error: cannot write to output file $invalidfile: $!\n";

	#load the region database; only the first return value is used here
	#(the GFF3 reader was previously assigned to a second variable that was never read)
	my ($regiondb) = ({});
	if ($dbtype eq 'gff3') {
		($regiondb) = readGFF3RegionAnnotation ();
	} elsif ($dbtype eq 'bed') {
		($regiondb) = readBedRegionAnnotation ();
	} else {
		($regiondb) = readUCSCRegionAnnotation ();
	}

	my ($linecount, $invalidcount) = (0, 0);

	$time and printerr "NOTICE: Current time (before examining variants) is ", scalar (localtime), "\n";
	while (<QUERY>) {
		s/[\r\n]+$//;

		if (m/^#/ and $comment) {				#comment line start with #, do not include this in $linecount
			print OUT "#comment\t#comment\t$_\n";
			next;
		}

		$linecount++;

		my @nextline = split (/\s+/, $_);
		my ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];	#@avcolumn selects which columns hold chr/start/end/ref/obs
		my $invalid = 0;

		if (not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
			$invalid++;
		} else {
			($ref, $obs) = (uc $ref, uc $obs);
			$zerostart and $start++;
			$chr =~ s/^chr//;
			if ($chr =~ m/[^\w]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {
				$invalid++;				#chr is not a word, or start/end is not a number
			} elsif ($ref eq '-' and $obs eq '-' 		#both are empty allele
				or $ref =~ m/[^ACTG0\-]/ 		#non-standard nucleotide code
				or $obs =~ m/[^ACGT0\-]/ 		#non-standard nucleotide code
				or $start > $end			#start is more than end
				or $ref ne '0' and $end-$start+1 != length ($ref)	#length mismatch with ref
				or $ref eq '-' and $start != $end	#length mismatch for insertion
				) {
				$invalid++;
			}
		}

		if ($invalid) {
			print INVALID $_, "\n";				#invalid record found
			$invalidcount++;
			next;
		}

		my $bin1 = int ($start/$genomebinsize);			#start bin
		my $bin2 = int ($end/$genomebinsize);			#end bin (usually same as start bin, unless the query spans multiple bins)
		my ($foundhit, $score, $name);
		for my $bin ($bin1 .. $bin2) {
			for my $nextgene (@{$regiondb->{$chr, $bin}}) {
				my ($txstart, $txend, $txscore, $txname) = @$nextgene;

				#db:            <------------------------->
				#query: <--->
				#stop scanning THIS bin (presumably each bin list is sorted by start - TODO confirm against the region readers)
				$end < $txstart and last;

				#db:    <------------------------->
				#query:                              <--->
				$start > $txend and next;

				#the query overlaps the db region; when the query is not completely contained within it,
				#require that at least --minqueryfrac of the query is covered
				my $contained = ($start >= $txstart and $end <= $txend);
				if ($minqueryfrac and not $contained) {
					my $ovstart = $start > $txstart ? $start : $txstart;
					my $ovend = $end < $txend ? $end : $txend;
					if (($ovend-$ovstart+1)/($end-$start+1) < $minqueryfrac) {
						next;
					}
				}

				#record the hit: keep the highest score, and collect the names of all regions sharing that score
				$foundhit++;
				$score ||= $txscore; $name ||= $txname;
				if ($score < $txscore) {
					$score = $txscore;
					$name=$txname;
				}
				if ($score == $txscore and defined $name and $name ne $txname) {
					$name .= ",$txname";
				}
				if ($dbtype1 eq 'cytoBand') {			#a new chromosome band is encountered
					$name ne $txname and $name .= ",$txname";
				}
			}
		}
		$linecount =~ m/000000$/ and printerr "NOTICE: Finished processing $linecount variants in queryfile\n";
		if ($foundhit) {
			$name ||= '';

			#remove duplicated names but KEEP the order in which they were encountered: going through
			#'keys' of a hash returns them in unspecified order, which made the cytoBand range
			#(first band - last band) and the Name= list differ from run to run
			my %seen;
			my @name = grep {!$seen{$_}++} split (/,/, $name);

			if ($dbtype1 eq 'cytoBand') {
				s/^chr// for @name;
				if (@name >= 2) {
					$name[$#name] =~ s/^\d+//;	#drop the leading chromosome number from the last band
					$name = $name[0] . '-' . $name[$#name];
				} else {
					$name = $name[0];
				}
				print OUT "$dbtype\t$name\t$_", "\n";
			} else {
				$name = join (",", @name);
				print OUT "$dbtype\t", $score?"Score=$score;":"", $name?"Name=$name":"", "\t", $_, "\n";
			}
		}
	}
	close (QUERY);
	#buffered write errors (for example disk full) only surface at close, so check it for the output files
	close (OUT) or die "Error: cannot close output file $annofile: $!\n";
	close (INVALID) or die "Error: cannot close output file $invalidfile: $!\n";
	$time and printerr "NOTICE: Current time (after examining variants) is ", scalar (localtime), "\n";

	printerr "NOTICE: Finished region-based annotation on $linecount genetic variants in $queryfile";
	if ($invalidcount) {
		printerr " (including $invalidcount with invalid format written to $invalidfile)";
	} else {
		unlink ($invalidfile);					#nothing was written to it, so do not leave an empty file behind
	}
	printerr "\n";
	printerr "NOTICE: Output files were written to $annofile\n";
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1537
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Load a GFF3 annotation database into memory for region-based annotation.
#
# The file is $gff3dbfile inside $dbloc (both file-level settings). The first
# line must be the '##gff-version 3' directive. Each feature line must have 9
# tab-delimited fields and an ID=... attribute; features scoring below
# $score_threshold (when that is defined) are skipped.
#
# Returns two hash references:
#   \%regiondb  key is ($chr, $bin) joined by $; ; value is an array ref of
#               [start, end, score, ID] records sorted by start. A feature is
#               stored once in every bin of size $genomebinsize it overlaps.
#   \%parent    ID => string built from the values of the Parent= attribute.
#
# Dies when the file is missing, unreadable, or contains a malformed record.
sub readGFF3RegionAnnotation {
	my ($regioncount, $dbcount) = (0, 0);
	my (%regiondb, %parent);

	my $dbfile = File::Spec->catfile ($dbloc, $gff3dbfile);
	-f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc -buildver $buildver' to download annotation database.\n";

	#three-argument open with a lexical handle: the file name is never interpreted as a mode, and the handle cannot collide with other readers
	open (my $dbfh, '<', $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
	printerr "NOTICE: Reading annotation database $dbfile ... ";

	my $header = <$dbfh>;
	defined $header or $header = '';		#an empty file has no header line at all
	$header =~ m/^##gff-version\s+3/ or die "Error: invalid header line found in the GFF3 database $dbfile (expect to see '##gff-version 3'): <$header>\n";

	while (my $line = <$dbfh>) {
		#the FASTA test must come before the generic comment test, otherwise '##FASTA' is swallowed as a comment and the sequence lines are parsed as (invalid) records
		$line =~ m/^##FASTA/ and last;		#reached the FASTA sequence section of GFF3 file
		$line =~ m/^#/ and next;		#skip comments line
		$line =~ m/^\s*$/ and next;		#blank lines are to be ignored in GFF3
		$dbcount++;
		$line =~ s/[\r\n]+$//;			#deleting the newline characters

		my @record = split (/\t/, $line);
		@record == 9 or die "Error: invalid records found in the GFF3 database $dbfile (9 fields expected): <$line>\n";
		my ($chr, $start, $end, $score, $attribute) = @record[0,3,4,5,8];
		$chr =~ s/^chr//;			#sometimes the chr prefix is present and should be removed (query usually does not contain this chr prefix)
		defined $score_threshold and $score < $score_threshold and next;	#if --score_threshold is set, the low scoring segment will be skipped

		my @feature = split (/;/, $attribute);

		my $name;
		for my $item (@feature) {
			$item =~ m/ID=(\S+)/ and $name = $1;	#when several attributes match, the last one wins
		}
		defined $name or die "Error: invalid record in GFF3 database $dbfile (ID field not found): <$line>\n";

		for my $item (@feature) {
			$item =~ m/Parent=(.+)/ or next;
			#NOTE(review): multiple parents are concatenated without any separator; kept as is because the consumer of %parent is not visible here
			for my $parentid (split (/,/, $1)) {
				$parent{$name} .= $parentid;
			}
		}

		my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			push @{$regiondb{$chr, $nextbin}}, [$start, $end, $score, $name];
		}
		$regioncount++;

		if ($verbose and $dbcount =~ m/000000$/) {	#report memory usage every million records
			my ($availmem, $allmem) = currentAvailMemory();
			printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
		}
	}
	close ($dbfh);

	for my $key (keys %regiondb) {		#pre-sort regions in each bin by start position to facilitate future use
		@{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
	}
	printerr "Done with $regioncount regions\n";
	return (\%regiondb, \%parent);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1593
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Load a BED file into memory for region-based annotation.
#
# The file is $bedfile inside $dbloc (both file-level settings). Only the
# first three tab-delimited columns (chromosome, start, end) are used. The
# start is converted from the zero-based BED convention to one-based, and a
# leading 'chr' is removed from the chromosome name.
#
# Blank lines, '#' comments and 'browser'/'track' header lines are skipped.
#
# Returns a hash reference: key is ($chr, $bin) joined by $; ; value is an
# array ref of [start, end, 0, 'NA'] records sorted by start. A region is
# stored once in every bin of size $genomebinsize it overlaps.
#
# Dies when the file is missing, unreadable, or a record has fewer than three
# fields.
sub readBedRegionAnnotation {
	my ($regioncount, $dbcount) = (0, 0);
	my %regiondb;

	my $dbfile = File::Spec->catfile ($dbloc, $bedfile);
	-f $dbfile or die "Error: required bedfile $dbfile does not exists.\n";

	#three-argument open with a lexical handle: the file name is never interpreted as a mode
	open (my $dbfh, '<', $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
	printerr "NOTICE: Reading annotation database $dbfile ... ";

	while (my $line = <$dbfh>) {
		$dbcount++;
		$line =~ s/[\r\n]+$//;			#deleting the newline characters

		#lines that carry no region must not be turned into bogus records
		$line =~ m/^\s*$/ and next;				#blank line
		$line =~ m/^#/ and next;				#comment line
		$line =~ m/^(?:browser|track)(?:\s|$)/ and next;	#UCSC display header lines

		my @record = split (/\t/, $line);
		@record >= 3 or die "Error: invalid record found in bedfile $dbfile (at least 3 tab-delimited fields expected): <$line>\n";
		my ($chr, $start, $end) = @record;

		$chr =~ s/^chr//;
		$start++;				#due to the zero-opening coordinate system in UCSC

		my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			push @{$regiondb{$chr, $nextbin}}, [$start, $end, 0, 'NA'];
		}
		$regioncount++;

		if ($verbose and $dbcount =~ m/000000$/) {	#report memory usage every million lines
			my ($availmem, $allmem) = currentAvailMemory();
			printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
		}
	}
	close ($dbfh);

	for my $key (keys %regiondb) {		#pre-sort regions in each bin by start position to facilitate future use
		@{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
	}
	printerr "Done with $regioncount regions\n";
	return (\%regiondb);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1636
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Load a UCSC-style tab-delimited annotation table into memory for
# region-based annotation.
#
# The table is "${buildver}_$dbtype1.txt" inside $dbloc. The column layout is
# selected from $dbtype1:
#   $expectedLength  number of columns expected in each line
#   @positionCols    indices of the ($chr, $start, $end) columns
#   @scoreCols       indices of the ($score, $normscore) columns; empty when
#                    the table has no score (both are then set to 0). Must be
#                    empty or hold exactly two indices.
#   @colsToOutput    indices of the columns joined with ':' to form the name
# For an unrecognised $dbtype1 the column count is taken from the first line.
# $scorecolumn overrides @scoreCols and @colsWanted overrides @colsToOutput.
#
# Records below $score_threshold / $normscore_threshold (when defined) are
# skipped. The start is converted to one-based and a leading 'chr' removed.
#
# Returns a hash reference: key is ($chr, $bin) joined by $; ; value is an
# array ref of [start, end, score, name] records (name omitted when no output
# column is selected) sorted by start. The stored score is the normalized
# score unless $rawscore is set. A region is stored once in every bin of size
# $genomebinsize it overlaps.
#
# Dies when the file is missing, unreadable, or a record is malformed.
sub readUCSCRegionAnnotation {
	my ($dbfile);
	my ($regioncount, $dbcount) = (0, 0);
	my (%regiondb);
	my ($expectedLength, @positionCols, @scoreCols, @colsToOutput);

	if ($dbtype1 =~ m/^mce(\d+way)$/) {
		$dbfile = File::Spec->catfile ($dbloc, "${buildver}_phastConsElements$1.txt");
	} else {
		$dbfile = File::Spec->catfile ($dbloc, "${buildver}_$dbtype1.txt");
	}
	-f $dbfile or die "Error: required database $dbfile does not exists. Please use 'annotate_variation.pl -downdb $dbtype $dbloc' to download annotation database.\n";

	#column layout for each known table
	if ($dbtype1 =~ m/^phastConsElements\d+way/) {
		$expectedLength = 6;
		@positionCols = (1,2,3);
		@scoreCols = (4,5);		#normalized score
		@colsToOutput = (4);		#lod=xxx is the Name output
	} elsif ($dbtype1 eq 'evofold') {
		$expectedLength = 10;
		@positionCols = (1,2,3);
		@scoreCols = (5,5);
		@colsToOutput = (4);
	} elsif ($dbtype1 eq 'tfbsConsSites') {
		$expectedLength = 8;
		@positionCols = (1,2,3);
		@scoreCols = (7,5);
		@colsToOutput = (4);
	} elsif ($dbtype1 eq 'wgRna') {
		$expectedLength = 10;
		@positionCols = (1,2,3);
		@scoreCols = (5,5);
		@colsToOutput = (4);
	} elsif ($dbtype1 eq 'targetScanS') {
		$expectedLength = 7;
		@positionCols = (1,2,3);
		@scoreCols = (5,5);
		@colsToOutput = (4);
	} elsif ($dbtype1 eq 'genomicSuperDups') {
		$expectedLength = 30;
		@positionCols = (1,2,3);
		@scoreCols = (27,27);
		@colsToOutput = (4);
	} elsif ($dbtype1 eq 'omimGene') {
		$expectedLength = 5;
		@positionCols = (1,2,3);
		@scoreCols = ();
		@colsToOutput = (4);
	} elsif ($dbtype1 eq 'gwasCatalog') {
		$expectedLength = 23;
		@positionCols = (1,2,3);
		@scoreCols = ();
		@colsToOutput = (10);
	} elsif ($dbtype1 eq 'dgv') {
		$expectedLength = 16;
		@positionCols = (1,2,3);
		@scoreCols = ();
		@colsToOutput = (4);
	} elsif ($dbtype1 eq 'cytoBand') {			#special handling required
		$expectedLength = 5;
		@positionCols = (0,1,2);
		@scoreCols = ();
		@colsToOutput = (0,3);
	} elsif ($dbtype1 =~ m/^chr\w+_chainSelf$/) {		#example: chr1_selfChain
		$expectedLength = 13;
		@positionCols = (2,4,5);
		@scoreCols = (12,12);
		@colsToOutput = (11);
	} elsif ($dbtype1 =~ m/^chr\w+_chain\w+$/) {		#example: chr1_chainPanTro2
		$expectedLength = 12;
		@positionCols = (2,4,5);
		@scoreCols = ();
		@colsToOutput = (11);
	} elsif ($dbtype1 eq 'snp130' or $dbtype1 eq 'snp131') {
		$expectedLength = 18;
		@positionCols = (1,2,3);
		@scoreCols = ();
		@colsToOutput = (4);
	} else {
		#other UCSC format if file is not defined above
		$expectedLength = '';
		@positionCols = (1,2,3);
		@scoreCols = ();
		@colsToOutput = (4);
	}

	if ($scorecolumn) {
		@scoreCols = ($scorecolumn, $scorecolumn);
	}

	#three-argument open with a lexical handle: the file name is never interpreted as a mode
	open (my $dbfh, '<', $dbfile) or die "Error: cannot read from database file $dbfile: $!\n";
	printerr "NOTICE: Reading annotation database $dbfile ... ";

	if ($expectedLength eq '') {		#if DB is unknown "generic format" use first line to get $expectedLength : file rewound afterwards
		my $firstline = <$dbfh>;
		if (defined $firstline) {
			#strip the line ending first so that the count is obtained exactly the same way as for the records read below
			$firstline =~ s/[\r\n]+$//;
			my @field = split (/\t/, $firstline);
			$expectedLength = @field;
		} else {
			$expectedLength = 0;	#empty file
		}
		seek ($dbfh, 0, 0);
	}

	#check to see if user has defined columns to output (integers, 'all' or 'none' allowed)
	if (defined $colsWanted) {
		if ($colsWanted[0] eq 'all') {
			@colsToOutput = 0 .. ($expectedLength-1);
		} elsif ($colsWanted[0] eq 'none') {
			@colsToOutput = ();
		} else {
			@colsToOutput = @colsWanted;
		}
	}

	#check that the columns requested exist in the current DB
	#NOTE(review): the values are used below as zero-based indices into the record, so a value equal to $expectedLength is also out of range; confirm the intended indexing base of --colsWanted before tightening this test
	for my $col (@colsToOutput) {
		if ($col > $expectedLength) {
			die "Error: The DB file $dbfile has only $expectedLength columns but output column $col is requested by --colsWanted!\n";
		}
	}

	while (my $line = <$dbfh>) {
		$dbcount++;
		$line =~ s/[\r\n]+$//;		#deleting the newline characters
		my @record = split (/\t/, $line);

		@record == $expectedLength or die "Error: invalid record in dbfile $dbfile ($expectedLength fields expected): <$line>\n";
		my ($chr, $start, $end) = @record[@positionCols];

		my $name;			#stays undefined when no output column is selected
		if (@colsToOutput) {
			$name = join (':', @record[@colsToOutput]);
		}

		my ($score, $normscore) = @scoreCols ? @record[@scoreCols] : (0, 0);

		#unusual exception for phastCons: the score column reads 'lod=xxx'
		if ($dbtype1 =~ m/^phastConsElements\d+way/) {
			$score =~ s/^lod=// or die "Error: invalid lod score designation (no 'lod=' found) in dbfile $dbfile: <$line>\n";
		}

		#unusual exception for cytoBand: the name for chromosome band is concatenated as single word
		if ($dbtype1 eq 'cytoBand' and not defined $colsWanted) {
			$name =~ s/://;
		}

		defined $score_threshold and $score < $score_threshold and next;		#if --score_threshold is set, the low scoring segment will be skipped
		defined $normscore_threshold and $normscore < $normscore_threshold and next;	#if --normscore_threshold is set, the low scoring segment will be skipped

		$chr =~ s/^chr//;
		$start++;			#due to the zero-opening coordinate system in UCSC

		if ($rawscore) {		#print out rawscore, rather than normalized score (default)
			$normscore = $score;
		}

		my ($bin1, $bin2) = (int($start/$genomebinsize), int($end/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			if (defined $name) {
				push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore, $name];
			} else {		#name is not requested in the output
				push @{$regiondb{$chr, $nextbin}}, [$start, $end, $normscore];
			}
		}
		$regioncount++;

		if ($verbose and $dbcount =~ m/000000$/) {	#report memory usage every million records
			my ($availmem, $allmem) = currentAvailMemory();
			printerr "NOTICE: Current system available memory is $availmem kb (this ANNOVAR program used $allmem kb)\n";
		}
	}
	close ($dbfh);

	for my $key (keys %regiondb) {		#pre-sort regions in each bin by start position to facilitate future use
		@{$regiondb{$key}} = sort {$a->[0] <=> $b->[0]} @{$regiondb{$key}};
	}

	printerr "Done with $regioncount regions";
	#both thresholds are tested with defined(), so that a threshold of 0 is still reported
	if (defined $score_threshold or defined $normscore_threshold) {
		printerr " (that passed --score_threshold or --normscore_threshold from a total of $dbcount regions)\n";
	} else {
		printerr "\n";
	}
	return (\%regiondb);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1826
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1827
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Translate a DNA sequence into a protein string, one residue per codon.
#
# The sequence is upper-cased and cut into consecutive triplets; trailing
# bases that do not fill a triplet are ignored (no warning is issued for a
# length that is not a multiple of 3). Each triplet is looked up in the
# file-level %codon1 table, and a warning is printed for a triplet that is
# not in the table.
#
# Returns the protein string, or undef when the sequence holds no complete
# triplet.
sub translateDNA {
	my ($seq) = @_;
	my $protein;
	my @triplet = (uc $seq) =~ m/(...)/g;
	for my $codon (@triplet) {
		defined $codon1{$codon} or printerr "WARNING: invalid triplets found in DNA sequence to be translated: <$codon>\n";
		$protein .= $codon1{$codon};
	}
	return $protein;
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1839
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Translate an RNA sequence into a protein string, one residue per codon.
#
# The sequence is upper-cased and cut into consecutive triplets; trailing
# bases that do not fill a triplet are ignored (no warning is issued for a
# length that is not a multiple of 3). Each triplet is looked up in the
# file-level %codonr1 table, and a warning is printed for a triplet that is
# not in the table.
#
# Returns the protein string, or undef when the sequence holds no complete
# triplet.
sub translateRNA {
	my ($seq) = @_;
	my $protein;
	my @triplet = (uc $seq) =~ m/(...)/g;
	for my $codon (@triplet) {
		defined $codonr1{$codon} or printerr "WARNING: invalid triplets found in RNA sequence to be translated: <$codon>\n";
		$protein .= $codonr1{$codon};
	}
	return $protein;
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1851
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Return the reverse complement of a DNA sequence.
#
# Only the characters a, c, g, t (either case) are complemented, and case is
# preserved; every other character is kept unchanged at its mirrored position.
sub revcom {
	my ($seq) = @_;
	my $revcom = scalar reverse $seq;	#reverse the string first ...
	$revcom =~ tr/acgtACGT/tgcaTGCA/;	#... then complement each base
	return ($revcom);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1858
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Read selected mRNA sequences from the FASTA file that accompanies a gene
# annotation database.
#
# Argument:
#   $refseqvar  hash reference whose keys are the sequence identifiers wanted;
#               only identifiers with a true value are read.
#
# The file is "${buildver}_${dbtype1}Mrna.fa" inside $dbloc. The identifier of
# a sequence is the first whitespace-delimited word after '>'. When an
# identifier occurs several times in the file, the first occurrence is kept.
# Sequences are stored in upper case.
#
# Returns a hash reference mapping identifier => sequence. A warning with up
# to three example identifiers is printed when fewer sequences were found than
# requested.
#
# Dies when the FASTA file is missing or unreadable.
sub readSeqFromFASTADB {
	my ($refseqvar) = @_;
	my (%seqhash);

	my $seqdbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1" . "Mrna.fa");
	my ($seqid, $curseq) = ('', '');

	-f $seqdbfile or die "Error: FASTA sequence file $seqdbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc' download the database.\n";
	#three-argument open with a lexical handle: the file name is never interpreted as a mode
	open (my $seqfh, '<', $seqdbfile) or die "Error: cannot read from seqdbfile $seqdbfile: $!\n";
	printerr "NOTICE: Reading FASTA sequences from $seqdbfile ... ";

	while (my $line = <$seqfh>) {
		if ($line =~ m/^>(\S+)/) {
			my $nextid = $1;	#save the capture at once, before anything can overwrite it
			if ($refseqvar->{$seqid}) {
				#finish reading the sequence for seqid and save it (unless the sequence is already read from the file)
				defined $seqhash{$seqid} or $seqhash{$seqid} = $curseq;
			}
			$seqid = $nextid;
			$curseq = '';
		} elsif ($refseqvar->{$seqid}) {
			$line =~ s/[\r\n]+$//;
			$curseq .= uc $line;	#only use upper case characters
		}
	}
	if ($refseqvar->{$seqid}) {		#finish the last sequence in the file
		defined $seqhash{$seqid} or $seqhash{$seqid} = $curseq;
	}
	close ($seqfh);

	printerr "Done with ", scalar keys %seqhash, " sequences\n";
	if (keys %seqhash < keys %$refseqvar) {
		my @seqnotfound = grep {not exists $seqhash{$_}} keys %$refseqvar;
		printerr "WARNING: A total of ${\(scalar @seqnotfound)} sequences cannot be found in $seqdbfile";
		my @seqnotfound_example = splice (@seqnotfound, 0, 3);
		printerr " (example: @seqnotfound_example)\n";
	}
	return (\%seqhash);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1902
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Read a knownGene cross-reference (kgXref) table and build a lookup table from it.
#
# Arguments:
#   $inputfile - path to the tab-delimited kgXref file
# Returns:
#   reference to a hash that maps the first column of each record to its fifth
#   column (the gene name, with a leading "Em:" rewritten to "Em.")
# Dies:
#   if the file cannot be opened, or if a non-comment record has fewer than 8 fields
#
# Lines starting with '#' are skipped. Extra trailing columns are tolerated, and
# trailing empty columns are counted as fields rather than silently dropped.
sub readKgXref {
	my ($inputfile) = @_;
	my (%gene_xref);
	#three-argument open with a lexical handle: the file name is never interpreted as a mode or a command
	open (my $xref_fh, '<', $inputfile) or die "Error: cannot read from kgxref file $inputfile: $!\n";
	while (my $line = <$xref_fh>) {
		$line =~ m/^#/ and next;
		$line =~ s/[\r\n]+$//;
		#a negative limit keeps trailing empty fields, so a record whose last column is empty is still counted correctly
		my @record = split (/\t/, $line, -1);
		@record >= 8 or die "Error: invalid record found in knownGene cross-reference file (>=8 fields expected but found ${\(scalar @record)}): <$line>\n";
		#some genes were given names that are prefixed with "Em:" which should be removed due to the presence of ":" in exonic variant annotation
		#Em:AC006547.7 Em:AC005003.4 Em:U62317.15 Em:AC008101.5 Em:AC004997.11 Em:U51561.2
		$record[4] =~ s/^Em:/Em./;
		if ($gene_xref{$record[0]}) {			#BC003168 occur twice in kgxref file (OSBPL10, BC003168)
			#an accession-like name (BC.../AK... followed by digits) recorded earlier is replaced by the later record; any other name is kept
			if ($gene_xref{$record[0]} =~ m/^(BC|AK)\d+$/) {
				$gene_xref{$record[0]} = $record[4];
			}
		} else {
			$gene_xref{$record[0]} = $record[4];
		}
	}
	close ($xref_fh);
	return (\%gene_xref);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1926
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# Read a UCSC-style gene definition table (refGene, knownGene, ensGene, or another
# table using the same layout) and index the transcripts by chromosome and genome bin.
# Transcript start, CDS start and exon starts are converted from 0-based to 1-based coordinates.
#
# Arguments:
#   $dbloc - directory that holds "<buildver>_<dbtype1>.txt" (and "<buildver>_kgXref.txt" for knownGene)
# Reads the file-level variables $dbtype1, $dbtype, $buildver, $neargene, $genomebinsize and $verbose.
# Returns a list of four hash references:
#   \%genedb    - key is ($chr, $bin) joined as a multi-dimensional hash key; value is an array,
#                 sorted by txstart, of records
#                 [name, strand, txstart, txend, cdsstart, cdsend, [exon starts], [exon ends], name2]
#   \%geneidmap - transcript name => name2
#   \%cdslen    - transcript name => CDS length summed over exons
#   \%mrnalen   - transcript name => mRNA length summed over exons
# Dies:
#   if a required file is missing or unreadable, or if a record is malformed
#   (wrong field count, chromosome without "chr" prefix, strand other than +/-, exon count mismatch)
sub readUCSCGeneAnnotation {
	my ($dbloc) = @_;
	my ($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes);
	my (%genedb, %geneidmap, %name2count, %cdslen, %mrnalen);
	my ($genecount, $ncgenecount) = (0, 0);

	my $kgxref;

	#every dbtype uses the same file naming scheme (dbtypes other than refGene/knownGene/ensGene have been accepted since 2011feb18)
	my $dbfile = File::Spec->catfile($dbloc, $buildver . "_$dbtype1.txt");
	if ($dbtype1 eq 'knownGene') {
		#knownGene records carry no gene name, so it is looked up in the accompanying cross-reference table
		my $kgxreffile = File::Spec->catfile($dbloc, $buildver . "_kgXref.txt");
		-f $kgxreffile or die "Error: the knownGene cross-reference file $kgxreffile does not exist. Please use 'annotate_variation.pl --downdb knownGene $dbloc' to download the database.\n";
		$kgxref = readKgXref ($kgxreffile);
	}
	-f $dbfile or die "Error: The gene annotation database $dbfile does not exist. Please use 'annotate_variation.pl --downdb $dbtype $dbloc -build $buildver' to download the database.\n";

	#three-argument open with a lexical handle: the file name is never interpreted as a mode or a command
	open (my $genedb_fh, '<', $dbfile) or die "Error: cannot read from gene annotaion database $dbfile: $!\n";
	printerr "NOTICE: Reading gene annotation from $dbfile ... ";
	while (my $line = <$genedb_fh>) {
		$line =~ s/[\r\n]+$//;							#deleting the newline characters
		my @record = split (/\t/, $line);

		if ($dbtype1 eq 'refGene') {
			@record == 16 or die "Error: invalid record in $dbfile (expecting 16 tab-delimited fields in refGene file): <$line>\n";
			($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15];		#human hg18, mouse
		} elsif ($dbtype1 eq 'knownGene') {
			@record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in knownGene file): <$line>\n";	#mm8=11, hg18=hg19=12
			($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend) = @record[0..9];
			$name2 = $kgxref->{$name} || $name;			#fall back to the transcript name when no cross-reference is available
		} elsif ($dbtype1 eq 'ensGene') {
			@record == 16 or die "Error: invalid record in $dbfile (expecting 16 fields in ensGene file): <$line>\n";
			($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15];
		} else {
			@record >= 11 or die "Error: invalid record in $dbfile (>=11 fields expected in $dbtype1 gene definition file): <$line>\n";
			($name, $chr, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exoncount, $exonstart, $exonend, $id, $name2, $cdsstartstat, $cdsendstat, $exonframes) = @record[1..15];
			defined $name2 or $name2=$name;				#records shorter than 13 fields have no name2 column
			#die "FATAL ERROR: the --dbtype $dbtype is not supported in readUCSCGeneAnnotation() subroutine.\n";		#commented 2011feb18
		}

		#handle situations where the same transcript is mapped to several chromosomes or regions (for example, NM_019105 is mapped to chr6, chr6_cox_hap1, chr6_qbl_hap2; NM_002538 is mapped to chr5 positive and negative strand and also in chr5_h2_hap1)
		if ($chr =~ m/hap\d+$/) {
			next;								#this is a temporary solution on 2011feb19, to ignore alternative haplotype chromosomes
		}

		$chr =~ s/^chr// or die "Error: invalid record found in $dbfile (chrom field not found): <$line>\n";						#UCSC always prefix "chr" to the chromosome identifier, so this is a good check to make sure that the file is the correct file
		$dbstrand eq '+' or $dbstrand eq '-' or die "Error: invalid dbstrand information found in $dbfile (dbstrand has to be + or -): <$line>\n";	#dbstrand is important to know and cannot be optional
		my @exonstart = split (/,/, $exonstart); 			#split also discards the empty field after the trailing comma
		my @exonend = split (/,/, $exonend);				#split also discards the empty field after the trailing comma
		$exoncount == @exonstart or die "Error: invalid record found in $dbfile (exoncount discordance): <$exoncount vs ${\(scalar @exonstart)}>\n";
		@exonstart == @exonend or die "Error: invalid record found in $dbfile (exonstart and exonend count discordance): <${\(scalar @exonstart)} vs ${\(scalar @exonend)}>\n";

		#convert 0-based coordinate to 1-based coordinate
		$txstart++;
		$cdsstart++;
		for my $onestart (@exonstart) {
			$onestart++;
		}

		#LOGIC here:
		#first calculate mRNA length, and if the transcript maps to multiple locations with discordant mRNA length, only consider the leftmost chromosome and leftmost coordinate (because the FASTA file is sorted in this manner)

		my $cdslength = 0;
		my $mrnalength = 0;
		for my $i (0 .. @exonstart-1) {					#this calculation is valid regardless of strand
			$mrnalength += $exonend[$i]-$exonstart[$i]+1;
			if ($cdsstart >= $exonstart[$i] and $cdsstart <= $exonend[$i]) {	#this exon contains the CDS start
				if ($cdsend <= $exonend[$i]) {			#the whole CDS lies within this exon
					$cdslength = $cdsend-$cdsstart+1;
					last;
				} else {					#the CDS continues into later exons
					$cdslength += $exonend[$i]-$cdsstart+1;
					next;
				}
			}
			#the branches below only apply once the CDS start has been seen (cdslength is non-zero)
			if ($cdslength and $cdsend < $exonstart[$i]) {
				die "FATAL ERROR: impossible scenario for $name in $dbfile (cdsend is less than exon start)";
			} elsif ($cdslength and $cdsend <= $exonend[$i]) {	#this exon contains the CDS end
				$cdslength += $cdsend-$exonstart[$i]+1;
				last;
			} elsif ($cdslength and $cdsend > $exonend[$i]) {	#this exon is entirely coding
				$cdslength += $exonend[$i]-$exonstart[$i]+1;
			}

		}

		if ($cdsstart != $cdsend+1) {					#coding gene
			if (defined $mrnalen{$name} and $mrnalen{$name} != $mrnalength) {
				$verbose and printerr "WARNING: $name occurs more than once in $dbfile with different mRNA length. The first occurences with identical mRNA length will be uesd in analysis.\n";
				next;
			}


			if (defined $cdslen{$name} and $cdslen{$name} != $cdslength) {
				$verbose and printerr "WARNING: $name occurs more than once in $dbfile with different CDS length. The first occurences with identical mRNA length will be uesd in analysis.\n";
				next;
			}
		}

		$cdslen{$name} = $cdslength;
		$mrnalen{$name} = $mrnalength;

		#register the transcript in every bin overlapped by the transcript extended by $neargene on both sides
		my ($bin1, $bin2) = (int(($txstart - $neargene)/$genomebinsize), int(($txend + $neargene)/$genomebinsize));
		for my $nextbin ($bin1 .. $bin2) {
			push @{$genedb{$chr, $nextbin}}, [$name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, [@exonstart], [@exonend], $name2];
		}
		$geneidmap{$name} = $name2;
		$genecount++;
		$name2count{$name2}++;
		$cdsstart == $cdsend+1 and $ncgenecount++;			#non-coding gene has the same start and end site
	}
	close ($genedb_fh);
	for my $key (keys %genedb) {						#pre-sort gene DB by txstart to facilitate future use
		@{$genedb{$key}} = sort {$a->[2] <=> $b->[2]} @{$genedb{$key}};
	}
	printerr "Done with $genecount transcripts (including $ncgenecount without coding sequence annotation) for ", scalar (keys %name2count), " unique genes\n";
	return (\%genedb, \%geneidmap, \%cdslen, \%mrnalen);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2046
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2047 sub downloadDB {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2048 my ($cwd, $msg, $sc);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2049
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2050 $cwd = Cwd::cwd();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2051
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2052 -w $dbloc or die "Error: the directory $dbloc is not writable by the current user\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2053 chdir ($dbloc) or die "Error: the directory $dbloc cannot be accessed\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2054
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2055 my (@urlin, @filein, @fileout, %fail); #the fail hash contains index of files that fail to be downloaded
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2056 my $count_success;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2057 if ($dbtype1 eq 'refGene') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2058 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refGene.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2059 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/refLink.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2060 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_refGeneMrna.fa.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2061 } elsif ($dbtype1 eq 'knownGene') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2062 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/knownGene.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2063 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/kgXref.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2064 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_knownGeneMrna.fa.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2065 } elsif ($dbtype1 eq 'ensGene') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2066 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/ensGene.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2067 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_ensGeneMrna.fa.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2068 } elsif ($dbtype1 eq 'seq') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2069 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.zip"; #example: hg18, hg19
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2070 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/chromFa.tar.gz"; #example: panTro2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2071 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/bigZips/$buildver.fa.gz"; #example: bosTau4
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2072 } elsif ($dbtype1 =~ m/^mce(\d+way)$/) { #it could be 17 way, 28 way, 30 way, 44 way, etc, depending on genome and on build
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2073 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/phastConsElements$1.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2074 } elsif ($dbtype1 eq 'avsift') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2075 $buildver eq 'hg18' or $buildver eq 'hg19' or die "Error: currently the --dbtype of avsift only support --buildver of 'hg18' or 'hg19'\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2076 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_avsift.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2077 } elsif ($dbtype1 eq '1000g') { #dbtype1 is same as queryfile, when --downdb is used
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2078 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g' only support --buildver of 'hg18'\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2079 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/CEU.sites.2009_04.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2080 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/YRI.sites.2009_04.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2081 push @urlin, "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2009_04/JPTCHB.sites.2009_04.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2082 } elsif ($dbtype1 eq '1000g2010') { #dbtype1 is same as queryfile, when --downdb is used
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2083 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010' only support --buildver of 'hg18'\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2084 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_03.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2085 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_03.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2086 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_03.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2087 } elsif ($dbtype1 eq '1000g2010jul') { #dbtype1 is same as queryfile, when --downdb is used
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2088 $buildver eq 'hg18' or die "Error: currently the --dbtype of '1000g2010jul' only support --buildver of 'hg18'\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2089 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_CEU.sites.2010_07.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2090 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_YRI.sites.2010_07.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2091 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg18_JPTCHB.sites.2010_07.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2092 } elsif ($dbtype1 eq '1000g2010nov') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2093 $buildver eq 'hg19' or die "Error: currently the --dbtype of '1000g2010nov' only support --buildver of 'hg19'\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2094 push @urlin, "http://www.openbioinformatics.org/annovar/download/hg19_ALL.sites.2010_11.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2095 } elsif ($dbtype1 eq 'null') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2096 1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2097 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2098 if ($webfrom) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2099 if ($webfrom eq 'annovar') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2100 push @urlin, "http://www.openbioinformatics.org/annovar/download/${buildver}_$dbtype1.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2101 } elsif ($webfrom eq 'ucsc') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2102 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2103 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2104 push @urlin, "$webfrom/$dbtype1.txt.gz";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2105 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2106 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2107 push @urlin, "ftp://hgdownload.cse.ucsc.edu/goldenPath/$buildver/database/$dbtype1.txt.gz"; #default goes to UCSC
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2108 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2109 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2110
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2111 @filein = @urlin;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2112 map {s/.+\///} @filein;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2113 @fileout = @filein;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2114 map {s/\.gz$//; s/\.zip$//} @fileout;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2115
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2116 if ($wget) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2117 $msg = qx/wget --help 2>&1/ || ''; #collect the output of the system command
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2118 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2119 $msg = ''; #when --nowget is specified, do not use wget to retrieve files from Internet
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2120 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2121 if ($msg =~ m/Usage/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2122 checkProgramUpdate ("wget");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2123 for my $i (0 .. @urlin-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2124 printerr "NOTICE: Downloading annotation database $urlin[$i] ... ";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2125 if ($verbose) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2126 $sc = "wget -t 1 -T 10 -O $filein[$i] $urlin[$i]";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2127 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2128 $sc = "wget -t 1 -T 10 -q -O $filein[$i] $urlin[$i]";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2129 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2130 if (system ($sc)) { #time-out is 10 seconds, with 1 retry attempt
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2131 printerr "Failed\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2132 $verbose and print "WARNING: unable to execute system command: <$sc>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2133 unlink ($filein[$i]); #delete the temporary files generated by wget
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2134 $fail{$i}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2135 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2136 printerr "OK\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2137 $count_success++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2138 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2139 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2140 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2141 eval {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2142 require Net::FTP;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2143 require LWP::UserAgent;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2144 };
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2145 if ($@) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2146 printerr "WARNING: cannot retrieve remote files automatically (by 'wget' command or by standard Net::FTP/LWP::UserAgent Perl module).\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2147 printerr "Please manually download the following file, uncompress the files to $dbloc directory, then add a ${buildver}_ prefix to the file names.\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2148 printerr join ("\n", @urlin), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2149 exit (100);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2150 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2151
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2152 checkProgramUpdate ("lwp");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2153 my ($http, $ftp);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2154 for my $i (0 .. @urlin-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2155 printerr "NOTICE: Downloading annotation database $urlin[$i] ... ";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2156 if ($urlin[$i] =~ m/^http/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2157 $http = LWP::UserAgent->new (timeout=>10, show_progress=>$verbose);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2158 $http->env_proxy;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2159
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2160 my $response = $http->get ($urlin[$i], ':content_file'=>$filein[$i]);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2161 if ($response->is_success) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2162 printerr "Done\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2163 $count_success++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2164 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2165 printerr "Failed\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2166 $verbose and printerr "WARNING: cannot retrieve remote files ($urlin[$i]) via LWP::UserAgent Perl module: ", $response->status_line, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2167 $fail{$i}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2168 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2169 } elsif ($urlin[$i] =~ m#^ftp://([^\\\/]+)#) { #for hgdownload.cse.ucsc.edu, ftp-trace.ncbi.nih.gov, ftp.ensembl.org, etc
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2170 my $urlroot = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2171 if ($ftp = Net::FTP->new($urlroot, Timeout=>10, Debug=>$verbose)) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2172 $ftp->login("anonymous", 'anonymous@');
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2173 $ftp->binary();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2174 my $url = $urlin[$i];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2175 $url =~ s#ftp://[\w\.\-]+/##; #remove the URL root
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2176 if (not $ftp->get($url)) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2177 printerr "Failed\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2178 $verbose and printerr "WARNING: cannot retrieve remote file ($url) in FTP server $urlroot\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2179 $fail{$i}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2180 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2181 printerr "Done\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2182 $count_success++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2183 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2184 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2185 printerr "Failed\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2186 $verbose and printerr "WARNING: cannot retrieve remote file ($urlin[$i]) via Net::FTP Perl module\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2187 $fail{$i}++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2188 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2189
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2190 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2191 die "Error: The URL $urlin[$i] uses an unsupported protocol. Download cannot continue\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2192 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2193 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2194 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2195
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2196 $count_success and printerr "NOTICE: Uncompressing downloaded files\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2197 for my $i (0 .. @filein-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2198 $fail{$i} and next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2199 if ($filein[$i] =~ m/\.zip$/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2200 $msg = qx/unzip --help 2>&1/ || ''; #collect the output of the system command
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2201 if ($msg =~ m/Usage/i) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2202 if ($verbose) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2203 system ("unzip -o $filein[$i]");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2204 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2205 system ("unzip -o -q $filein[$i]");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2206 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2207 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2208 printerr "ERROR: unzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2209 exit (101);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2210 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2211 } elsif ($filein[$i] =~ m/\.tar\.gz$/) { #panTro2 FASTA sequence is stored as tar.gz rather than zip
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2212 $msg = qx/tar --help 2>&1/ || ''; #collect the output of the system command
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2213 if ($msg =~ m/Usage/i) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2214 system ("tar -x -z -f $filein[$i]");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2215 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2216 printerr "ERROR: tar/gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2217 exit (102);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2218 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2219 } elsif ($filein[$i] =~ m/\.gz$/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2220 $msg = qx/gunzip --help 2>&1/ || ''; #collect the output of the system command
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2221 if ($msg =~ m/Usage/i) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2222 system ("gunzip -f $filein[$i]");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2223 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2224 printerr "ERROR: gunzip is not installed in your system.\nPlease manually uncompress the files (@filein) at the $dbloc directory", $dbtype1 eq 'seq'?", and rename them by adding ${buildver}_ prefix to the file names.\n":".\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2225 exit (103);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2226 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2227 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2228 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2229
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2230 for my $i (0 .. @fileout-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2231 $fail{$i} and next; #skip the file that failed to be downloaded
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2232 my $fileout = $fileout[$i];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2233 $dbtype1 eq 'seq' and next; #the zip file contains dozens of FASTA files so cannot rename them automatically
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2234 if (not $fileout =~ m/^${buildver}_/) { #if the buildver is not the prefix of the files
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2235 rename ($fileout, "${buildver}_$fileout") or die "Error: cannot rename $fileout to ${buildver}_$fileout\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2236 $fileout = "${buildver}_$fileout";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2237 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2238 if (not $fileout =~ m/\.txt$/ and not $fileout =~ m/\.fa$/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2239 rename ($fileout, "$fileout.txt");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2240 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2241 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2242
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2243 $count_success and printerr "NOTICE: Finished downloading annotation files for $buildver build version, with files saved at the '$dbloc' directory\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2244 $cwd and chdir ($cwd);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2245 if (%fail) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2246 my @failindex = keys %fail;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2247 if ($dbtype1 eq 'seq' and @failindex == 1) { #not really a fail, because for seq, ANNOVAR attempts on tar.gz and zip file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2248 1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2249 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2250 printerr "WARNING: Some files cannot be downloaded, including ", join (', ', @urlin[@failindex]), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2251 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2252
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2253 for my $index (@failindex) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2254 if ($urlin[$index] =~ m#^http://www\.openbioinformatics\.org.+Mrna.fa.gz$#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2255 printerr "---------------------------ADDITIONAL PROCEDURE---------------------------\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2256 printerr "--------------------------------------------------------------------------\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2257 printerr "NOTICE: the FASTA file $urlin[$index] is not available to download but can be generated by the ANNOVAR software. ";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2258 printerr "PLEASE RUN THE FOLLOWING TWO COMMANDS CONSECUTIVELY TO GENERATE THE FASTA FILES:\n\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2259 printerr "\tannotate_variation.pl --buildver $buildver --downdb seq $dbloc/${buildver}_seq\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2260 printerr "\tretrieve_seq_from_fasta.pl $dbloc/${buildver}_$dbtype1.txt -seqdir $dbloc/${buildver}_seq -format $dbtype1 -outfile $dbloc/${buildver}_${dbtype1}Mrna.fa\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2261 printerr "--------------------------------------------------------------------------\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2262 printerr "--------------------------------------------------------------------------\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2263 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2264 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2265 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2266 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2267
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# currentAvailMemory: probe the system "top" command for memory statistics.
#
# Returns a two-element list ($availmem, $allmem):
#   $availmem - the free-memory figure (in kb) captured from the "Mem:" summary line of top
#   $allmem   - the RES column of the row whose PID equals this process ($$), converted to kb
#               when it carries a g/m/k unit suffix
# Both values are 0 when they cannot be determined: on MSWin32 and any platform other than
# linux/aix/solaris, when top cannot be started, or when the expected lines are not found.
#
# Dies if the row for this process is reached before a header line containing PID and RES.
sub currentAvailMemory {
	my ($availmem, $allmem) = (0, 0);

	#only these platforms are probed; everything else (including MSWin32) reports (0, 0)
	$^O eq 'linux' or $^O eq 'aix' or $^O eq 'solaris' or return ($availmem, $allmem);

	#a lexical handle is used so that the pipe can be closed reliably on every exit path
	open (my $top, "top -b -n 1 2>&1 |") or return ($availmem, $allmem);

	my $index;						#position of the RES column, learned from the header line
	while (my $line = <$top>) {				#lexical line variable: the caller's $_ is left untouched
		if ($line =~ m/^Mem:.+\s(\d+)k free/) {
			$availmem = $1;
		}
		$line =~ s/^\s+//;
		my @field = split (/\s+/, $line);
		@field >= 10 or next;				#make sure that the PID lines are reached
		if ($field[0] eq 'PID') {
			for my $i (0 .. @field-1) {
				$field[$i] eq 'RES' and $index = $i;
			}
		}
		if ($field[0] eq $$) {
			defined $index or die "Error: invalid output from top command: the line with PID and RES is not found\n";
			$allmem = $field[$index];
			#the suffix must be a letter: a purely numeric RES value (already in kb) must not
			#have its last digit mistaken for a unit
			if ($allmem =~ m/^([\d\.]+)([a-zA-Z])$/) {
				if ($2 eq 'g') {
					$allmem = $1 * 1_000_000;
				} elsif ($2 eq 'm') {
					$allmem = $1 * 1_000;
				} elsif ($2 eq 'k') {
					$allmem = $1;
				} else {
					printerr ("WARNING: unrecognizable output from top command: <$line>\n");
				}
			}
			last;
		}
	}
	close ($top);						#release the pipe and reap the top process
	return ($availmem, $allmem);
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2310
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# printerr: write the given list of strings to both STDERR and the LOG filehandle.
# The value returned is the result of the print to LOG.
sub printerr {
	my @text = @_;
	print {*STDERR} @text;
	return print {*LOG} @text;
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2315
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
# checkProgramUpdate: compare the date embedded in $LAST_CHANGED_DATE with the date published
# in the annovar_date file on the ANNOVAR web site, and print an "update available" banner
# (with the change notes that follow the date in that file) when the web date is newer.
#
# Argument:
#   $method - 'wget' (run the wget program and read the temporary file .annovar_date) or
#             'lwp' (fetch through LWP::UserAgent, which must already be loaded by the caller)
#
# Dies when $method is neither 'wget' nor 'lwp'. Every other failure (date not parsable,
# download failure, empty or malformed web content) prints a short message and returns.
# All messages are written through printerr. No meaningful value is returned.
sub checkProgramUpdate {
	my ($method) = @_;
	my $dateurl = "http://www.openbioinformatics.org/annovar/download/annovar_date";
	my $datefile = ".annovar_date";
	my (@webcontent);

	$method eq 'wget' or $method eq 'lwp' or die "Error: update checking method can be only 'wget' or 'lwp'";
	printerr ("NOTICE: Web-based checking to see whether ANNOVAR new version is available ... ");

	#an explicit test is used so that the early return does not depend on the return value of printerr
	my $curdate;
	if ($LAST_CHANGED_DATE =~ m/LastChangedDate: (\d+)\-(\d+)-(\d+)/) {
		$curdate = $1.$2.$3;
	} else {
		printerr ("Failed\n");
		return;
	}

	if ($method eq 'wget') {
		if (system ("wget -t 1 -T 10 -q -O $datefile $dateurl")) {
			unlink ($datefile);			#do not leave a partial file behind after a failed download
			printerr ("Failed\n");
			return;
		}
		my $avdate;
		if (not open ($avdate, '<', $datefile)) {
			printerr ("Cannot access version information\n");
			return;					#nothing to compare against
		}
		printerr ("Done\n");
		@webcontent = <$avdate>;			#$LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';
		close ($avdate);
		unlink ($datefile);
	} else {						#$method is 'lwp' (validated above)
		my $http = LWP::UserAgent->new (timeout=>10);
		$http->env_proxy;
		my $response = $http->get($dateurl);
		if (not $response->is_success) {
			printerr ("Failed\n");
			return;
		}
		printerr ("Done\n");
		my $content = $response->decoded_content;	#kept in a lexical so that the global $_ is not clobbered
		@webcontent = split (/\n/, defined $content ? $content : '');
	}

	#normalize both retrieval methods to lines without terminators, so that the date-header
	#pattern and the printed change notes behave identically for wget and lwp
	s/[\r\n]+$// for @webcontent;

	@webcontent or return;					#empty reply: no version information available
	my $webdate = $webcontent[0];
	(my $webdate1 = $webdate) =~ s/\-//g;			#remove the - sign in webdate
	if ($webdate1 !~ m/^\d+$/) {				#e.g. an error page was returned instead of a date
		$verbose and printerr ("WARNING: unrecognizable version information retrieved from $dateurl\n");
		return;
	}

	if ($curdate < $webdate1) {
		printerr ("----------------------------UPDATE AVAILABLE------------------------------\n");
		printerr ("--------------------------------------------------------------------------\n");
		printerr ("WARNING: A new version of ANNOVAR (dated $webdate) is available!\n");
		printerr (" Download from http://www.openbioinformatics.org/annovar/\n");

		if (@webcontent >= 2) {
			printerr ("Changes made in the $webdate version:\n");
			for my $entry (@webcontent[1 .. $#webcontent]) {
				if ($entry =~ m/^(\d{4})\-(\d{2})\-(\d{2})$/) {	#a date line starts the notes of an older version
					$webdate = "$1-$2-$3";
					$webdate1 = "$1$2$3";
					if ($curdate >= $webdate1) {		#the current version is more recent than this date
						last;
					} else {
						printerr ("Changes made in the $webdate version:\n");
					}
				} else {
					printerr (" * $entry\n");
				}
			}
		}
		printerr ("--------------------------------------------------------------------------\n");
		printerr ("--------------------------------------------------------------------------\n");
	}
	return;
}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2384
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2385 =head1 SYNOPSIS
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2386
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2387 annotate_variation.pl [arguments] <query-file|table-name> <database-location>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2388
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2389 Optional arguments:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2390 -h, --help print help message
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2391 -m, --man print complete documentation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2392 -v, --verbose use verbose output
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2393
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2394 Arguments to download databases or perform annotations
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2395 --downdb download UCSC Genome Browser annotation database
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2396 --geneanno annotate variants by functional consequences on genes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2397 --regionanno annotate variants by targeting specific genomic regions
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2398 --filter filter variants based on a position list
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2399 --webfrom <string> specify the source of database (default usually works fine)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2400
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2401 Arguments to control input and output files
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2402 --outfile <file> output file prefix
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2403 --zerostart input query file uses half-open zero-start coordinate
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2404 --dbtype <string> database type
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2405 --buildver <string> genome build version (default: hg18 for human)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2406 --gff3dbfile <file> specify the GFF3 DB file used in region-based annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2407 --genericdbfile <file> specify the generic DB file used in filter-based annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2408 --vcfdbfile <file> specify the DB file in VCF format in filter-based annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2409 --bedfile <file> specify a BED file in region-based annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2410 --time print out local time during program run
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2411 --separate separately print out all functions of a variant (default: one line per variant)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2412 --colsWanted <string> specify which columns to output in -regionanno by comma-delimited numbers
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2413 --comment print out comment line (those starting with #) in output files
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2414 --scorecolumn <int> the column with scores in database file (for region-based annotation)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2415 --exonsort sort the exon number in output line (for gene-based annotation)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2416 --transcript_function use transcript name rather than gene name in gene-based annotation output
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2417
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2418 Arguments to fine-tune the annotation procedure
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2419 --batchsize <int> batch size for processing variants per batch (default: 5m)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2420 --genomebinsize <int> bin size to speed up search (default: 100k for -geneanno, 10k for -regionanno)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2421 --expandbin <int> check nearby bin to find neighboring genes (default: 2m/genomebinsize)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2422 --neargene <int> distance threshold to define upstream/downstream of a gene
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2423 --score_threshold <float> minimum score of DB regions to use in annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2424 --normscore_threshold <float> minimum normalized score of DB regions to use in annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2425 --rawscore output includes the raw score (not normalized score) in UCSC Browser Track
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2426 --minqueryfrac <float> minimum percentage of query overlap to define match to DB (default: 0)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2427 --splicing_threshold <int> distance between splicing variants and exon/intron boundary (default: 2)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2428 --maf_threshold <float> filter 1000G variants with MAF above this threshold (default: 0)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2429 --sift_threshold <float> SIFT threshold for deleterious prediction (default: 0.05)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2430 --precedence <string> comma-delimited to specify precedence of variant function (default: exonic>intronic...)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2431
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2432 Arguments to control memory usage
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2433 --memfree <int> ensure minimum amount of free system memory (default: 100000, in the order of kb)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2434 --memtotal <int> limit total amount of memory used by ANNOVAR (default: 0, unlimited, in the order of kb)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2435 --chromosome <string> examine these specific chromosomes in database file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2436
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2437
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2438 Function: annotate a list of genetic variants against genome annotation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2439 databases saved at local disk.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2440
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2441 Example: #download gene annotation database (for hg18 build) and save to humandb/ directory
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2442 annotate_variation.pl -downdb gene humandb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2443 annotate_variation.pl -buildver mm9 -downdb mce30way mousedb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2444 annotate_variation.pl -downdb snp130 humandb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2445
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2446 #gene-based annotation of variants in the varlist file (by default --geneanno is ON)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2447 annotate_variation.pl ex1.human humandb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2448
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2449 #region-based annotate variants
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2450 annotate_variation.pl -regionanno -dbtype mce44way ex1.human humandb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2451 annotate_variation.pl -regionanno -dbtype gff3 -gff3dbfile tfbs.gff3 ex1.human humandb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2452
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2453 #filter rare or unreported variants (in 1000G/dbSNP) or predicted deleterious variants
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2454 annotate_variation.pl -filter -dbtype 1000g_ceu -maf 0.01 ex1.human humandb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2455 annotate_variation.pl -filter -dbtype snp130 ex1.human humandb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2456 annotate_variation.pl -filter -dbtype avsift ex1.human humandb/
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2457
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2458 Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2459
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2460 =head1 OPTIONS
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2461
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2462 =over 8
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2463
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2464 =item B<--help>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2465
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2466 print a brief usage message and detailed explanation of options.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2467
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2468 =item B<--man>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2469
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2470 print the complete manual of the program.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2471
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2472 =item B<--verbose>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2473
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2474 use verbose output.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2475
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2476 =item B<--downdb>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2477
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2478 download annotation databases from UCSC Genome Browser, Ensembl, 1000 Genomes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2479 Project or other resources. The annotation files in this database are required
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2480 for the functional annotation of variants.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2481
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2482 =item B<--geneanno>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2483
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2484 perform gene-based annotation. For each variant, examine whether it hit exon,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2485 intron, intergenic region, or close to a transcript, or hit a non-coding RNA
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2486 gene, or is located in an untranslated region. In addition, for an exonic variant,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2487 determine whether it causes splicing change, non-synonymous amino acid change,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2488 synonymous amino acid change or frameshift changes.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2489
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2490 =item B<--regionanno>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2491
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2492 perform region-based annotation. For each variant, examine whether it overlaps
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2493 with a specific genomic region, such as the most conserved elements, the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2494 predicted transcription factor binding sites, the specific cytogenetic bands, the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2495 evolutionarily conserved RNA secondary structures and so on.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2496
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2497 =item B<--filter>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2498
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2499 perform filter-based annotation. For each variant, filter it against a
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2500 variation database, such as the 1000 Genomes Project database and the dbSNP
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2501 database, and identify a subset that have not been reported in these databases
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2502 as novel variants.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2503
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2504 =item B<--outfile>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2505
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2506 specify the output file prefix. Several output files will be generated using
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2507 this prefix and different suffixes. A directory name can also be specified as
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2508 part of the argument, so that the output files can be written to a different
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2509 directory than the current directory.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2510
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2511 =item B<--zerostart>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2512
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2513 utilize the half-open zero-start coordinate system that is used by many UCSC
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2514 Genome Browser annotation tables. By default, the 1-based coordinate system will
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2515 be used.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2516
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2517 =item B<--dbtype>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2518
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2519 specify the database type to be used in gene-based, region-based or filter-based
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2520 annotations. For gene-based annotation, by default refGene annotations from the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2521 UCSC Genome Browser will be used for annotating variants. However, users can
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2522 switch to utilize Ensembl annotations instead, or use the UCSC Gene annotations.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2523 In general, RefSeq gene annotations are more conservative, and UCSC Gene
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2524 annotations are most liberal with many predicted genes and non-coding RNAs. For
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2525 region-based annotations, users can select any UCSC annotation databases (by
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2526 providing the database name), or alternatively select a Generic Feature Format
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2527 version 3 (GFF3) formatted file for annotation (by providing 'gff3' as the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2528 --dbtype and providing the --gff3dbfile argument). For filter-based annotations,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2529 users can select a dbSNP file, a 1000G file, a generic format file (with simple
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2530 columns including chr, start, end, reference, observed, score), a VCF format
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2531 (which is the currently popular format for variant exchange), or an avsift format
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2532 (which is identical to the generic format but is provided for convenience).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2533
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2534 =item B<--buildver>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2535
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2536 genome build version to use. By default, the hg18 build for human genome is
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2537 used. The build version will be used by ANNOVAR to identify corresponding database files
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2538 automatically, for example, when gene-based annotation is used for hg18 build,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2539 ANNOVAR will search for the hg18_refGene.txt file, but if hg19 is used as
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2540 --buildver, ANNOVAR will examine hg19_refGene.txt instead.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2541
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2542 =item B<--gff3dbfile>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2543
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2544 specify the GFF3-formatted database file used in the region-based annotation.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2545
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2546 =item B<--genericdbfile>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2547
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2548 specify the generic format database file used in the filter-based annotation.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2549
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2550 =item B<--vcfdbfile>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2551
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2552 specify the database file in VCF format in the filter-based annotation. VCF has
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2553 been a popular format for summarizing SNP and indel calls in a population of
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2554 samples, and has been adopted by 1000 Genomes Project in their most recent data
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2555 release.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2556
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2557 =item B<--time>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2558
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2559 print out the local time during execution of the program
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2560
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2561 =item B<--separate>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2562
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2563 for gene-based annotation, separate the effects of each variant, so that each
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2564 effect (intronic, exonic, splicing) is printed in one output line. By default,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2565 all effects are printed in the same line, in the comma-separated form of
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2566 'UTR3,UTR5' or 'exonic,splicing'.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2567
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2568 =item B<--colsWanted>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2569
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2570 specify which columns are desired in the output for -regionanno. By default,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2571 ANNOVAR intelligently selects the columns based on the DB type. However, users
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2572 can use a list of comma-delimited numbers, or use 'all', or use 'none', to
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2573 request custom output columns.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2574
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2575 =item B<--comment>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2576
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2577 specify that the program should include comment lines in the output files.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2578 Comment lines are defined as any line starting with #. By default, these lines
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2579 are not recognized as valid ANNOVAR input and are therefore written to the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2580 INVALID_INPUT file. This argument can be very useful to keep columns headers in
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2581 the output file, if the input file uses a comment line to flag the column headers
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2582 (usually the first line in the input file).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2583
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2584 =item B<--scorecolumn>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2585
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2586 specify the column with desired output scores in UCSC database file (for
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2587 region-based annotation). The default usually works okay.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2588
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2589 =item B<--exonsort>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2590
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2591 sort the exon number in output line in the exonic_variant_function file during
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2592 gene-based annotation. If a mutation affects multiple transcripts, the ones with
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2593 the smaller exon number will be printed before the transcript with larger exon
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2594 number in the output.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2595
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2596 =item B<--batchsize>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2597
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2598 this argument specifies the batch size for processing variants by gene-based
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2599 annotation. Normally 5 million variants (usually one human genome will have
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2600 about 3-5 million variants depending on ethnicity) are annotated as a batch, to
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2601 reduce the amounts of memory. The users can adjust the parameters: larger values
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2602 make the program slightly faster, at the expense of slightly larger memory
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2603 requirements. In a 64bit computer, the default settings usually take 1GB memory
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2604 for gene-based annotation for human genome for a typical query file, but this
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2605 depends on the complexity of the query (note that the query has a few required
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2606 fields, but may have many optional fields and those fields need to be read and
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2607 kept in memory).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2608
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2609 =item B<--genomebinsize>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2610
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2611 the bin size of genome to speed up search. By default 100kb is used for gene-
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2612 based annotation, so that variant annotation focuses on specific bins only
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2613 (based on the start-end site of a given variant), rather than searching the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2614 entire chromosomes for each variant. By default 10kb is used for region-based
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2615 annotation. The filter-based annotations look for variants directly so no bin is
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2616 used.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2617
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2618 =item B<--expandbin>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2619
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2620 expand bin to both sides to find neighboring genes/regions. For gene-based
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2621 annotation, ANNOVAR tries to find nearby genes for any intergenic variant, with
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2622 a maximum number of nearby bins to search. By default, ANNOVAR will
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2623 automatically set this argument to search 2 megabases to the left and right of
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2624 the variant in genome.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2625
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2626 =item B<--neargene>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2627
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2628 the distance threshold to define whether a variant is in the upstream or
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2629 downstream region of a gene. By default 1 kilobase from the start or end site of
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2630 a transcript is defined as upstream or downstream, respectively. This is useful,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2631 for example, when one wants to identify variants that are located in the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2632 promoter regions of genes across the genome.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2633
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2634 =item B<--score_threshold>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2635
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2636 the minimum score to consider when examining region-based annotations on UCSC
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2637 Genome Browser tables. Some tables do not have such scores and this argument
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2638 will not be effective.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2639
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2640 =item B<--normscore_threshold>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2641
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2642 the minimum normalized score to consider when examining region-based annotations
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2643 on UCSC Genome Browser tables. The normalized score is calculated by UCSC,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2644 ranging from 0 to 1000, to make visualization easier. Some tables do not have
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2645 such scores and this argument will not be effective.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2646
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2647 =item B<--rawscore>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2648
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2649 for region-based annotation, print out raw scores from UCSC Genome Browser
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2650 tables, rather than normalized scores. By default, normalized scores are printed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2651 in the output files. Normalized scores are compiled by UCSC Genome Browser for
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2652 each track, and they usually range from 0 to 1000, but there are some
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2653 exceptions.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2654
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2655 =item B<--minqueryfrac>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2656
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2657 The minimum fraction of overlap between a query and a database record to decide
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2658 on their match. By default, any overlap is regarded as a match, but this may not
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2659 work best when the query consists of large copy number variants.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2660
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2661 =item B<--splicing_threshold>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2662
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2663 distance between splicing variants and exon/intron boundary, to claim that a
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2664 variant is a splicing variant. By default, 2bp is used. ANNOVAR is relatively
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2665 more stringent than some other software to claim variant as regulating splicing.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2666 In addition, if a variant is an exonic variant, it will not be reported as
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2667 splicing variant even if it is within 2bp to an exon/intron boundary.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2668
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2669 =item B<--maf_threshold>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2670
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2671 the minor allele frequency (MAF) threshold to be used in the filter-based
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2672 annotation for the 1000 Genomes Project databases. By default, any variant
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2673 annotated in the 1000G will be used in filtering.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2674
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2675 =item B<--memfree>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2676
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2677 the minimum amount of free system memory that ANNOVAR should ensure to have. By
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2678 default, if ANNOVAR takes too much memory such that only 100Mb system memory is
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2679 available, ANNOVAR will stop reading annotation database into memory, and will
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2680 start annotation procedure, and then clear the memory, and then read the next
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2681 block of annotation database again. This argument ensures that ANNOVAR will not
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2682 attempt to use virtual memory in the system (which makes ANNOVAR extremely
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2683 slow).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2684
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2685 =item B<--memtotal>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2686
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2687 the total amount of memory that ANNOVAR should use at most. By default, this
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2688 value is zero, meaning that there is no limit on that. Decreasing this threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2689 reduces the memory requirement by ANNOVAR, but may increase the execution time.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2690
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2691 =item B<--chromosome>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2692
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2693 examine these specific chromosomes in database file. The argument takes comma-
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2694 delimited values, and the dash can be correctly recognized. For example, 5-10,X
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2695 represents chromosome 5 through chromosome 10 plus chromosome X.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2696
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2697 =back
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2698
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2699 =head1 DESCRIPTION
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2700
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2701 ANNOVAR is a software tool that can be used to functionally annotate a list of genetic variants,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2702 possibly generated from next-generation sequencing experiments. For example,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2703 given a whole-genome resequencing data set for a human with specific diseases,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2704 typically around 3 million SNPs and around half million insertions/deletions
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2705 will be identified. Given this massive amount of data (and candidate disease-
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2706 causing variants), it is necessary to have a fast algorithm that scans the data
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2707 and identifies a prioritized subset of variants that are most likely functional
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2708 for follow-up Sanger sequencing studies and functional assays.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2709
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2710 Currently, these various types of functional annotations produced by ANNOVAR can
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2711 be (1) gene-based annotations (the default behavior), such as exonic variants,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2712 intronic variants, intergenic variants, downstream variants, UTR variants,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2713 splicing site variants, etc. For exonic variants, ANNOVAR will try to predict
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2714 whether each of the variants is non-synonymous SNV, synonymous SNV,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2715 frameshifting change, nonframeshifting change. (2) region-based annotation, to
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2716 identify whether a given variant overlaps with a specific type of genomic
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2717 region, for example, predicted transcription factor binding site or predicted
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2718 microRNAs. (3) filter-based annotation, to filter a list of variants so that only
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2719 those not observed in variation databases (such as 1000 Genomes Project and
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2720 dbSNP) are printed out.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2721
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2722 Currently, I am expanding the functionality of ANNOVAR on (1) Fusion gene
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2723 detection from large deletions, where a deletion joins the reading frame of two
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2724 genes (same orientation of transcription) together to create a new gene. (2)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2725 Assignment of functional importance score to each observed mutation in the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2726 genome. This will be extremely important for the development of association
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2727 tests for rare variants, and for prioritization of variants in downstream
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2728 functional studies after successful genome-wide association studies (GWAS).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2729
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2730 =over 8
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2731
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2732 =item * B<variant file format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2733
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2734 A sample variant file contains one variant per line, with the fields being chr,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2735 start, end, reference allele, observed allele, other information. The other
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2736 information can be anything (for example, it may contain sample identifiers for
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2737 the corresponding variant.) An example is shown below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2738
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2739 16 49303427 49303427 C T rs2066844 R702W (NOD2)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2740 16 49314041 49314041 G C rs2066845 G908R (NOD2)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2741 16 49321279 49321279 - C rs2066847 c.3016_3017insC (NOD2)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2742 16 49290897 49290897 C T rs9999999 intronic (NOD2)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2743 16 49288500 49288500 A T rs8888888 intergenic (NOD2)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2744 16 49288552 49288552 T - rs7777777 UTR5 (NOD2)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2745 18 56190256 56190256 C T rs2229616 V103I (MC4R)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2746
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2747 =item * B<database file format: UCSC Genome Browser annotation database>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2748
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2749 Most but not all of the gene annotation databases are directly downloaded from
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2750 UCSC Genome Browser, so the file format is identical to what was used by the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2751 genome browser. The users can check Table Browser (for example, human hg18 table
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2752 browser is at http://www.genome.ucsc.edu/cgi-bin/hgTables?org=Human&db=hg18) to
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2753 see what fields are available in the annotation file. Note that even for the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2754 same species (such as humans), the file format might be different between
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2755 different genome builds (such as between hg16, hg17 and hg18). ANNOVAR will try
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2756 to be smart about guessing file format, based on the combination of the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2757 --buildver argument and the number of columns in the input file. In general, the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2758 database file format should not be something that users need to worry about.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2759
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2760 =item * B<database file format: GFF3 format for gene-based annotations>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2761
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2762 As of June 2010, ANNOVAR cannot perform gene-based annotations using GFF3 input
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2763 files, and any annotation on GFF3 is region-based. However, this is expected to
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2764 be changed in the future.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2765
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2766 =item * B<database file format: GFF3 format for region-based annotations>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2767
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2768 Currently, region-based annotations can support the Generic Feature Format
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2769 version 3 (GFF3) formatted files. The GFF3 has become the de facto gold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2770 standard for many model organism databases, such that many users may want to
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2771 take a custom annotation database and run ANNOVAR on them, and it would be the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2772 most convenient if the custom file is made with GFF3 format.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2773
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2774 =item * B<database file format: generic format for filter-based annotations>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2775
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2776 The 'generic' format is designed for filter-based annotation that looks for
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2777 exact variants. The format is almost identical to the ANNOVAR input format, with
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2778 chr, start, end, reference allele, observed allele and scores (higher scores are
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2779 regarded as better).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2780
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2781 =item * B<database file format: VCF format for filter-based annotations>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2782
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2783 The 1000 Genomes Project now provides their variant annotations in VCF format, so
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2784 I implemented the functionality to directly interrogate VCF files. A VCF file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2785 may contain summary information for variants (for example, this variant has MAF
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2786 of 5% in this population), or it may contain the actual variant calls for each
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2787 individual in a specific population. As of March 2010, the files from 1000G website
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2788 only contain the first type of information (that is, alleles and their
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2789 frequencies in population). For the purpose of simplicity, ANNOVAR only
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2790 interrogates the first type of information.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2791
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2792 =item * B<database file format: avsift for filter-based annotations>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2793
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2794 avsift refers to a file that ANNOVAR developers compiled for fast annotation of
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2795 SIFT scores for non-synonymous variants in the human genome. It conforms to the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2796 generic format described above. However, users can directly specify '--dbtype
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2797 avsift' in command line to perform avsift annotations, making it more convenient
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2798 for users. Alternatively, users can use '--dbtype generic -genericdbfile
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2799 hg18_avsift.txt' for the annotation, and the effects are usually the same.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2800
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2801 =item * B<sequence file format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2802
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2803 ANNOVAR can directly examine FASTA-formatted sequence files. For mRNA sequences,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2804 the names of the sequences are the mRNA identifiers. For genomic sequences, the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2805 names of the sequences in the files are usually chr1, chr2, chr3, etc, so that
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2806 ANNOVAR knows which sequence corresponds to which chromosome. Unfortunately,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2807 UCSC uses things like chr6_random to annotate un-assembled sequences, as opposed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2808 to using the actual contig identifiers. This causes some issues (depending on
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2809 how read alignment algorithms work), but in general should not be something
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2810 that users need to worry about. If the users absolutely care about the exact
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2811 contigs rather than chr*_random, then they will need to re-align the short reads
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2812 at chr*_random to a different FASTA file that contains the contigs, and then
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2813 execute ANNOVAR on the newly identified variants.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2814
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2815 =item * B<invalid input>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2816
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2817 If the query file contains input lines with invalid format, ANNOVAR will skip
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2818 such lines and continue with the annotation on the next lines. These invalid input
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2819 lines will be written to a file with suffix invalid_input. Users should manually
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2820 examine this file and identify sources of error.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2821
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2822 =back
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2823
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2824 ANNOVAR is freely available to the academic community for non-commercial use. For
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2825 questions or comments, please contact kai@openbioinformatics.org.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2826
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2827 =cut