Mercurial > repos > iarc > mutspec
diff mutspecAnnot.pl @ 7:eda59b985b1c draft default tip
Uploaded
author | iarc |
---|---|
date | Mon, 13 Mar 2017 08:21:19 -0400 |
parents | 46a10309dfe2 |
children |
line wrap: on
line diff
--- a/mutspecAnnot.pl Tue Jun 28 02:59:32 2016 -0400 +++ b/mutspecAnnot.pl Mon Mar 13 08:21:19 2017 -0400 @@ -1,1235 +1,1327 @@ -#!/usr/bin/env perl - -#-----------------------------------# -# Author: Maude # -# Script: mutspecAnnot.pl # -# Last update: 21/06/16 # -#-----------------------------------# - -use strict; -use warnings; -use Getopt::Long; -use Pod::Usage; -use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); -use File::Path; -use Parallel::ForkManager; - - -our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. -our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files -our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context. -our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines) - -GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp) or pod2usage(2); - -our ($input) = @ARGV; - -pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); -pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); -pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script -pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input) - - - -###################################################################################################################################################### -# GLOBAL VARIABLES # -###################################################################################################################################################### - -######################################### -### SPECIFY THE NUMBER OF CPU ### -######################################### -our $max_cpu = 1; # Max number of CPU to use for the annotation - - -# Recover the current path -our $pwd = `pwd`; -chomp($pwd); - -# Input file path -our @pathInput = split("/", $input); -# Output directories -our ($folderMutAnalysis, $folderAnnovar) = ("", ""); -# File with the list of Annovar databases to use -our $listAVDB = ""; -# Initialisation of chromosome, position, ref and alt values -our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a"); - - -###################################################################################################################################################### -# MAIN # -###################################################################################################################################################### -## Check the presence of the flags and create the output and temp directories -CheckFlags(); - -## Format the file in the correct format if they are vcf or MuTect output and recover the column positions -FormatingInputFile(); - -# Annotate the file with Annovar, add the strand orientation and the sequence context -FullAnnotation(); - -###################################################################################################################################################### -# FUNCTIONS # -###################################################################################################################################################### - -## Check the presence of the flags and create the output and temp directories -sub CheckFlags -{ - # Check the reference genome - if($refGenome eq "empty") { print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; } - if($intervalEnd eq "empty") { print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; exit; } - # If no output is specified write the result as the same place as the input file - if($output eq "empty") - { - my $folderRes = ""; - for(my $i=0; $i<$#pathInput; $i++) { $folderRes .= "$pathInput[$i]/"; } - - $folderMutAnalysis = "$folderRes/Mutational_Analysis"; - if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } - } - else - { - if(!-e $output) { mkdir($output) or die "$!: $output\n"; } - - $folderMutAnalysis = "$output/Mutational_Analysis"; - if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } - } - # Create the output folder for Annovar - $folderAnnovar = "$folderMutAnalysis/Annovar"; - if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; } - - # Verify the access to Annovar databases - if($path_AVDB eq "empty") { print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; exit; } - elsif(!-e $path_AVDB) { print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; exit; } - - # Check the file list AV DB - if($pathAVDBList eq "empty") { print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; exit; } - else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" } - - # If no temp folder is specified write the result in the current path - if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$pathInput[$#pathInput]"; } - if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } -} - -## Format the file in the correct format if they are vcf or MuTect output and recover the column positions -sub FormatingInputFile -{ - # The input is a folder - if(-d $input) - { - foreach my $file (`ls $input`) - { - my $headerOriginalFile = ""; - chomp($file); - my ($filename, $directories, $suffix) = fileparse("$input/$file", qr/\.[^.]*/); - - CheckLengthFilename("$input/$file"); - - ################################################# - ### Recover the input format ### - ################################################# - RecoverInputFormat("$input/$file", \$headerOriginalFile); - } - } - # The input is one file - else - { - my $headerOriginalFile = ""; - - CheckLengthFilename($input); - my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); - - ################################################# - ### Recover the input format ### - ################################################# - RecoverInputFormat($input, \$headerOriginalFile); - } -} - -# The name for the Excel sheet can't be longer than 31 characters -sub CheckLengthFilename -{ - my ($inputFile) = @_; - - ## Verify the name of file, must be <= 31 chars for the sheet name - my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); - - if(length($filename) > 32) { print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; exit; } -} - -# Recover the input format (vcf or txt) and depending on the format convert the input file in a suitable format for Annovar (ex: for MuTect files keep only the confident variants) -sub RecoverInputFormat -{ - my ($file, $refS_headerOriginalFile) = @_; - - my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); - - my $inputFormat = ""; - - open(F1, $file) or die "$!: $file\n"; - my $header = <F1>; - close F1; - - ### VCF and MuTect files have in their first line the type of the file - if($header =~ /fileformat=VCF/i) { $inputFormat = "vcf"; } - elsif($header =~ /mutect/i) { $inputFormat = "mutect"; } - else { $inputFormat = "unknown"; } - - - ### VCF files - if($inputFormat eq "vcf") - { - ### MuTect2 output VCFs - my $testVC = `grep MuTect2 $file`; - if($testVC =~ /MuTect2/) - { - # Keep only the variants passing MuTect2 filters - `grep PASS $file > $folder_temp/$filename-PASS.txt`; - - # Recover the header - $$refS_headerOriginalFile = `grep '#CHROM' $file`; - - # Add the header - # Sed command doesn't work... sed 's/^/some text\n/' file > res - open(OUT, ">", "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n"; - print OUT $$refS_headerOriginalFile; - open(F1, "$folder_temp/$filename-PASS.txt") or die "$!: $folder_temp/$filename-PASS.txt\n"; - while(<F1>) { print OUT $_; } - close F1; close OUT; - - `rm $folder_temp/$filename-PASS.txt`; - - # Check if there if no empty column - CheckEmptyColumn("$folder_temp/$filename-KEEP.txt"); - `rm $folder_temp/$filename-KEEP.txt`; - - # Set the col number for the chr,start,ref and alt - ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4); - } - else - { - open(F1, $file) or die "$!: $file\n"; - open(OUT, ">", "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n"; - while (<F1>) - { - $_ =~ s/[\r\n]+$//; - my @tab = split("\t", $_); - # Print the VCF header - if($tab[0] eq "#CHROM") - { - $tab[0] =~ /#(.+)/; - print OUT "$1"; - for(my $i=1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } - print OUT "\n"; - } - elsif($tab[0] !~ /##/) - { - # Don't consider chromosome random, GL and MT - if( ($tab[0] =~ /random/) || ($tab[0] =~ /GL/i) ) { next; } - print OUT "$_\n"; - } - } - close F1; close OUT; - - ## Recover the header - open(F1, "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n"; - $$refS_headerOriginalFile = <F1>; - close F1; - - # Check if there if no empty column - CheckEmptyColumn("$folder_temp/$filename.txt"); - `rm $folder_temp/$filename.txt`; - - - # Set the col number for the chr,start,ref and alt - ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4); - } - } - ### MuTect files - elsif($inputFormat eq "mutect") - { - `sed '1d' $file > $folder_temp/$filename-HeaderOK`; - # Keep only the SNVs of good quality - `grep -v REJECT $folder_temp/$filename-HeaderOK > $folder_temp/$filename-KEEP.txt`; - `rm $folder_temp/$filename-HeaderOK`; - - # Recover the header - open(F1, "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n"; - $$refS_headerOriginalFile = <F1>; - close F1; - - # Check if there if no empty column - CheckEmptyColumn("$folder_temp/$filename-KEEP.txt"); - `rm $folder_temp/$filename-KEEP.txt`; - - # Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles. - # Use the dictionary for recover the names and the position of the columns - RecoverColNameAuto("$folder_temp/$filename-KEEPColumnCorrect.txt", $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue); - } - ### Unknown type - else - { - ## Recover the header - open(F1, $file) or die "$!: $file\n"; - $$refS_headerOriginalFile = <F1>; - close F1; - - # Check if there if no empty column - CheckEmptyColumn($file); - - ## Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles. - # Use the dictionary for recover the names and the position of the columns - RecoverColNameAuto($file, $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue); - } -} - -# Some files can have empty column with no information -sub CheckEmptyColumn -{ - my ($inputFile) = @_; - - my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); - - if($filename =~ /(.+)-KEEP/) { $filename = $1; } - - open(OUT, ">", "$folder_temp/$filename-ColumnCorrect.txt") or die "$!: $folder_temp/$filename-ColumnCorrect.txt\n"; - - open(F1, $inputFile) or die "$!: $inputFile\n"; - my $header = <F1>; $header =~ s/[\r\n]+$//; my @tabHeader = split("\t", $header); - print OUT $header, "\n"; - while(<F1>) - { - $_ =~ s/[\r\n]+$//; - my @tab = split("\t", $_); - - if(scalar(@tab) != scalar(@tabHeader)) - { - print OUT $tab[0]; - for(my $i=1; $i<=$#tabHeader; $i++) - { - if(defined($tab[$i])) { print OUT "\t$tab[$i]"; } - else { print OUT "\tNA"; } - } - print OUT "\n"; - } - else { print OUT "$_\n"; } - } - close F1; close OUT; -} - -# Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. -sub RecoverColNameAuto -{ - our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_; - - $header =~ s/[\r\n]+$//; - - ## Name of the columns - my @mutect = qw(contig position ref_allele alt_allele); - my @vcf = qw(CHROM POS REF ALT); - my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic); - my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele); - my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2); - my @ionTorrent = qw(chr Position Ref Alt); - my @proton = qw(Chrom Position Ref Variant); - my @varScan2 = qw(Chrom Position Ref VarAllele); - my @annovar = qw(Chr Start Ref Obs); - my @custom = qw(Chromosome Start Wild_Type Mutant); - - my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@annovar, \@custom); - my $timer = 0; # For controlling if the names are present on the dictionnary or not - - foreach my $refTab (@allTab) - { - my @tab = @$refTab; - - SearchCol(\@tab); - - # The columns names were find - if( ($$ref_chrValue ne "c") && ($$ref_positionValue ne "s") && ($$ref_refValue ne "r") && ($$ref_altValue ne "a") ) { last; } - # The names of the columns are not present in the dictionnary - else { $timer++; } - } - - if($timer == scalar(@allTab)) - { - print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n"; - print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n"; - exit; - } - - # Extract the number of the column that contain the information - sub SearchCol - { - my ($refTab) = @_; - - my @tabNames = @$refTab; - my @tabHeader = split("\t", $header); - - # For VCF - if($tabHeader[0] eq "#CHROM") { ($$ref_chrValue, $$ref_positionValue, $$ref_refValue, $$ref_altValue) = (0, 1, 3, 4); } - # For tabular files - else - { - for(my $i=0; $i<=$#tabNames; $i++) - { - for(my $j=0; $j<=$#tabHeader; $j++) - { - if($tabHeader[$j] eq $tabNames[$i]) - { - if($i == 0) { $$ref_chrValue = $j; } - if($i == 1) { $$ref_positionValue = $j; } - if($i == 2) { $$ref_refValue = $j; } - if($i == 3) { $$ref_altValue = $j; } - last; # Once find pass to the next name - } - } - } - } - } -} - -# Annotate the file with Annovar, add the strand orientation and the sequence context -sub FullAnnotation -{ - print "-----------------------------------------------------------------\n"; - print "---------------------------Annotation----------------------------\n"; - print "-----------------------------------------------------------------\n"; - - - # If the input is a folder - if(-d $input) - { - foreach my $file (`ls $folder_temp/*.txt`) - { - chomp($file); - - # For recover the name of the file without extension, the directory where the file is and the extension of the file - my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/); - my $filenameOK = ""; - # For removing the ColumnCorrect for txt files - if($filename =~ /(.+)-ColumnCorrect/) - { - if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; } - else { $filenameOK = $1; } - } - else { print STDERR "Case not considered for $filename!!!\n"; exit; } - - - ################################################# - ### Cut the files in n part ### - ################################################# - # Recover the number of variants in the file for deciding the number of CPU to use - my $cpu = 0; - my $nbVariants = `wc -l $file`; - $nbVariants =~ /(\d+).+/; - - if($1-1 <= 5000) { $cpu = 1; } - elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; } - elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; } - else { $cpu = $max_cpu; } - - # If the number predefined can't be used on the machine use the maximum number specify by the administrator - if($cpu > $max_cpu) { $cpu = $max_cpu } - - ## Recover the header - open(F1, $file) or die "$!: $file\n"; - my $headerOriginalFile = <F1>; - close F1; - - ## Remove the first line of the file - my $fileNoHeader = "$folder_temp/${filenameOK}-NoHeader"; - `sed 1d $file > $fileNoHeader`; - - if(!-e "$folder_temp/$filenameOK") { mkdir("$folder_temp/$filenameOK") or die "Can't create the directory $folder_temp/$filenameOK\n"; } - my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0 - `split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameOK/$filenameOK-`; - - if($headerOriginalFile eq "") { print STDERR "No header for the file $file!!!\nPlease check the format of your file\n"; exit; } - my @files = <$folder_temp/$filenameOK/$filenameOK-*>; - - ################################################# - ### Annotate the n part ### - ################################################# - my $pm = Parallel::ForkManager->new($cpu); - - foreach my $tempFile (@files) - { - # Forks and returns the pid for the child: - my $pid = $pm->start and next; - - # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo - my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\-[^.]*/); - my $outFilenameTemp = $filename.$suffix; - Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); - - # Annotate the file with Annovar - my $tempFileName_AVOutput = $filename.$suffix.".".${refGenome}."_multianno.txt"; - if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } - else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } - - # Check if the annotations worked - open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; - while(<F1>) - { - if($_ =~ /ERROR/i) - { - print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; - print STDERR $_; - print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; - exit; - } - } - close F1; - - # Recover the strand orientation - my $length_AVheader = 0; - RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); - - # Recover the sequence context - RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameOK/$outFilenameTemp".".".${refGenome}."_multianno.txt"); - - $pm->finish; # Terminates the child process - } - # Wait all the child process - $pm->wait_all_children; - - # Paste the file together - CombinedTempFile("$folder_temp/$filenameOK", "$folderAnnovar/$filenameOK".".".${refGenome}."_multianno.txt"); - } - } - # The input file is one file - else - { - my ($filenameO, $directoriesO, $suffixO) = fileparse($input, qr/\.[^.]*/); - - ################################################# - ### Cut the files in n part ### - ################################################# - # Recover the number of variants in the file for deciding the number of CPU to use - my $cpu = 0; - my $nbVariants = `wc -l $folder_temp/$filenameO-ColumnCorrect.txt`; - $nbVariants =~ /(\d+).+/; - - if($1-1 <= 5000) { $cpu = 1; } - elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; } - elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; } - else { $cpu = $max_cpu; } - - # If the number predefined can't be used on the machine use the maximum number specify by the administrator - if($cpu > $max_cpu) { $cpu = $max_cpu } - - ## Recover the header - open(F1, "$folder_temp/$filenameO-ColumnCorrect.txt") or die "$!: $folder_temp/$filenameO-ColumnCorrect.txt\n"; - my $headerOriginalFile = <F1>; - close F1; - - ## Remove the first line of the file - my $fileNoHeader = "$folder_temp/$filenameO-NoHeader"; - `sed 1d $folder_temp/$filenameO-ColumnCorrect.txt > $fileNoHeader`; - - if(!-e "$folder_temp/$filenameO") { mkdir("$folder_temp/$filenameO") or die "Can't create the directory $folder_temp/$filenameO\n"; } - my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0 - `split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameO/$filenameO-`; - - if($headerOriginalFile eq "") { print STDERR "No header for the file $input!!!\nPlease check the format of your file\n"; exit; } - my @files = <$folder_temp/$filenameO/$filenameO-*>; - - ################################################# - ### Annotate the n part ### - ################################################# - my $pm = Parallel::ForkManager->new($cpu); - foreach my $tempFile (@files) - { - # Forks and returns the pid for the child: - my $pid = $pm->start and next; - - # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo - # For recover the name of the file without extension, the directory were the file is and the extension of the file - my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\.[^.]*/); - my $outFilenameTemp = $filename.$suffix; - Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); - - # Annotate the file with Annovar - my $tempFileName_AVOutput = $outFilenameTemp.".".${refGenome}."_multianno.txt"; - if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } - else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } - - # Check if the annotations worked - open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; - while(<F1>) - { - if($_ =~ /ERROR/i) - { - print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; - print STDERR $_; - print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; - exit; - } - } - close F1; - - # Recover the strand orientation - my $length_AVheader = 0; - RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); - - # Recover the sequence context - RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameO/$tempFileName_AVOutput"); - - $pm->finish; # Terminates the child process - } - # Wait all the child process - $pm->wait_all_children; - - # Paste the file together - CombinedTempFile("$folder_temp/$filenameO", "$folderAnnovar/$filenameO".".".${refGenome}."_multianno.txt"); - } - # Remove the temporary directory - rmtree($folder_temp); -} - -sub Convert2AV -{ - my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_; - - my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); - - open(F1, $inputFile) or die "$!: $inputFile\n"; - - open(OUT, ">", $output) or die "$!: $output\n"; - while(<F1>) - { - $_ =~ s/[\r\n]+$//; - my @tab = split("\t", $_); - my $chr = ""; - - # Don't consider chrM and GL - if($tab[$chr_value] =~ /M|GL/i) { next; } - - # Replace chr23 or chr24 by X or Y - if($tab[$chr_value] =~ /23/) { $chr = "chrX"; } - elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; } - elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; } - else { $chr = "chr".$tab[$chr_value]; } - - ### Reformat the Indels for Annovar - # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10) - # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10) - if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) ) - { - ### First check if the indels in the file are not already correctly formated - if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) - { - # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels) - # Insertion: start = start & end = start - if($tab[$ref_value] =~ /\-/) - { - print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]"; - } - ## Deletion: start = start & end = start + length(del) -1 - else - { - my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1); - print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]"; - } - } - ### Indels not correctly formated for Annovar - else - { - my @tabRef = split("", $tab[$ref_value]); - my @tabAlt = split("", $tab[$alt_value]); - - # Remove the first base - my $ref2 = join("", @tabRef[1 .. $#tabRef]); - my $alt2 = join("", @tabAlt[1 .. $#tabAlt]); - - if(length($alt2) == 0) - { - my $altOK = "-"; - my $startOK = $tab[$start_value] + 1; - my $stopOK = $startOK + length($ref2) - 1; - print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK; - } - - if(length($ref2) == 0) - { - my $refOK = "-"; - print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2; - } - } - } - ### SBS - else - { - print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value]; - } - - ## Print the original file at the end - foreach (@tab) { print OUT "\t$_"; } - print OUT "\n"; - } - close F1; close OUT; -} - -sub AnnotateAV -{ - my ($inputFile, $output) = @_; - - if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; } - - # Extract the name of the databases - my $protocol = ""; my $operation = ""; - ExtractAVDBName($listAVDB, \$protocol, \$operation); - - `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; - - sub ExtractAVDBName - { - my ($listAVDB, $refS_protocol, $refS_operation) = @_; - - open(F1, $listAVDB) or die "$!: $listAVDB\n"; - while(<F1>) - { - if ($_ =~ /^#/) { next; } - - $_ =~ s/[\r\n]+$//; - my @tab = split("\t", $_); - - # db name like refGenome_dbName.txt - if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /ljb26/) ) - { - $$refS_protocol .= $1.","; $$refS_operation .= $tab[1].","; - } - # 1000 genome - if($tab[0] =~ /sites/) - { - $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/; - my ($dbName, $year, $month) = ($1, $2, $3); - $dbName =~ tr/A-Z/a-z/; - - # convert the month number into the month name - ConvertMonth(\$month); - - my $AVdbName_final = "1000g".$year.$month."_".$dbName; - $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].","; - } - # ESP - if( ($tab[0] =~ /esp/) || ($tab[0] =~ /ljb26/) ) - { - $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; - my $AVdbName_final = $1."_".$2; - $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].","; - } - } - close F1; - - sub ConvertMonth - { - my ($refS_month) = @_; - - if($$refS_month == 1) { $$refS_month = "janv"; } - elsif($$refS_month == 2) { $$refS_month = "feb"; } - elsif($$refS_month == 3) { $$refS_month = "mar"; } - elsif($$refS_month == 4) { $$refS_month = "apr"; } - elsif($$refS_month == 5) { $$refS_month = "may"; } - elsif($$refS_month == 6) { $$refS_month = "jun"; } - elsif($$refS_month == 7) { $$refS_month = "jul"; } - elsif($$refS_month == 8) { $$refS_month = "aug"; } - elsif($$refS_month == 9) { $$refS_month = "sept"; } - elsif($$refS_month == 10) { $$refS_month = "oct"; } - elsif($$refS_month == 11) { $$refS_month = "nov"; } - elsif($$refS_month == 12) { $$refS_month = "dec"; } - else { print STDERR "Month number don't considered\n"; exit; } - } - } -} - -### Add the minimum of annotations (refGene + strand + context) -sub annotateAV_min -{ - my ($inputFile, $output) = @_; - - if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; } - - # Extract the name of the databases - my ($protocol, $operation) = ("refGene", "g"); - - `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; -} - -sub RecoverStrand -{ - my ($input, $headerOriginalFile, $pathDB, $refGenome, $output, $refS_lengthAVheader) = @_; - - my ($chr_value, $start_value, $ref_value, $alt_value, $func_value, $geneSymbol_value) = ("", "", "", "", "", "", "", ""); - - $chr_value = recoverNumCol($input, "Chr"); - $start_value = recoverNumCol($input, "Start"); - $ref_value = recoverNumCol($input, "Ref"); - $alt_value = recoverNumCol($input, "Alt"); - $func_value = recoverNumCol($input, "Func.refGene"); - $geneSymbol_value = recoverNumCol($input, "Gene.refGene"); - - #################### Convert the input file into a hash table - my %h_inputFile = (); - open(F1, $input) or die "$!: $input\n"; - my $annovar_header = <F1>; - - while(<F1>) - { - $_ =~ s/[\r\n]+$//; - my @tab = split("\t", $_); - - # In COSMIC the chromosome X and Y are annotated 23 and 24 - my $chr = ""; - if($tab[$chr_value] eq "chr23") { $chr = "chrX"; } - elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; } - elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; } - else { $chr = $tab[$chr_value]; } - - # Verify if the element exists - if($chr eq "") { print "Error RecoverStrand: The chromosome value is nor defined for $_\n"; exit; } - if(! exists $tab[$start_value]) { print "Error RecoverStrand: The start value is nor defined for $_\n"; exit; } - if(! exists $tab[$ref_value]) { print "Error RecoverStrand: The reference value is nor defined for $_\n"; exit; } - if(! exists $tab[$alt_value]) { print "Error RecoverStrand: The alternate value is nor defined for $_\n"; exit; } - if(! exists $tab[$func_value]) { print "Error RecoverStrand: The functional value is nor defined for $_\n"; exit; } - if(! exists $tab[$geneSymbol_value]) { print "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; exit; } - - my $geneSymbol = ""; - ######## For the splicing annotation we separate the gene symbol from the aa change - if($tab[$func_value] eq "splicing") - { - if($tab[$geneSymbol_value] =~ /(.+)\((.+)\)/) { $geneSymbol = $1; } - else { $geneSymbol = $tab[$geneSymbol_value]; } - } - else { $geneSymbol = $tab[$geneSymbol_value]; } - - push(@{$h_inputFile{"$chr:$tab[$start_value]:$tab[$start_value]:$tab[$ref_value]:$tab[$alt_value]:$geneSymbol"}}, $_); - } - close F1; - - # print "\t\tRecoverStrand: $input\n"; - - #################### Convert the database file into a hash table - my %h_database = (); - my ($db_geneSymbol_value, $db_strandInfo_value, $db_chr_value) = (12, 3, 2); - - my $folderNameDB = $refGenome."db"; - my $fileNameDB = $refGenome."_refGene.txt"; - - open(F1, "$pathDB/$fileNameDB") or die "$!: $pathDB/$fileNameDB\n"; - while(<F1>) - { - $_ =~ s/[\r\n]+$//; - my @tab = split("\t", $_); - my $strand = ""; - $strand = $tab[$db_strandInfo_value]; - if($strand eq "") { print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; exit; } - else - { - # Some genes have several strand orientation, keep the first in the database - if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; } - } - } - close F1; - - #################### Parse the two hash tables for recover the strand information - open(OUT, ">", $output) or die "$!: $output\n"; - - - ## Add the header only for the firts part of the files - if($input =~ /\-aa/) - { - my @tabHeaderInput = ""; - $annovar_header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $annovar_header); - # Save the length of the Annovar header for the next function (RecoverGenomicSequence) - $$refS_lengthAVheader = $#tabHeaderInput; - - # Print the Annovar header until the column before OtherInfo - print OUT "$tabHeaderInput[0]"; - for(my $i=1; $i<$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } - print OUT "\tStrand"; - print OUT "\t",$headerOriginalFile; - } - - - # Timer for comparing the number of SNVs present in the hash table - my $timerUniqueSNVs = 0; - # Timer for comparing the number of SNVs with the strand orientation - my $timerSNVsStrand = 0; - - foreach my $kFile (sort keys %h_inputFile) - { - my $test = 0; - my @tab = split(":", $kFile); - - # Sometimes the line is not printed correctely !!!!! :@ - my @tHeaderInput = split("\t", $annovar_header); my @lengthLine = split("\t", $h_inputFile{$kFile}[0]); - my @tHeaderOriginalFile = split("\t", $headerOriginalFile); - my $lengthHeader = @tHeaderInput + (scalar(@tHeaderOriginalFile)-1) ; my $lengthLine = @lengthLine; - - # Save the length of the Annovar header for the next function (RecoverGenomicSequence) - $$refS_lengthAVheader = $#tHeaderInput; - - foreach my $kDB (sort keys %h_database) - { - if("$tab[5]:$tab[0]" eq $kDB) - { - if($lengthHeader != $lengthLine) { print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; exit; } - - foreach my $line (@{$h_inputFile{$kFile}}) - { - my @tab = split("\t", $line); - my $j = 0; - - for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i } - print OUT $h_database{$kDB}; - for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } - print OUT "\n"; - } - $timerSNVsStrand++; - $test = 1; last; - } - } - # The strand orientation isn't defined - if($test == 0) - { - my @tHeaderInput = split("\t", $annovar_header); - foreach my $line (@{$h_inputFile{$kFile}}) - { - my @tab = split("\t", $line); - my $j = 0; - for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i } - print OUT "NA"; - for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } - print OUT "\n"; - } - $timerSNVsStrand++; - } - $timerUniqueSNVs++; - } - close OUT; - - # print "Strand orientation recover for $timerSNVsStrand SNVs out of $timerUniqueSNVs uniques\n"; -} - -sub RecoverGenomicSequence -{ - my ($inputFile, $length_AVheader, $intervalEnd, $referenceGenome, $pathToRefSeq, $output) = @_; - - ############ 1) Transform the input file in a hash table: one for recover the sequence context and one for keeping the original file - my %h_inputFileForSeqContext = (); my %h_inputFile = (); - my $header = ""; - CreateHashTable_from_InputFile($inputFile, $length_AVheader, \$header, $intervalEnd, \%h_inputFileForSeqContext, \%h_inputFile); - - sub CreateHashTable_from_InputFile - { - my ($input, $length_AVheader, $refS_header, $intervalEnd, $refH_inputFileForSeqContext, $refH_inputFile) = @_; - - my ($chr_value, $start_value, $strand_value) = (0, 1, $length_AVheader); - - my $countregion = 0; - my %allchr = (); - - open(F1, $input) or die "$!: $input\n"; - if($input =~ /\-aa/) { $$refS_header = <F1>; } - - while(<F1>) - { - $_ =~ s/[\r\n]+$//; - my @tab = split("\t", $_); - - my $name = "$tab[$chr_value]:$tab[$start_value]"; - my $start = $tab[$start_value] - $intervalEnd; - my $end = $tab[$start_value] + $intervalEnd; - - $start--; #make zero-start coordinate, to be consistent with UCSC - my $exonpos = "$tab[$chr_value]:$start"; - - push @{$refH_inputFileForSeqContext->{$tab[$chr_value]}}, [$name, $start, $end, $tab[$strand_value], $exonpos]; - push(@{$refH_inputFile->{"$tab[$chr_value]\t$start\t$end"}}, $_); - $countregion++; - $allchr{$tab[$chr_value]}++; - } - close F1; - } - - ############ 2) Extract the sequence context from the hash table - my %h_allRegionSeqContext = (); - my $refSeq = $pathToRefSeq; - Extract_SequenceContext(\%h_inputFileForSeqContext, $referenceGenome, $refSeq, \%h_allRegionSeqContext); - - sub Extract_SequenceContext - { - my ($refH_allRegion, $referenceGenome, $refSeq, $refH_allRegionSeqContext) = @_; - - my $folderDB = $referenceGenome."db"; - my $folderSeq = $referenceGenome."_seq"; - my $seqdir = "$refSeq/$folderSeq"; - - my %seqhash = (); #database sequence for each chromosome - my %name_seq = (); #sequence for each region - my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon - my ($count_success, @failure) = (0); - - for my $curchr (sort keys $refH_allRegion) - { - my ($seqid, $curseq) = ('', ''); - my $fastafile = ""; - if ($curchr =~ m/^chr/) - { - %seqhash = (); #clear the seqhash storage - $fastafile = "$seqdir/$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name - } - else - { - %seqhash = (); #clear the seqhash storage - $fastafile = "$seqdir/chr$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name - } - if (not -e $fastafile) { #to handle cases where no "chr" prefix is given - print "WARNING: the FASTA file $curchr.fa cannot be retrieved from the specified directory $seqdir. Sequences in this chromosome will not be processed\n"; - next; - } - - if (not %seqhash) - { - open (FASTA, $fastafile) or print "WARNING: cannot read from FASTA file $fastafile so sequences in $curchr will not be processed: $!\n" and next; - while (<FASTA>) - { - if (m/^>(\S+)/) - { - $seqid and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it - $seqid = $1; - $curseq = ''; - } - else - { - s/[\r\n]+$//; - $curseq .= uc $_; #only use upper case characters - } - } - close FASTA; - $seqhash{$seqid} = $curseq; - } - if (not $seqhash{$curchr}) - { - #this chromosome just do not have FASTA sequences (maybe users used a wrong seqdir - print "WARNING: Unable to retrieve regions at $curchr due to lack of sequence information\n"; - next; - } - - for my $i (0 .. @{$refH_allRegion->{$curchr}}-1) - { - my ($name, $start, $end, $strand, $exonpos) = @{$refH_allRegion->{$curchr}[$i]}; - my @start = split (/,/, $start); - my @end = split (/,/, $end); - my $seq; - for my $i (0..@start-1) - { - if ($start[$i] >= length ($seqhash{$curchr})) - { - #here there must be an annotation error in user-specified gene/region definition file - print "WARNING: Ignoring the start position start=$start[$i] since it is longer than the $curchr sequence (length=" , length($seqhash{$curchr}), ")\n"; - undef $seq; - last; - } - $seq .= substr ($seqhash{$curchr}, $start[$i], $end[$i]-$start[$i]); - } - - if (defined $seq) - { - if (defined $seqlen{$name}) - { - $seqlen{$name} != length ($seq) and warn "WARNING: the sequence $name was found more than once with different sequence lengths\n"; - $seqlen{$name} != length ($seq) and $discordlen{$name}++; - } - else { $seqlen{$name} = length ($seq); } - - $name_seq{$name, $exonpos} = $seq; - $count_success++; - - # Put the sequence context in a hash table for Write the result after - ## Some sequence context are NNNNNN or empty - if( ($seq ne "NA") && ($seq =~ /N/i) ) { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = "NA"; } - else { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = $seq; } - } - else - { - print "WARNING: DNA sequence for $name cannot be inferred\n"; - push @failure, $name; - } - } - } # End for $curchr - } - - ############ 3) Create a file with the sequence context - WriteFile_SeqContext($inputFile, $length_AVheader, \%h_inputFile, $header, \%h_allRegionSeqContext, $output); - - sub WriteFile_SeqContext - { - my ($inputFile, $length_AVheader, $refH_InputFile, $header, $refH_allRegionSeqContext, $output) = @_; - - open(OUT, ">", $output) or die "$!: $output\n"; - - ## Add the header only for the firts part of the files - if($inputFile =~ /\-aa/) - { - my @tabHeaderInput = ""; - - $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); - # Print the Annovar header until the column before OtherInfo - print OUT "$tabHeaderInput[0]"; - my $j = 0; - for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } - print OUT "\tcontext"; - for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } - print OUT "\n"; - } - - foreach my $k_hFile (sort keys $refH_InputFile) - { - foreach my $k_allRegonSeqContext (sort keys $refH_allRegionSeqContext) - { - if($k_hFile eq $k_allRegonSeqContext) - { - my $j=0; - - for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) - { - my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); - - for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } - print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; - for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } - print OUT "\n"; - } - last; - } - } - } - close OUT; - } -} - -sub CombinedTempFile -{ - my ($folderTempFile, $output) = @_; - - my $cmd_cat_mt_results = "cat "; - - foreach my $file (`ls $folderTempFile/*.txt`) - { - chomp($file); - $cmd_cat_mt_results = $cmd_cat_mt_results." $file"; - } - $cmd_cat_mt_results = $cmd_cat_mt_results." > $output"; - `$cmd_cat_mt_results`; -} - - -sub recoverNumCol -{ - my ($input, $name_of_column) = @_; - - open(F1,$input) or die "recoverNumCol: $!: $input\n"; - # For having the name of the columns - my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); - close F1; - # The number of the column - my $name_of_column_NB = "toto"; - for(my $i=0; $i<=$#tab_search_header; $i++) - { - if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } - } - if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } - else { return $name_of_column_NB; } -} - - - - -=head1 NAME - -mutspec-Annot - -=head1 SYNOPSIS - - mutspecannot.pl [arguments] <query-file> - - <query-file> can be a folder with multiple VCF or a single VCF - - Arguments: - -h, --help print help message - -m, --man print complete documentation - -v, --verbose use verbose output - --refGenome the reference genome to use - --interval <interger> the number of bases for the sequence context - -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file - -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size - --pathAVDBList the path to the list of AV databases installed - -temp --pathTemporary <string> the path for saving the temporary files - --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) - - -Function: automatically run a pipeline on a list of variants and annote them using Annovar - - Example: # Annotation only - mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input - - - Version: 06-2016 (June 2016) - - -=head1 OPTIONS - -=over 8 - -=item B<--help> - -print a brief usage message and detailed explanation of options. - -=item B<--man> - -print the complete manual of the program. - -=item B<--verbose> - -use verbose output. - - -=item B<--refGenome> - -the reference genome to use, could be hg19 or mm9. - -=item B<--interval> - -the number of bases surrounding the mutated bases, for the sequence context analysis. - -=item B<--outfile> - -the directory of output file names. If it is nor specify the same directory as the input file is used. - -=item B<--pathAnnovarDB> - -the path to the directory containing the Annovar databases and the files with the chromosome size. - -=item B<--pathAVDBList> - -the path to a texte file containing the list of the Annovar databases installed. - -=item B<--pathTemporary> - -the path for saving temporary files generated by the script. -If any is specify a temporary folder is created in the same directory where the script is running. -Deleted when the script is finish - -=item B<--fullAnnotation> - -Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines) - -=head1 DESCRIPTION - -MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS. -Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. -A text tab delimited file is produced. - -=cut +#!/usr/bin/env perl + +#-----------------------------------# +# Author: Maude # +# Script: mutspecAnnot.pl # +# Last update: 02/03/17 # +#-----------------------------------# + +use strict; +use warnings; +use Getopt::Long; +use Pod::Usage; +use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); +use File::Path; +use Parallel::ForkManager; + + +our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. +our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files +our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context. +our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines) + + +######################################### +### SPECIFY THE NUMBER OF CPU ### +######################################### +our ($max_cpu) = 8; # Max number of CPU to use for the annotation + + + + +GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp, 'max_cpu=i' => \$max_cpu) or pod2usage(2); + +our ($input) = @ARGV; + +pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); +pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); +pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script +pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input) + + + +###################################################################################################################################################### +# GLOBAL VARIABLES # +###################################################################################################################################################### + +# Recover the current path +our $pwd = `pwd`; +chomp($pwd); +# Recover the filename and the input directory +our ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); +# Output directories +our ($folderMutAnalysis, $folderAnnovar) = ("", ""); +# File with the list of Annovar databases to use +our $listAVDB = ""; +# Initialisation of chromosome, position, ref and alt values +our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a"); + + + +###################################################################################################################################################### +# MAIN # +###################################################################################################################################################### +## Check the presence of the flags and create the output and temp directories +CheckFlags(); + +## Format the file correctly: +## 1) Check the length of the filename (must be <= 31 characters) +## 2) Recover the input format. If MuTect output consider only "KEEP" variants +## 3) Recover the column number for chr, start, ref and alt +FormatingInputFile(); + +# Annotate the file with Annovar, add the strand orientation and the sequence context +FullAnnotation(); + +###################################################################################################################################################### +# FUNCTIONS # +###################################################################################################################################################### + +## Check the presence of the flags and create the output and temp directories +sub CheckFlags +{ + # Check the reference genome + if($refGenome eq "empty") + { + print STDERR "Missing flag !\n"; + print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; + exit; + } + if($intervalEnd eq "empty") + { + print STDERR "Missing flag !\n"; + print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; + exit; + } + # If no output is specified write the result as the same place as the input file + if($output eq "empty") + { + my $directory = dirname( $input ); + + $folderMutAnalysis = "$directory/Mutational_Analysis"; + if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } + } + else + { + if(!-e $output) { mkdir($output) or die "$!: $output\n"; } + + $folderMutAnalysis = "$output/Mutational_Analysis"; + if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } + } + # Create the output folder for Annovar + $folderAnnovar = "$folderMutAnalysis/Annovar"; + if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; } + + # Verify the access to Annovar databases + if($path_AVDB eq "empty") + { + print STDERR "Missing flag !\n"; + print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; + exit; + } + elsif(!-e $path_AVDB) + { + print STDERR "Error message:\n"; + print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; + } + + # Check the file list AV DB + if($pathAVDBList eq "empty") + { + print STDERR "Missing flag !\n"; + print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; + exit; + } + else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" } + + # If no temp folder is specified write the result in the current path + if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; } + if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } + + # Verify listAVDB is not empty + if($listAVDB eq "") + { + print STDERR "Path to the text file containing the list of Annovar databases installed is not specified !!!\n"; + exit; + } +} + +## Format the file in the correct format if they are vcf or MuTect output and recover the column positions +sub FormatingInputFile +{ + # The input is a folder + if(-d $input) + { + foreach my $file (`ls $input`) + { + my $headerOriginalFile = ""; + chomp($file); + + CheckLengthFilename("$input/$file"); + + ################################################# + ### Recover the input format ### + ################################################# + RecoverInputFormat("$input/$file", \$headerOriginalFile); + } + } + # The input is one file + else + { + my $headerOriginalFile = ""; + + CheckLengthFilename($input); + + ################################################# + ### Recover the input format ### + ################################################# + RecoverInputFormat($input, \$headerOriginalFile); + } +} + +# The name for the Excel sheet can't be longer than 31 characters +sub CheckLengthFilename +{ + my ($inputFile) = @_; + + my ($filenameInputFile, $directoriesInputFile, $suffixInputFile) = fileparse($inputFile, qr/\.[^.]*/); + + if(length($filenameInputFile) > 32) + { + print STDERR "Error message:\n"; + print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; + } +} + +# Recover the input format. If MuTect output consider only "KEEP" variants +sub RecoverInputFormat +{ + my ($file, $refS_headerOriginalFile) = @_; + + my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); + + my $inputFormat = ""; + + open(F1, $file) or die "$!: $file\n"; + my $header = <F1>; + close F1; + + ### VCF and MuTect files have in their first line the type of the file + if($header =~ /fileformat=VCF/i) { $inputFormat = "vcf"; } + elsif($header =~ /mutect/i) { $inputFormat = "mutect"; } + else { $inputFormat = "unknown"; } + + + ### VCF files + if($inputFormat eq "vcf") + { + ### MuTect2 output VCFs + my $testVC = `grep MuTect2 $file`; + if($testVC =~ /MuTect2/) + { + # Keep only the variants passing MuTect2 filters + `grep PASS $file > $folder_temp/$filename-PASS.txt`; + + # Recover the header + $$refS_headerOriginalFile = `grep '#CHROM' $file`; + + # Add the header + # Sed command doesn't work... sed 's/^/some text\n/' file > res + open(OUT, ">", "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n"; + print OUT $$refS_headerOriginalFile; + open(F1, "$folder_temp/$filename-PASS.txt") or die "$!: $folder_temp/$filename-PASS.txt\n"; + while(<F1>) { print OUT $_; } + close F1; close OUT; + + `rm $folder_temp/$filename-PASS.txt`; + + # Check if there if no empty column + CheckEmptyColumn("$folder_temp/$filename-KEEP.txt"); + `rm $folder_temp/$filename-KEEP.txt`; + + # Set the col number for the chr,start,ref and alt + ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4); + } + else + { + open(F1, $file) or die "$!: $file\n"; + open(OUT, ">", "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n"; + while (<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + # Print the VCF header + if($tab[0] eq "#CHROM") + { + $tab[0] =~ /#(.+)/; + print OUT "$1"; + for(my $i=1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } + print OUT "\n"; + } + elsif($tab[0] !~ /##/) + { + # Don't consider chromosome random, GL and MT + if( ($tab[0] =~ /random/) || ($tab[0] =~ /GL/i) ) { next; } + print OUT "$_\n"; + } + } + close F1; close OUT; + + ## Recover the header + open(F1, "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n"; + $$refS_headerOriginalFile = <F1>; + close F1; + + # Check if there if no empty column + CheckEmptyColumn("$folder_temp/$filename.txt"); + `rm $folder_temp/$filename.txt`; + + + # Set the col number for the chr,start,ref and alt + ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4); + } + } + ### MuTect files + elsif($inputFormat eq "mutect") + { + `sed '1d' $file > $folder_temp/$filename-HeaderOK`; + # Keep only the SNVs of good quality + `grep -v REJECT $folder_temp/$filename-HeaderOK > $folder_temp/$filename-KEEP.txt`; + `rm $folder_temp/$filename-HeaderOK`; + + # Recover the header + open(F1, "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n"; + $$refS_headerOriginalFile = <F1>; + close F1; + + # Check if there if no empty column + CheckEmptyColumn("$folder_temp/$filename-KEEP.txt"); + `rm $folder_temp/$filename-KEEP.txt`; + + # Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles. + # Use the dictionary for recover the names and the position of the columns + RecoverColNameAuto("$folder_temp/$filename-KEEPColumnCorrect.txt", $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue); + } + ### Unknown type + else + { + ## Recover the header + open(F1, $file) or die "$!: $file\n"; + $$refS_headerOriginalFile = <F1>; + close F1; + + # Check if there if no empty column + CheckEmptyColumn($file); + + ## Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles. + # Use the dictionary for recover the names and the position of the columns + RecoverColNameAuto($file, $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue); + } +} + +# Some files can have empty column with no information +sub CheckEmptyColumn +{ + my ($inputFile) = @_; + + my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); + + if($filename =~ /(.+)-KEEP/) { $filename = $1; } + + open(OUT, ">", "$folder_temp/$filename-ColumnCorrect.txt") or die "$!: $folder_temp/$filename-ColumnCorrect.txt\n"; + + open(F1, $inputFile) or die "$!: $inputFile\n"; + my $header = <F1>; $header =~ s/[\r\n]+$//; my @tabHeader = split("\t", $header); + print OUT $header, "\n"; + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + if(scalar(@tab) != scalar(@tabHeader)) + { + print OUT $tab[0]; + for(my $i=1; $i<=$#tabHeader; $i++) + { + if(defined($tab[$i])) { print OUT "\t$tab[$i]"; } + else { print OUT "\tNA"; } + } + print OUT "\n"; + } + else { print OUT "$_\n"; } + } + close F1; close OUT; +} + +# Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. +sub RecoverColNameAuto +{ + our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_; + + $header =~ s/[\r\n]+$//; + + ## Name of the columns + my @mutect = qw(contig position ref_allele alt_allele); + my @vcf = qw(CHROM POS REF ALT); + my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic); + my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele); + my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2); + my @ionTorrent = qw(chr Position Ref Alt); + my @proton = qw(Chrom Position Ref Variant); + my @varScan2 = qw(Chrom Position Ref VarAllele); + my @varScan2_somatic = qw(chrom position ref var); + my @annovar = qw(Chr Start Ref Obs); + my @custom = qw(Chromosome Start Wild_Type Mutant); + + my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@varScan2_somatic, \@annovar, \@custom); + my $timer = 0; # For controlling if the names are present on the dictionnary or not + + foreach my $refTab (@allTab) + { + my @tab = @$refTab; + + SearchCol(\@tab); + + # The columns names were find + if( ($$ref_chrValue ne "c") && ($$ref_positionValue ne "s") && ($$ref_refValue ne "r") && ($$ref_altValue ne "a") ) { last; } + # The names of the columns are not present in the dictionnary + else { $timer++; } + } + + if($timer == scalar(@allTab)) + { + print STDERR "Error message:\n"; + print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n"; + print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n"; + exit; + } + + # Extract the number of the column that contain the information + sub SearchCol + { + my ($refTab) = @_; + + my @tabNames = @$refTab; + my @tabHeader = split("\t", $header); + + # For VCF + if($tabHeader[0] eq "#CHROM") { ($$ref_chrValue, $$ref_positionValue, $$ref_refValue, $$ref_altValue) = (0, 1, 3, 4); } + # For tabular files + else + { + for(my $i=0; $i<=$#tabNames; $i++) + { + for(my $j=0; $j<=$#tabHeader; $j++) + { + if($tabHeader[$j] eq $tabNames[$i]) + { + if($i == 0) { $$ref_chrValue = $j; } + if($i == 1) { $$ref_positionValue = $j; } + if($i == 2) { $$ref_refValue = $j; } + if($i == 3) { $$ref_altValue = $j; } + last; # Once find pass to the next name + } + } + } + } + } +} + +# Annotate the file with Annovar, add the strand orientation and the sequence context +sub FullAnnotation +{ + print "-----------------------------------------------------------------\n"; + print "---------------------------Annotation----------------------------\n"; + print "-----------------------------------------------------------------\n"; + + + # If the input is a folder + if(-d $input) + { + foreach my $file (`ls $folder_temp/*.txt`) + { + chomp($file); + + # For recover the name of the file without extension, the directory where the file is and the extension of the file + my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/); + my $filenameOK = ""; + # For removing the ColumnCorrect for txt files + if($filename =~ /(.+)-ColumnCorrect/) + { + if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; } + else { $filenameOK = $1; } + } + else { print STDERR "Error message:\n"; print STDERR "Case not considered for $filename!!!\n"; } + + + ################################################# + ### Cut the files in n part ### + ################################################# + # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts + my $cpu = 0; + # Keep the original header + my $headerOriginalFile = ""; + # Save the numer of lines + my $nbLine = 0; + splitInputFile($file, \$cpu, \$headerOriginalFile, $filenameOK, \$nbLine); + + + ################################################# + ### Annotate the n part ### + ################################################# + annotateFile($cpu, $filenameOK, $headerOriginalFile); + + + ################################################# + ### Paste the file together ### + ################################################# + createOutput($filenameOK, $headerOriginalFile, $nbLine); + } + } + # The input file is one file + else + { + ################################################# + ### Cut the files in n part ### + ################################################# + # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts + my $cpu = 0; + # Keep the original header + my $headerOriginalFile = ""; + # Save the numer of lines + my $nbLine = 0; + splitInputFile("$folder_temp/$filename-ColumnCorrect.txt", \$cpu, \$headerOriginalFile, $filename, \$nbLine); + + + ################################################# + ### Annotate the n part ### + ################################################# + annotateFile($cpu, $filename, $headerOriginalFile); + + + ################################################# + ### Paste the file together ### + ################################################# + createOutput($filename, $headerOriginalFile, $nbLine); + } + # Remove the temporary directory + rmtree($folder_temp); +} + + + +sub splitInputFile +{ + my ($inputFile, $ref_cpu, $ref_header, $filename, $ref_nbLine) = @_; + + my $nbVariants = `wc -l $inputFile`; + $nbVariants =~ /(\d+).+/; + $$ref_nbLine = $1; + + if($$ref_nbLine-1 <= 5000) { $$ref_cpu = 1; } + elsif( ($$ref_nbLine-1 > 5000) && ($$ref_nbLine-1 < 25000) ) { $$ref_cpu = 2; } + elsif( ($$ref_nbLine-1 >= 25000) && ($$ref_nbLine-1 < 100000) ) { $$ref_cpu = 8; } + else { $$ref_cpu = $max_cpu; } + + # If the number predefined can't be used on the machine use the maximum number specify by the administrator + if($$ref_cpu > $max_cpu) { $$ref_cpu = $max_cpu } + + + ## Recover the header + open(F1, $inputFile) or die "$!: $inputFile\n"; + $$ref_header= <F1>; + close F1; + + ## Remove the first line of the file + my $fileNoHeader = "$folder_temp/${filename}-NoHeader"; + `sed 1d $inputFile > $fileNoHeader`; + + if(!-e "$folder_temp/$filename") { mkdir("$folder_temp/$filename") or die "Can't create the directory $folder_temp/$filename\n"; } + my $lines_per_temp = int(1+($1 / $$ref_cpu)); # +1 in case of the div == 0 + `split -l $lines_per_temp $fileNoHeader $folder_temp/$filename/$filename-`; + + if($$ref_header eq "") { print STDERR "Error message:\n"; print STDERR "No header for the file $inputFile!!!\nPlease check the format of your file\n"; } +} + +sub annotateFile +{ + my ($cpu, $filename, $headerOriginalFile) = @_; + + my $pm = Parallel::ForkManager->new($cpu); + + foreach my $tempFile (`ls $folder_temp/$filename/$filename-*`) + { + chomp($tempFile); + + # Forks and returns the pid for the child: + my $pid = $pm->start and next; + + # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo + my ($filenameTempFile, $directoriesTempFile, $suffixTempFile) = fileparse($tempFile, qr/\-[^.]*/); + my $outFilenameTemp = $filenameTempFile.$suffixTempFile; + Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); + + # Annotate the file with Annovar + my $tempFileName_AVOutput = $filename.$suffixTempFile.".".${refGenome}."_multianno.txt"; + if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } + else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } + + # Check if the annotations worked + open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; + while(<F1>) + { + if($_ =~ /ERROR/i) + { + print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; + print STDERR $_; + print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; + } + } + close F1; + + # Recover the strand orientation + my $length_AVheader = 0; + RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); + + # Recover the sequence context + RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filename/$outFilenameTemp".".".${refGenome}."_multianno.txt"); + + $pm->finish; # Terminates the child process + } + # Wait all the child process + $pm->wait_all_children; +} + +sub createOutput +{ + my ($filename, $headerOriginalFile, $nbLine) = @_; + + ## For MuTect and MuTect2 calling only variants passing MuTect filters are kept and sometines there is no variant passing these filters making error in Galaxy when using "collection". + if($nbLine == 1) + { + print STDOUT "\nThe sample $filename didn't pass MuTect filters\n"; + + ### Print Annovar minimal header + the original header of the input file + my $outputFile = "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt"; + open(OUT, ">", $outputFile) or die "$!: $outputFile\n"; + + if($fullAVDB eq "no") + { + print OUT "Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tStrand\tcontext"; + print OUT "\t".$headerOriginalFile; + } + ### Print complete Annovar header (using the database name present in the file listAVDB) + the original header of the input file + else + { + print OUT "Chr\tStart\tEnd\tRef\tAlt"; + open(F1, $listAVDB) or die "$!: $listAVDB\n"; + while(<F1>) + { + if($_ =~ /^#/) { next; } + + my @tab = split("\t", $_); + $tab[0] =~ /$refGenome\_(.+)\.txt/; + my $dbName = $1; + + if($dbName =~ /refGene|knownGene|ensGene/) + { + print OUT "\t"."Func.$dbName\tGene.$dbName\tGeneDetail.$dbName\tExonicFunc.$dbName\tAAChange.$dbName"; + } + else + { + print OUT "\t".$dbName; + } + } + print OUT "\tStrand\tcontext\t".$headerOriginalFile; + + close F1; + } + close OUT; + } + else + { + CombinedTempFile("$folder_temp/$filename", "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt"); + } +} + +sub Convert2AV +{ + my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_; + + my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); + + open(F1, $inputFile) or die "$!: $inputFile\n"; + + open(OUT, ">", $output) or die "$!: $output\n"; + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + my $chr = ""; + + # Replace chr23 or chr24 by X or Y + if($tab[$chr_value] =~ /23/) { $chr = "chrX"; } + elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; } + elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; } + else { $chr = "chr".$tab[$chr_value]; } + + + ### Consider only "normal" chromosomes for the annotation + if( ($chr !~ /chr\d{1,2}$|chrX|chrY/) ) { next; } + + + ### Don't consider variants with two or more alt bases + if($tab[$alt_value] =~ /\,/) { next; } + + ### Reformat the Indels for Annovar + # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10) + # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10) + if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) ) + { + ### First check if the indels in the file are not already correctly formated + if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) + { + # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels) + # Insertion: start = start & end = start + if($tab[$ref_value] =~ /\-/) + { + print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]"; + } + ## Deletion: start = start & end = start + length(del) -1 + else + { + my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1); + print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]"; + } + } + ### Indels not correctly formated for Annovar + else + { + my @tabRef = split("", $tab[$ref_value]); + my @tabAlt = split("", $tab[$alt_value]); + + # Remove the first base + my $ref2 = join("", @tabRef[1 .. $#tabRef]); + my $alt2 = join("", @tabAlt[1 .. $#tabAlt]); + + if(length($alt2) == 0) + { + my $altOK = "-"; + my $startOK = $tab[$start_value] + 1; + my $stopOK = $startOK + length($ref2) - 1; + print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK; + } + + if(length($ref2) == 0) + { + my $refOK = "-"; + print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2; + } + } + } + ### SBS + else + { + print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value]; + } + + ## Print the original file at the end + foreach (@tab) { print OUT "\t$_"; } + print OUT "\n"; + } + close F1; close OUT; +} + +sub AnnotateAV +{ + my ($inputFile, $output) = @_; + + if(!-e $path_AVDB) + { + print STDERR "Error message:\n"; + print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; + print STDERR "Please install the database for this genome before running Annovar\n"; + } + + # Extract the name of the databases + my $protocol = ""; my $operation = ""; + ExtractAVDBName($listAVDB, \$protocol, \$operation); + + `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; + + sub ExtractAVDBName + { + my ($listAVDB, $refS_protocol, $refS_operation) = @_; + + open(F1, $listAVDB) or die "$!: $listAVDB\n"; + while(<F1>) + { + if ($_ =~ /^#/) { next; } + + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + # db name like refGenome_dbName.txt + if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /ljb26/) ) + { + $$refS_protocol .= $1.","; $$refS_operation .= $tab[1].","; + } + # 1000 genome + if($tab[0] =~ /sites/) + { + $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/; + my ($dbName, $year, $month) = ($1, $2, $3); + $dbName =~ tr/A-Z/a-z/; + + # convert the month number into the month name + ConvertMonth(\$month); + + my $AVdbName_final = "1000g".$year.$month."_".$dbName; + $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].","; + } + # ESP + if( ($tab[0] =~ /esp/) || ($tab[0] =~ /ljb26/) ) + { + $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; + my $AVdbName_final = $1."_".$2; + $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].","; + } + } + close F1; + + sub ConvertMonth + { + my ($refS_month) = @_; + + if($$refS_month == 1) { $$refS_month = "janv"; } + elsif($$refS_month == 2) { $$refS_month = "feb"; } + elsif($$refS_month == 3) { $$refS_month = "mar"; } + elsif($$refS_month == 4) { $$refS_month = "apr"; } + elsif($$refS_month == 5) { $$refS_month = "may"; } + elsif($$refS_month == 6) { $$refS_month = "jun"; } + elsif($$refS_month == 7) { $$refS_month = "jul"; } + elsif($$refS_month == 8) { $$refS_month = "aug"; } + elsif($$refS_month == 9) { $$refS_month = "sept"; } + elsif($$refS_month == 10) { $$refS_month = "oct"; } + elsif($$refS_month == 11) { $$refS_month = "nov"; } + elsif($$refS_month == 12) { $$refS_month = "dec"; } + } + } +} + +### Add the minimum of annotations (refGene + strand + context) +sub annotateAV_min +{ + my ($inputFile, $output) = @_; + + if(!-e $path_AVDB) + { + print STDERR "Error message:\n"; + print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; + print STDERR "Please install the database for this genome before running Annovar\n"; + } + + # Extract the name of the databases + my ($protocol, $operation) = ("refGene", "g"); + + `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; +} + +sub RecoverStrand +{ + my ($input, $headerOriginalFile, $pathDB, $refGenome, $output, $refS_lengthAVheader) = @_; + + my ($chr_value, $start_value, $ref_value, $alt_value, $func_value, $geneSymbol_value) = ("", "", "", "", "", "", "", ""); + + $chr_value = recoverNumCol($input, "Chr"); + $start_value = recoverNumCol($input, "Start"); + $ref_value = recoverNumCol($input, "Ref"); + $alt_value = recoverNumCol($input, "Alt"); + $func_value = recoverNumCol($input, "Func.refGene"); + $geneSymbol_value = recoverNumCol($input, "Gene.refGene"); + + #################### Convert the input file into a hash table + my %h_inputFile = (); + open(F1, $input) or die "$!: $input\n"; + my $annovar_header = <F1>; + + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + # In COSMIC the chromosome X and Y are annotated 23 and 24 + my $chr = ""; + if($tab[$chr_value] eq "chr23") { $chr = "chrX"; } + elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; } + elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; } + else { $chr = $tab[$chr_value]; } + + # Verify if the element exists + if($chr eq "") { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The chromosome value is nor defined for $_\n"; } + if(! exists $tab[$start_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The start value is nor defined for $_\n"; } + if(! exists $tab[$ref_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The reference value is nor defined for $_\n"; } + if(! exists $tab[$alt_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The alternate value is nor defined for $_\n"; } + if(! exists $tab[$func_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The functional value is nor defined for $_\n"; } + if(! exists $tab[$geneSymbol_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; } + + my $geneSymbol = ""; + ######## For the splicing annotation we separate the gene symbol from the aa change + if($tab[$func_value] eq "splicing") + { + if($tab[$geneSymbol_value] =~ /(.+)\((.+)\)/) { $geneSymbol = $1; } + else { $geneSymbol = $tab[$geneSymbol_value]; } + } + else { $geneSymbol = $tab[$geneSymbol_value]; } + + push(@{$h_inputFile{"$chr:$tab[$start_value]:$tab[$start_value]:$tab[$ref_value]:$tab[$alt_value]:$geneSymbol"}}, $_); + } + close F1; + + # print "\t\tRecoverStrand: $input\n"; + + #################### Convert the database file into a hash table + my %h_database = (); + my ($db_geneSymbol_value, $db_strandInfo_value, $db_chr_value) = (12, 3, 2); + + my $folderNameDB = $refGenome."db"; + my $fileNameDB = $refGenome."_refGene.txt"; + + open(F1, "$pathDB/$fileNameDB") or die "$!: $pathDB/$fileNameDB\n"; + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + my $strand = ""; + $strand = $tab[$db_strandInfo_value]; + if($strand eq "") { print STDERR "Error message:\n"; print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; } + else + { + # Some genes have several strand orientation, keep the first in the database + if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; } + } + } + close F1; + + #################### Parse the two hash tables for recover the strand information + open(OUT, ">", $output) or die "$!: $output\n"; + + + ## Add the header only for the firts part of the files + if($input =~ /\-aa/) + { + my @tabHeaderInput = ""; + $annovar_header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $annovar_header); + # Save the length of the Annovar header for the next function (RecoverGenomicSequence) + $$refS_lengthAVheader = $#tabHeaderInput; + + # Print the Annovar header until the column before OtherInfo + print OUT "$tabHeaderInput[0]"; + for(my $i=1; $i<$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } + print OUT "\tStrand"; + print OUT "\t",$headerOriginalFile; + } + + + # Timer for comparing the number of SNVs present in the hash table + my $timerUniqueSNVs = 0; + # Timer for comparing the number of SNVs with the strand orientation + my $timerSNVsStrand = 0; + + foreach my $kFile (sort keys %h_inputFile) + { + my $test = 0; + my @tab = split(":", $kFile); + + # Sometimes the line is not printed correctely !!!!! :@ + my @tHeaderInput = split("\t", $annovar_header); my @lengthLine = split("\t", $h_inputFile{$kFile}[0]); + my @tHeaderOriginalFile = split("\t", $headerOriginalFile); + my $lengthHeader = @tHeaderInput + (scalar(@tHeaderOriginalFile)-1) ; my $lengthLine = @lengthLine; + + # Save the length of the Annovar header for the next function (RecoverGenomicSequence) + $$refS_lengthAVheader = $#tHeaderInput; + + foreach my $kDB (sort keys %h_database) + { + if("$tab[5]:$tab[0]" eq $kDB) + { + if($lengthHeader != $lengthLine) + { + print STDERR "Error message:\n"; + print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; + } + + foreach my $line (@{$h_inputFile{$kFile}}) + { + my @tab = split("\t", $line); + my $j = 0; + + for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i } + print OUT $h_database{$kDB}; + for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } + print OUT "\n"; + } + $timerSNVsStrand++; + $test = 1; last; + } + } + # The strand orientation isn't defined + if($test == 0) + { + my @tHeaderInput = split("\t", $annovar_header); + foreach my $line (@{$h_inputFile{$kFile}}) + { + my @tab = split("\t", $line); + my $j = 0; + for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i } + print OUT "NA"; + for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } + print OUT "\n"; + } + $timerSNVsStrand++; + } + $timerUniqueSNVs++; + } + close OUT; + + # print "Strand orientation recover for $timerSNVsStrand SNVs out of $timerUniqueSNVs uniques\n"; +} + +sub RecoverGenomicSequence +{ + my ($inputFile, $length_AVheader, $intervalEnd, $referenceGenome, $pathToRefSeq, $output) = @_; + + ############ 1) Transform the input file in a hash table: one for recover the sequence context and one for keeping the original file + my %h_inputFileForSeqContext = (); my %h_inputFile = (); + my $header = ""; + CreateHashTable_from_InputFile($inputFile, $length_AVheader, \$header, $intervalEnd, \%h_inputFileForSeqContext, \%h_inputFile); + + sub CreateHashTable_from_InputFile + { + my ($input, $length_AVheader, $refS_header, $intervalEnd, $refH_inputFileForSeqContext, $refH_inputFile) = @_; + + my ($chr_value, $start_value, $strand_value) = (0, 1, $length_AVheader); + + my $countregion = 0; + my %allchr = (); + + open(F1, $input) or die "$!: $input\n"; + if($input =~ /\-aa/) { $$refS_header = <F1>; } + + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + my $name = "$tab[$chr_value]:$tab[$start_value]"; + my $start = $tab[$start_value] - $intervalEnd; + my $end = $tab[$start_value] + $intervalEnd; + + $start--; #make zero-start coordinate, to be consistent with UCSC + my $exonpos = "$tab[$chr_value]:$start"; + + push @{$refH_inputFileForSeqContext->{$tab[$chr_value]}}, [$name, $start, $end, $tab[$strand_value], $exonpos]; + push(@{$refH_inputFile->{"$tab[$chr_value]\t$start\t$end"}}, $_); + $countregion++; + $allchr{$tab[$chr_value]}++; + } + close F1; + } + + ############ 2) Extract the sequence context from the hash table + my %h_allRegionSeqContext = (); + my $refSeq = $pathToRefSeq; + Extract_SequenceContext(\%h_inputFileForSeqContext, $referenceGenome, $refSeq, \%h_allRegionSeqContext); + + sub Extract_SequenceContext + { + my ($refH_allRegion, $referenceGenome, $refSeq, $refH_allRegionSeqContext) = @_; + + my $folderDB = $referenceGenome."db"; + my $folderSeq = $referenceGenome."_seq"; + my $seqdir = "$refSeq/$folderSeq"; + + my %seqhash = (); #database sequence for each chromosome + my %name_seq = (); #sequence for each region + my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon + my ($count_success, @failure) = (0); + + for my $curchr (sort keys %{$refH_allRegion}) + { + my ($seqid, $curseq) = ('', ''); + my $fastafile = ""; + if ($curchr =~ m/^chr/) + { + %seqhash = (); #clear the seqhash storage + $fastafile = "$seqdir/$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name + } + else + { + %seqhash = (); #clear the seqhash storage + $fastafile = "$seqdir/chr$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name + } + if (not -e $fastafile) { #to handle cases where no "chr" prefix is given + print "WARNING: the FASTA file $curchr.fa cannot be retrieved from the specified directory $seqdir. Sequences in this chromosome will not be processed\n"; + next; + } + + if (not %seqhash) + { + open (FASTA, $fastafile) or print "WARNING: cannot read from FASTA file $fastafile so sequences in $curchr will not be processed: $!\n" and next; + while (<FASTA>) + { + if (m/^>(\S+)/) + { + $seqid and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it + $seqid = $1; + $curseq = ''; + } + else + { + s/[\r\n]+$//; + $curseq .= uc $_; #only use upper case characters + } + } + close FASTA; + $seqhash{$seqid} = $curseq; + } + if (not $seqhash{$curchr}) + { + #this chromosome just do not have FASTA sequences (maybe users used a wrong seqdir + print "WARNING: Unable to retrieve regions at $curchr due to lack of sequence information\n"; + next; + } + + for my $i (0 .. @{$refH_allRegion->{$curchr}}-1) + { + my ($name, $start, $end, $strand, $exonpos) = @{$refH_allRegion->{$curchr}[$i]}; + my @start = split (/,/, $start); + my @end = split (/,/, $end); + my $seq; + for my $i (0..@start-1) + { + if ($start[$i] >= length ($seqhash{$curchr})) + { + #here there must be an annotation error in user-specified gene/region definition file + print "WARNING: Ignoring the start position start=$start[$i] since it is longer than the $curchr sequence (length=" , length($seqhash{$curchr}), ")\n"; + undef $seq; + last; + } + $seq .= substr ($seqhash{$curchr}, $start[$i], $end[$i]-$start[$i]); + } + + if (defined $seq) + { + if (defined $seqlen{$name}) + { + $seqlen{$name} != length ($seq) and warn "WARNING: the sequence $name was found more than once with different sequence lengths\n"; + $seqlen{$name} != length ($seq) and $discordlen{$name}++; + } + else { $seqlen{$name} = length ($seq); } + + $name_seq{$name, $exonpos} = $seq; + $count_success++; + + # Put the sequence context in a hash table for Write the result after + ## Some sequence context are NNNNNN or empty + if( ($seq ne "NA") && ($seq =~ /N/i) ) { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = "NA"; } + else { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = $seq; } + } + else + { + print "WARNING: DNA sequence for $name cannot be inferred\n"; + push @failure, $name; + } + } + } # End for $curchr + } + + ############ 3) Create a file with the sequence context + WriteFile_SeqContext($inputFile, $length_AVheader, \%h_inputFile, $header, \%h_allRegionSeqContext, $output); + + sub WriteFile_SeqContext + { + my ($inputFile, $length_AVheader, $refH_InputFile, $header, $refH_allRegionSeqContext, $output) = @_; + + open(OUT, ">", $output) or die "$!: $output\n"; + + ## Add the header only for the firts part of the files + if($inputFile =~ /\-aa/) + { + my @tabHeaderInput = ""; + + $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); + # Print the Annovar header until the column before OtherInfo + print OUT "$tabHeaderInput[0]"; + my $j = 0; + for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } + print OUT "\tcontext\ttrinucleotide_context"; + for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } + print OUT "\n"; + } + + foreach my $k_hFile (sort keys %{$refH_InputFile}) + { + foreach my $k_allRegonSeqContext (sort keys %{$refH_allRegionSeqContext}) + { + if($k_hFile eq $k_allRegonSeqContext) + { + my $j=0; + + for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) + { + my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); + + # Write Annovar annotation + strand orientation + for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } + # Write the sequence context with the length defined by the user (default is 10) + print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; + # Write the trinucleotide context + my $contextSequence = $refH_allRegionSeqContext->{$k_allRegonSeqContext}; $contextSequence =~ tr/a-z/A-Z/; + my @tempContextSequence = split("", $contextSequence); + my $midlle_totalNbBaseContext = (scalar(@tempContextSequence)-1)/2; # For having the middle of the sequence + print OUT "\t".$tempContextSequence[$midlle_totalNbBaseContext-1]."x".$tempContextSequence[$midlle_totalNbBaseContext+1]; + # Write the original columns + for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } + print OUT "\n"; + } + last; + } + } + } + close OUT; + } +} + +sub CombinedTempFile +{ + my ($folderTempFile, $output) = @_; + + my $cmd_cat_mt_results = "cat "; + + foreach my $file (`ls $folderTempFile/*.txt`) + { + chomp($file); + $cmd_cat_mt_results = $cmd_cat_mt_results." $file"; + } + $cmd_cat_mt_results = $cmd_cat_mt_results." > $output"; + `$cmd_cat_mt_results`; +} + + +sub recoverNumCol +{ + my ($input, $name_of_column) = @_; + + open(F1,$input) or die "recoverNumCol: $!: $input\n"; + # For having the name of the columns + my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); + close F1; + # The number of the column + my $name_of_column_NB = "toto"; + for(my $i=0; $i<=$#tab_search_header; $i++) + { + if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } + } + if($name_of_column_NB eq "toto") + { + print STDERR "Error message:\n"; + print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; + } + else { return $name_of_column_NB; } +} + + + + +=head1 NAME + +mutspec-Annot + +=head1 SYNOPSIS + + mutspecannot.pl [arguments] <query-file> + + <query-file> can be a folder with multiple VCF or a single VCF + + Arguments: + -h, --help print help message + -m, --man print complete documentation + -v, --verbose use verbose output + --refGenome the reference genome to use + --interval <interger> the number of bases for the sequence context + -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file + -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size + --pathAVDBList the path to a text file containing the list of Annovar databases installed + -temp --pathTemporary <string> the path for saving the temporary files + --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) + --max_cpu <integer> number of CPUs to be used for the annotation + + +Function: automatically run a pipeline on a list of variants and annote them using Annovar + + Example: # Annotation only + mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input + + + Version: 03-2017 (March 2017) + + +=head1 OPTIONS + +=over 8 + +=item B<--help> + +print a brief usage message and detailed explanation of options. + +=item B<--man> + +print the complete manual of the program. + +=item B<--verbose> + +use verbose output. + +=item B<--refGenome> + +the reference genome to use, could be hg19 or mm9. + +=item B<--interval> + +the number of bases surrounding the mutated bases, for the sequence context analysis. + +=item B<--outfile> + +the directory of output file names. If it is nor specify the same directory as the input file is used. + +=item B<--pathAnnovarDB> + +the path to the directory containing the Annovar databases and the files with the chromosome size. + +=item B<--pathAVDBList> + +the path to a text file containing the list of Annovar databases installed. + +=item B<--pathTemporary> + +the path for saving temporary files generated by the script. +If any is specify a temporary folder is created in the same directory where the script is running. +Deleted when the script is finish + +=item B<--fullAnnotation> + +Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines) + +=item B<--max_cpu> + +Specify the number of CPUs to be used. This number is used for spliting the file in n part and running the annotations in each part in parallel. + + +=head1 DESCRIPTION + +MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS. +Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. +A text tab delimited file is produced. + +=cut