Mercurial > repos > iarc > mutspec
view mutspecAnnot.pl @ 7:eda59b985b1c draft default tip
Uploaded
author | iarc |
---|---|
date | Mon, 13 Mar 2017 08:21:19 -0400 |
parents | 46a10309dfe2 |
children |
line wrap: on
line source
#!/usr/bin/env perl #-----------------------------------# # Author: Maude # # Script: mutspecAnnot.pl # # Last update: 02/03/17 # #-----------------------------------# use strict; use warnings; use Getopt::Long; use Pod::Usage; use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); use File::Path; use Parallel::ForkManager; our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context. our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines) ######################################### ### SPECIFY THE NUMBER OF CPU ### ######################################### our ($max_cpu) = 8; # Max number of CPU to use for the annotation GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp, 'max_cpu=i' => \$max_cpu) or pod2usage(2); our ($input) = @ARGV; pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input) ###################################################################################################################################################### # GLOBAL VARIABLES # ###################################################################################################################################################### # Recover the current path our $pwd = `pwd`; chomp($pwd); # Recover the filename and the input directory our ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); # Output directories our ($folderMutAnalysis, $folderAnnovar) = ("", ""); # File with the list of Annovar databases to use our $listAVDB = ""; # Initialisation of chromosome, position, ref and alt values our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a"); ###################################################################################################################################################### # MAIN # ###################################################################################################################################################### ## Check the presence of the flags and create the output and temp directories CheckFlags(); ## Format the file correctly: ## 1) Check the length of the filename (must be <= 31 characters) ## 2) Recover the input format. If MuTect output consider only "KEEP" variants ## 3) Recover the column number for chr, start, ref and alt FormatingInputFile(); # Annotate the file with Annovar, add the strand orientation and the sequence context FullAnnotation(); ###################################################################################################################################################### # FUNCTIONS # ###################################################################################################################################################### ## Check the presence of the flags and create the output and temp directories sub CheckFlags { # Check the reference genome if($refGenome eq "empty") { print STDERR "Missing flag !\n"; print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; } if($intervalEnd eq "empty") { print STDERR "Missing flag !\n"; print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; exit; } # If no output is specified write the result as the same place as the input file if($output eq "empty") { my $directory = dirname( $input ); $folderMutAnalysis = "$directory/Mutational_Analysis"; if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } } else { if(!-e $output) { mkdir($output) or die "$!: $output\n"; } $folderMutAnalysis = "$output/Mutational_Analysis"; if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } } # Create the output folder for Annovar $folderAnnovar = "$folderMutAnalysis/Annovar"; if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; } # Verify the access to Annovar databases if($path_AVDB eq "empty") { print STDERR "Missing flag !\n"; print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; exit; } elsif(!-e $path_AVDB) { print STDERR "Error message:\n"; print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; } # Check the file list AV DB if($pathAVDBList eq "empty") { print STDERR "Missing flag !\n"; print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; exit; } else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" } # If no temp folder is specified write the result in the current path if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; } if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } # Verify listAVDB is not empty if($listAVDB eq "") { print STDERR "Path to the text file containing the list of Annovar databases installed is not specified !!!\n"; exit; } } ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions sub FormatingInputFile { # The input is a folder if(-d $input) { foreach my $file (`ls $input`) { my $headerOriginalFile = ""; chomp($file); CheckLengthFilename("$input/$file"); ################################################# ### Recover the input format ### ################################################# RecoverInputFormat("$input/$file", \$headerOriginalFile); } } # The input is one file else { my $headerOriginalFile = ""; CheckLengthFilename($input); ################################################# ### Recover the input format ### ################################################# RecoverInputFormat($input, \$headerOriginalFile); } } # The name for the Excel sheet can't be longer than 31 characters sub CheckLengthFilename { my ($inputFile) = @_; my ($filenameInputFile, $directoriesInputFile, $suffixInputFile) = fileparse($inputFile, qr/\.[^.]*/); if(length($filenameInputFile) > 32) { print STDERR "Error message:\n"; print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; } } # Recover the input format. If MuTect output consider only "KEEP" variants sub RecoverInputFormat { my ($file, $refS_headerOriginalFile) = @_; my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); my $inputFormat = ""; open(F1, $file) or die "$!: $file\n"; my $header = <F1>; close F1; ### VCF and MuTect files have in their first line the type of the file if($header =~ /fileformat=VCF/i) { $inputFormat = "vcf"; } elsif($header =~ /mutect/i) { $inputFormat = "mutect"; } else { $inputFormat = "unknown"; } ### VCF files if($inputFormat eq "vcf") { ### MuTect2 output VCFs my $testVC = `grep MuTect2 $file`; if($testVC =~ /MuTect2/) { # Keep only the variants passing MuTect2 filters `grep PASS $file > $folder_temp/$filename-PASS.txt`; # Recover the header $$refS_headerOriginalFile = `grep '#CHROM' $file`; # Add the header # Sed command doesn't work... sed 's/^/some text\n/' file > res open(OUT, ">", "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n"; print OUT $$refS_headerOriginalFile; open(F1, "$folder_temp/$filename-PASS.txt") or die "$!: $folder_temp/$filename-PASS.txt\n"; while(<F1>) { print OUT $_; } close F1; close OUT; `rm $folder_temp/$filename-PASS.txt`; # Check if there if no empty column CheckEmptyColumn("$folder_temp/$filename-KEEP.txt"); `rm $folder_temp/$filename-KEEP.txt`; # Set the col number for the chr,start,ref and alt ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4); } else { open(F1, $file) or die "$!: $file\n"; open(OUT, ">", "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n"; while (<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); # Print the VCF header if($tab[0] eq "#CHROM") { $tab[0] =~ /#(.+)/; print OUT "$1"; for(my $i=1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } print OUT "\n"; } elsif($tab[0] !~ /##/) { # Don't consider chromosome random, GL and MT if( ($tab[0] =~ /random/) || ($tab[0] =~ /GL/i) ) { next; } print OUT "$_\n"; } } close F1; close OUT; ## Recover the header open(F1, "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n"; $$refS_headerOriginalFile = <F1>; close F1; # Check if there if no empty column CheckEmptyColumn("$folder_temp/$filename.txt"); `rm $folder_temp/$filename.txt`; # Set the col number for the chr,start,ref and alt ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4); } } ### MuTect files elsif($inputFormat eq "mutect") { `sed '1d' $file > $folder_temp/$filename-HeaderOK`; # Keep only the SNVs of good quality `grep -v REJECT $folder_temp/$filename-HeaderOK > $folder_temp/$filename-KEEP.txt`; `rm $folder_temp/$filename-HeaderOK`; # Recover the header open(F1, "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n"; $$refS_headerOriginalFile = <F1>; close F1; # Check if there if no empty column CheckEmptyColumn("$folder_temp/$filename-KEEP.txt"); `rm $folder_temp/$filename-KEEP.txt`; # Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles. # Use the dictionary for recover the names and the position of the columns RecoverColNameAuto("$folder_temp/$filename-KEEPColumnCorrect.txt", $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue); } ### Unknown type else { ## Recover the header open(F1, $file) or die "$!: $file\n"; $$refS_headerOriginalFile = <F1>; close F1; # Check if there if no empty column CheckEmptyColumn($file); ## Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles. # Use the dictionary for recover the names and the position of the columns RecoverColNameAuto($file, $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue); } } # Some files can have empty column with no information sub CheckEmptyColumn { my ($inputFile) = @_; my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); if($filename =~ /(.+)-KEEP/) { $filename = $1; } open(OUT, ">", "$folder_temp/$filename-ColumnCorrect.txt") or die "$!: $folder_temp/$filename-ColumnCorrect.txt\n"; open(F1, $inputFile) or die "$!: $inputFile\n"; my $header = <F1>; $header =~ s/[\r\n]+$//; my @tabHeader = split("\t", $header); print OUT $header, "\n"; while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); if(scalar(@tab) != scalar(@tabHeader)) { print OUT $tab[0]; for(my $i=1; $i<=$#tabHeader; $i++) { if(defined($tab[$i])) { print OUT "\t$tab[$i]"; } else { print OUT "\tNA"; } } print OUT "\n"; } else { print OUT "$_\n"; } } close F1; close OUT; } # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. sub RecoverColNameAuto { our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_; $header =~ s/[\r\n]+$//; ## Name of the columns my @mutect = qw(contig position ref_allele alt_allele); my @vcf = qw(CHROM POS REF ALT); my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic); my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele); my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2); my @ionTorrent = qw(chr Position Ref Alt); my @proton = qw(Chrom Position Ref Variant); my @varScan2 = qw(Chrom Position Ref VarAllele); my @varScan2_somatic = qw(chrom position ref var); my @annovar = qw(Chr Start Ref Obs); my @custom = qw(Chromosome Start Wild_Type Mutant); my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@varScan2_somatic, \@annovar, \@custom); my $timer = 0; # For controlling if the names are present on the dictionnary or not foreach my $refTab (@allTab) { my @tab = @$refTab; SearchCol(\@tab); # The columns names were find if( ($$ref_chrValue ne "c") && ($$ref_positionValue ne "s") && ($$ref_refValue ne "r") && ($$ref_altValue ne "a") ) { last; } # The names of the columns are not present in the dictionnary else { $timer++; } } if($timer == scalar(@allTab)) { print STDERR "Error message:\n"; print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n"; print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n"; exit; } # Extract the number of the column that contain the information sub SearchCol { my ($refTab) = @_; my @tabNames = @$refTab; my @tabHeader = split("\t", $header); # For VCF if($tabHeader[0] eq "#CHROM") { ($$ref_chrValue, $$ref_positionValue, $$ref_refValue, $$ref_altValue) = (0, 1, 3, 4); } # For tabular files else { for(my $i=0; $i<=$#tabNames; $i++) { for(my $j=0; $j<=$#tabHeader; $j++) { if($tabHeader[$j] eq $tabNames[$i]) { if($i == 0) { $$ref_chrValue = $j; } if($i == 1) { $$ref_positionValue = $j; } if($i == 2) { $$ref_refValue = $j; } if($i == 3) { $$ref_altValue = $j; } last; # Once find pass to the next name } } } } } } # Annotate the file with Annovar, add the strand orientation and the sequence context sub FullAnnotation { print "-----------------------------------------------------------------\n"; print "---------------------------Annotation----------------------------\n"; print "-----------------------------------------------------------------\n"; # If the input is a folder if(-d $input) { foreach my $file (`ls $folder_temp/*.txt`) { chomp($file); # For recover the name of the file without extension, the directory where the file is and the extension of the file my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/); my $filenameOK = ""; # For removing the ColumnCorrect for txt files if($filename =~ /(.+)-ColumnCorrect/) { if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; } else { $filenameOK = $1; } } else { print STDERR "Error message:\n"; print STDERR "Case not considered for $filename!!!\n"; } ################################################# ### Cut the files in n part ### ################################################# # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts my $cpu = 0; # Keep the original header my $headerOriginalFile = ""; # Save the numer of lines my $nbLine = 0; splitInputFile($file, \$cpu, \$headerOriginalFile, $filenameOK, \$nbLine); ################################################# ### Annotate the n part ### ################################################# annotateFile($cpu, $filenameOK, $headerOriginalFile); ################################################# ### Paste the file together ### ################################################# createOutput($filenameOK, $headerOriginalFile, $nbLine); } } # The input file is one file else { ################################################# ### Cut the files in n part ### ################################################# # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts my $cpu = 0; # Keep the original header my $headerOriginalFile = ""; # Save the numer of lines my $nbLine = 0; splitInputFile("$folder_temp/$filename-ColumnCorrect.txt", \$cpu, \$headerOriginalFile, $filename, \$nbLine); ################################################# ### Annotate the n part ### ################################################# annotateFile($cpu, $filename, $headerOriginalFile); ################################################# ### Paste the file together ### ################################################# createOutput($filename, $headerOriginalFile, $nbLine); } # Remove the temporary directory rmtree($folder_temp); } sub splitInputFile { my ($inputFile, $ref_cpu, $ref_header, $filename, $ref_nbLine) = @_; my $nbVariants = `wc -l $inputFile`; $nbVariants =~ /(\d+).+/; $$ref_nbLine = $1; if($$ref_nbLine-1 <= 5000) { $$ref_cpu = 1; } elsif( ($$ref_nbLine-1 > 5000) && ($$ref_nbLine-1 < 25000) ) { $$ref_cpu = 2; } elsif( ($$ref_nbLine-1 >= 25000) && ($$ref_nbLine-1 < 100000) ) { $$ref_cpu = 8; } else { $$ref_cpu = $max_cpu; } # If the number predefined can't be used on the machine use the maximum number specify by the administrator if($$ref_cpu > $max_cpu) { $$ref_cpu = $max_cpu } ## Recover the header open(F1, $inputFile) or die "$!: $inputFile\n"; $$ref_header= <F1>; close F1; ## Remove the first line of the file my $fileNoHeader = "$folder_temp/${filename}-NoHeader"; `sed 1d $inputFile > $fileNoHeader`; if(!-e "$folder_temp/$filename") { mkdir("$folder_temp/$filename") or die "Can't create the directory $folder_temp/$filename\n"; } my $lines_per_temp = int(1+($1 / $$ref_cpu)); # +1 in case of the div == 0 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filename/$filename-`; if($$ref_header eq "") { print STDERR "Error message:\n"; print STDERR "No header for the file $inputFile!!!\nPlease check the format of your file\n"; } } sub annotateFile { my ($cpu, $filename, $headerOriginalFile) = @_; my $pm = Parallel::ForkManager->new($cpu); foreach my $tempFile (`ls $folder_temp/$filename/$filename-*`) { chomp($tempFile); # Forks and returns the pid for the child: my $pid = $pm->start and next; # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo my ($filenameTempFile, $directoriesTempFile, $suffixTempFile) = fileparse($tempFile, qr/\-[^.]*/); my $outFilenameTemp = $filenameTempFile.$suffixTempFile; Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); # Annotate the file with Annovar my $tempFileName_AVOutput = $filename.$suffixTempFile.".".${refGenome}."_multianno.txt"; if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } # Check if the annotations worked open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; while(<F1>) { if($_ =~ /ERROR/i) { print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; print STDERR $_; print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; } } close F1; # Recover the strand orientation my $length_AVheader = 0; RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); # Recover the sequence context RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filename/$outFilenameTemp".".".${refGenome}."_multianno.txt"); $pm->finish; # Terminates the child process } # Wait all the child process $pm->wait_all_children; } sub createOutput { my ($filename, $headerOriginalFile, $nbLine) = @_; ## For MuTect and MuTect2 calling only variants passing MuTect filters are kept and sometines there is no variant passing these filters making error in Galaxy when using "collection". if($nbLine == 1) { print STDOUT "\nThe sample $filename didn't pass MuTect filters\n"; ### Print Annovar minimal header + the original header of the input file my $outputFile = "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt"; open(OUT, ">", $outputFile) or die "$!: $outputFile\n"; if($fullAVDB eq "no") { print OUT "Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tStrand\tcontext"; print OUT "\t".$headerOriginalFile; } ### Print complete Annovar header (using the database name present in the file listAVDB) + the original header of the input file else { print OUT "Chr\tStart\tEnd\tRef\tAlt"; open(F1, $listAVDB) or die "$!: $listAVDB\n"; while(<F1>) { if($_ =~ /^#/) { next; } my @tab = split("\t", $_); $tab[0] =~ /$refGenome\_(.+)\.txt/; my $dbName = $1; if($dbName =~ /refGene|knownGene|ensGene/) { print OUT "\t"."Func.$dbName\tGene.$dbName\tGeneDetail.$dbName\tExonicFunc.$dbName\tAAChange.$dbName"; } else { print OUT "\t".$dbName; } } print OUT "\tStrand\tcontext\t".$headerOriginalFile; close F1; } close OUT; } else { CombinedTempFile("$folder_temp/$filename", "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt"); } } sub Convert2AV { my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_; my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); open(F1, $inputFile) or die "$!: $inputFile\n"; open(OUT, ">", $output) or die "$!: $output\n"; while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); my $chr = ""; # Replace chr23 or chr24 by X or Y if($tab[$chr_value] =~ /23/) { $chr = "chrX"; } elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; } elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; } else { $chr = "chr".$tab[$chr_value]; } ### Consider only "normal" chromosomes for the annotation if( ($chr !~ /chr\d{1,2}$|chrX|chrY/) ) { next; } ### Don't consider variants with two or more alt bases if($tab[$alt_value] =~ /\,/) { next; } ### Reformat the Indels for Annovar # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10) # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10) if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) ) { ### First check if the indels in the file are not already correctly formated if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) { # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels) # Insertion: start = start & end = start if($tab[$ref_value] =~ /\-/) { print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]"; } ## Deletion: start = start & end = start + length(del) -1 else { my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1); print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]"; } } ### Indels not correctly formated for Annovar else { my @tabRef = split("", $tab[$ref_value]); my @tabAlt = split("", $tab[$alt_value]); # Remove the first base my $ref2 = join("", @tabRef[1 .. $#tabRef]); my $alt2 = join("", @tabAlt[1 .. $#tabAlt]); if(length($alt2) == 0) { my $altOK = "-"; my $startOK = $tab[$start_value] + 1; my $stopOK = $startOK + length($ref2) - 1; print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK; } if(length($ref2) == 0) { my $refOK = "-"; print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2; } } } ### SBS else { print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value]; } ## Print the original file at the end foreach (@tab) { print OUT "\t$_"; } print OUT "\n"; } close F1; close OUT; } sub AnnotateAV { my ($inputFile, $output) = @_; if(!-e $path_AVDB) { print STDERR "Error message:\n"; print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; } # Extract the name of the databases my $protocol = ""; my $operation = ""; ExtractAVDBName($listAVDB, \$protocol, \$operation); `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; sub ExtractAVDBName { my ($listAVDB, $refS_protocol, $refS_operation) = @_; open(F1, $listAVDB) or die "$!: $listAVDB\n"; while(<F1>) { if ($_ =~ /^#/) { next; } $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); # db name like refGenome_dbName.txt if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /ljb26/) ) { $$refS_protocol .= $1.","; $$refS_operation .= $tab[1].","; } # 1000 genome if($tab[0] =~ /sites/) { $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/; my ($dbName, $year, $month) = ($1, $2, $3); $dbName =~ tr/A-Z/a-z/; # convert the month number into the month name ConvertMonth(\$month); my $AVdbName_final = "1000g".$year.$month."_".$dbName; $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].","; } # ESP if( ($tab[0] =~ /esp/) || ($tab[0] =~ /ljb26/) ) { $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; my $AVdbName_final = $1."_".$2; $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].","; } } close F1; sub ConvertMonth { my ($refS_month) = @_; if($$refS_month == 1) { $$refS_month = "janv"; } elsif($$refS_month == 2) { $$refS_month = "feb"; } elsif($$refS_month == 3) { $$refS_month = "mar"; } elsif($$refS_month == 4) { $$refS_month = "apr"; } elsif($$refS_month == 5) { $$refS_month = "may"; } elsif($$refS_month == 6) { $$refS_month = "jun"; } elsif($$refS_month == 7) { $$refS_month = "jul"; } elsif($$refS_month == 8) { $$refS_month = "aug"; } elsif($$refS_month == 9) { $$refS_month = "sept"; } elsif($$refS_month == 10) { $$refS_month = "oct"; } elsif($$refS_month == 11) { $$refS_month = "nov"; } elsif($$refS_month == 12) { $$refS_month = "dec"; } } } } ### Add the minimum of annotations (refGene + strand + context) sub annotateAV_min { my ($inputFile, $output) = @_; if(!-e $path_AVDB) { print STDERR "Error message:\n"; print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; } # Extract the name of the databases my ($protocol, $operation) = ("refGene", "g"); `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; } sub RecoverStrand { my ($input, $headerOriginalFile, $pathDB, $refGenome, $output, $refS_lengthAVheader) = @_; my ($chr_value, $start_value, $ref_value, $alt_value, $func_value, $geneSymbol_value) = ("", "", "", "", "", "", "", ""); $chr_value = recoverNumCol($input, "Chr"); $start_value = recoverNumCol($input, "Start"); $ref_value = recoverNumCol($input, "Ref"); $alt_value = recoverNumCol($input, "Alt"); $func_value = recoverNumCol($input, "Func.refGene"); $geneSymbol_value = recoverNumCol($input, "Gene.refGene"); #################### Convert the input file into a hash table my %h_inputFile = (); open(F1, $input) or die "$!: $input\n"; my $annovar_header = <F1>; while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); # In COSMIC the chromosome X and Y are annotated 23 and 24 my $chr = ""; if($tab[$chr_value] eq "chr23") { $chr = "chrX"; } elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; } elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; } else { $chr = $tab[$chr_value]; } # Verify if the element exists if($chr eq "") { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The chromosome value is nor defined for $_\n"; } if(! exists $tab[$start_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The start value is nor defined for $_\n"; } if(! exists $tab[$ref_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The reference value is nor defined for $_\n"; } if(! exists $tab[$alt_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The alternate value is nor defined for $_\n"; } if(! exists $tab[$func_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The functional value is nor defined for $_\n"; } if(! exists $tab[$geneSymbol_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; } my $geneSymbol = ""; ######## For the splicing annotation we separate the gene symbol from the aa change if($tab[$func_value] eq "splicing") { if($tab[$geneSymbol_value] =~ /(.+)\((.+)\)/) { $geneSymbol = $1; } else { $geneSymbol = $tab[$geneSymbol_value]; } } else { $geneSymbol = $tab[$geneSymbol_value]; } push(@{$h_inputFile{"$chr:$tab[$start_value]:$tab[$start_value]:$tab[$ref_value]:$tab[$alt_value]:$geneSymbol"}}, $_); } close F1; # print "\t\tRecoverStrand: $input\n"; #################### Convert the database file into a hash table my %h_database = (); my ($db_geneSymbol_value, $db_strandInfo_value, $db_chr_value) = (12, 3, 2); my $folderNameDB = $refGenome."db"; my $fileNameDB = $refGenome."_refGene.txt"; open(F1, "$pathDB/$fileNameDB") or die "$!: $pathDB/$fileNameDB\n"; while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); my $strand = ""; $strand = $tab[$db_strandInfo_value]; if($strand eq "") { print STDERR "Error message:\n"; print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; } else { # Some genes have several strand orientation, keep the first in the database if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; } } } close F1; #################### Parse the two hash tables for recover the strand information open(OUT, ">", $output) or die "$!: $output\n"; ## Add the header only for the firts part of the files if($input =~ /\-aa/) { my @tabHeaderInput = ""; $annovar_header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $annovar_header); # Save the length of the Annovar header for the next function (RecoverGenomicSequence) $$refS_lengthAVheader = $#tabHeaderInput; # Print the Annovar header until the column before OtherInfo print OUT "$tabHeaderInput[0]"; for(my $i=1; $i<$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } print OUT "\tStrand"; print OUT "\t",$headerOriginalFile; } # Timer for comparing the number of SNVs present in the hash table my $timerUniqueSNVs = 0; # Timer for comparing the number of SNVs with the strand orientation my $timerSNVsStrand = 0; foreach my $kFile (sort keys %h_inputFile) { my $test = 0; my @tab = split(":", $kFile); # Sometimes the line is not printed correctely !!!!! :@ my @tHeaderInput = split("\t", $annovar_header); my @lengthLine = split("\t", $h_inputFile{$kFile}[0]); my @tHeaderOriginalFile = split("\t", $headerOriginalFile); my $lengthHeader = @tHeaderInput + (scalar(@tHeaderOriginalFile)-1) ; my $lengthLine = @lengthLine; # Save the length of the Annovar header for the next function (RecoverGenomicSequence) $$refS_lengthAVheader = $#tHeaderInput; foreach my $kDB (sort keys %h_database) { if("$tab[5]:$tab[0]" eq $kDB) { if($lengthHeader != $lengthLine) { print STDERR "Error message:\n"; print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; } foreach my $line (@{$h_inputFile{$kFile}}) { my @tab = split("\t", $line); my $j = 0; for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i } print OUT $h_database{$kDB}; for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } print OUT "\n"; } $timerSNVsStrand++; $test = 1; last; } } # The strand orientation isn't defined if($test == 0) { my @tHeaderInput = split("\t", $annovar_header); foreach my $line (@{$h_inputFile{$kFile}}) { my @tab = split("\t", $line); my $j = 0; for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i } print OUT "NA"; for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } print OUT "\n"; } $timerSNVsStrand++; } $timerUniqueSNVs++; } close OUT; # print "Strand orientation recover for $timerSNVsStrand SNVs out of $timerUniqueSNVs uniques\n"; } sub RecoverGenomicSequence { my ($inputFile, $length_AVheader, $intervalEnd, $referenceGenome, $pathToRefSeq, $output) = @_; ############ 1) Transform the input file in a hash table: one for recover the sequence context and one for keeping the original file my %h_inputFileForSeqContext = (); my %h_inputFile = (); my $header = ""; CreateHashTable_from_InputFile($inputFile, $length_AVheader, \$header, $intervalEnd, \%h_inputFileForSeqContext, \%h_inputFile); sub CreateHashTable_from_InputFile { my ($input, $length_AVheader, $refS_header, $intervalEnd, $refH_inputFileForSeqContext, $refH_inputFile) = @_; my ($chr_value, $start_value, $strand_value) = (0, 1, $length_AVheader); my $countregion = 0; my %allchr = (); open(F1, $input) or die "$!: $input\n"; if($input =~ /\-aa/) { $$refS_header = <F1>; } while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); my $name = "$tab[$chr_value]:$tab[$start_value]"; my $start = $tab[$start_value] - $intervalEnd; my $end = $tab[$start_value] + $intervalEnd; $start--; #make zero-start coordinate, to be consistent with UCSC my $exonpos = "$tab[$chr_value]:$start"; push @{$refH_inputFileForSeqContext->{$tab[$chr_value]}}, [$name, $start, $end, $tab[$strand_value], $exonpos]; push(@{$refH_inputFile->{"$tab[$chr_value]\t$start\t$end"}}, $_); $countregion++; $allchr{$tab[$chr_value]}++; } close F1; } ############ 2) Extract the sequence context from the hash table my %h_allRegionSeqContext = (); my $refSeq = $pathToRefSeq; Extract_SequenceContext(\%h_inputFileForSeqContext, $referenceGenome, $refSeq, \%h_allRegionSeqContext); sub Extract_SequenceContext { my ($refH_allRegion, $referenceGenome, $refSeq, $refH_allRegionSeqContext) = @_; my $folderDB = $referenceGenome."db"; my $folderSeq = $referenceGenome."_seq"; my $seqdir = "$refSeq/$folderSeq"; my %seqhash = (); #database sequence for each chromosome my %name_seq = (); #sequence for each region my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon my ($count_success, @failure) = (0); for my $curchr (sort keys %{$refH_allRegion}) { my ($seqid, $curseq) = ('', ''); my $fastafile = ""; if ($curchr =~ m/^chr/) { %seqhash = (); #clear the seqhash storage $fastafile = "$seqdir/$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name } else { %seqhash = (); #clear the seqhash storage $fastafile = "$seqdir/chr$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name } if (not -e $fastafile) { #to handle cases where no "chr" prefix is given print "WARNING: the FASTA file $curchr.fa cannot be retrieved from the specified directory $seqdir. Sequences in this chromosome will not be processed\n"; next; } if (not %seqhash) { open (FASTA, $fastafile) or print "WARNING: cannot read from FASTA file $fastafile so sequences in $curchr will not be processed: $!\n" and next; while (<FASTA>) { if (m/^>(\S+)/) { $seqid and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it $seqid = $1; $curseq = ''; } else { s/[\r\n]+$//; $curseq .= uc $_; #only use upper case characters } } close FASTA; $seqhash{$seqid} = $curseq; } if (not $seqhash{$curchr}) { #this chromosome just do not have FASTA sequences (maybe users used a wrong seqdir print "WARNING: Unable to retrieve regions at $curchr due to lack of sequence information\n"; next; } for my $i (0 .. @{$refH_allRegion->{$curchr}}-1) { my ($name, $start, $end, $strand, $exonpos) = @{$refH_allRegion->{$curchr}[$i]}; my @start = split (/,/, $start); my @end = split (/,/, $end); my $seq; for my $i (0..@start-1) { if ($start[$i] >= length ($seqhash{$curchr})) { #here there must be an annotation error in user-specified gene/region definition file print "WARNING: Ignoring the start position start=$start[$i] since it is longer than the $curchr sequence (length=" , length($seqhash{$curchr}), ")\n"; undef $seq; last; } $seq .= substr ($seqhash{$curchr}, $start[$i], $end[$i]-$start[$i]); } if (defined $seq) { if (defined $seqlen{$name}) { $seqlen{$name} != length ($seq) and warn "WARNING: the sequence $name was found more than once with different sequence lengths\n"; $seqlen{$name} != length ($seq) and $discordlen{$name}++; } else { $seqlen{$name} = length ($seq); } $name_seq{$name, $exonpos} = $seq; $count_success++; # Put the sequence context in a hash table for Write the result after ## Some sequence context are NNNNNN or empty if( ($seq ne "NA") && ($seq =~ /N/i) ) { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = "NA"; } else { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = $seq; } } else { print "WARNING: DNA sequence for $name cannot be inferred\n"; push @failure, $name; } } } # End for $curchr } ############ 3) Create a file with the sequence context WriteFile_SeqContext($inputFile, $length_AVheader, \%h_inputFile, $header, \%h_allRegionSeqContext, $output); sub WriteFile_SeqContext { my ($inputFile, $length_AVheader, $refH_InputFile, $header, $refH_allRegionSeqContext, $output) = @_; open(OUT, ">", $output) or die "$!: $output\n"; ## Add the header only for the firts part of the files if($inputFile =~ /\-aa/) { my @tabHeaderInput = ""; $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); # Print the Annovar header until the column before OtherInfo print OUT "$tabHeaderInput[0]"; my $j = 0; for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } print OUT "\tcontext\ttrinucleotide_context"; for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } print OUT "\n"; } foreach my $k_hFile (sort keys %{$refH_InputFile}) { foreach my $k_allRegonSeqContext (sort keys %{$refH_allRegionSeqContext}) { if($k_hFile eq $k_allRegonSeqContext) { my $j=0; for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) { my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); # Write Annovar annotation + strand orientation for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } # Write the sequence context with the length defined by the user (default is 10) print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; # Write the trinucleotide context my $contextSequence = $refH_allRegionSeqContext->{$k_allRegonSeqContext}; $contextSequence =~ tr/a-z/A-Z/; my @tempContextSequence = split("", $contextSequence); my $midlle_totalNbBaseContext = (scalar(@tempContextSequence)-1)/2; # For having the middle of the sequence print OUT "\t".$tempContextSequence[$midlle_totalNbBaseContext-1]."x".$tempContextSequence[$midlle_totalNbBaseContext+1]; # Write the original columns for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } print OUT "\n"; } last; } } } close OUT; } } sub CombinedTempFile { my ($folderTempFile, $output) = @_; my $cmd_cat_mt_results = "cat "; foreach my $file (`ls $folderTempFile/*.txt`) { chomp($file); $cmd_cat_mt_results = $cmd_cat_mt_results." $file"; } $cmd_cat_mt_results = $cmd_cat_mt_results." > $output"; `$cmd_cat_mt_results`; } sub recoverNumCol { my ($input, $name_of_column) = @_; open(F1,$input) or die "recoverNumCol: $!: $input\n"; # For having the name of the columns my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); close F1; # The number of the column my $name_of_column_NB = "toto"; for(my $i=0; $i<=$#tab_search_header; $i++) { if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } } if($name_of_column_NB eq "toto") { print STDERR "Error message:\n"; print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; } else { return $name_of_column_NB; } } =head1 NAME mutspec-Annot =head1 SYNOPSIS mutspecannot.pl [arguments] <query-file> <query-file> can be a folder with multiple VCF or a single VCF Arguments: -h, --help print help message -m, --man print complete documentation -v, --verbose use verbose output --refGenome the reference genome to use --interval <interger> the number of bases for the sequence context -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size --pathAVDBList the path to a text file containing the list of Annovar databases installed -temp --pathTemporary <string> the path for saving the temporary files --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) --max_cpu <integer> number of CPUs to be used for the annotation Function: automatically run a pipeline on a list of variants and annote them using Annovar Example: # Annotation only mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input Version: 03-2017 (March 2017) =head1 OPTIONS =over 8 =item B<--help> print a brief usage message and detailed explanation of options. =item B<--man> print the complete manual of the program. =item B<--verbose> use verbose output. =item B<--refGenome> the reference genome to use, could be hg19 or mm9. =item B<--interval> the number of bases surrounding the mutated bases, for the sequence context analysis. =item B<--outfile> the directory of output file names. If it is nor specify the same directory as the input file is used. =item B<--pathAnnovarDB> the path to the directory containing the Annovar databases and the files with the chromosome size. =item B<--pathAVDBList> the path to a text file containing the list of Annovar databases installed. =item B<--pathTemporary> the path for saving temporary files generated by the script. If any is specify a temporary folder is created in the same directory where the script is running. Deleted when the script is finish =item B<--fullAnnotation> Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines) =item B<--max_cpu> Specify the number of CPUs to be used. This number is used for spliting the file in n part and running the annotations in each part in parallel. =head1 DESCRIPTION MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS. Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. A text tab delimited file is produced. =cut