mutspec: mutspecAnnot.pl comparison

comparison mutspecAnnot.pl @ 7:eda59b985b1c draft default tip

Uploaded

author	iarc
date	Mon, 13 Mar 2017 08:21:19 -0400
parents	46a10309dfe2
children

comparison

equal deleted inserted replaced

-:46a10309dfe2
+:eda59b985b1c
 #!/usr/bin/env perl
 #-----------------------------------#
 # Author: Maude                     #
 # Script: mutspecAnnot.pl           #
-# Last update: 21/06/16             #
+# Last update: 02/03/17             #
 #-----------------------------------#
 use strict;
 use warnings;
 use Getopt::Long;
 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty");  # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files
 our ($intervalEnd)          = (10); # Number of bases for the flanking region for the sequence context.
 our ($fullAVDB)             = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines)
-GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp) or pod2usage(2);
+#########################################
+###     SPECIFY THE NUMBER OF CPU     ###
+#########################################
+our ($max_cpu) = 8; # Max number of CPU to use for the annotation
+GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp, 'max_cpu=i' => \$max_cpu) or pod2usage(2);
 our ($input) = @ARGV;
 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
 ######################################################################################################################################################
 #																																			GLOBAL VARIABLES																															 #
 ######################################################################################################################################################
-#########################################
-###     SPECIFY THE NUMBER OF CPU     ###
-#########################################
-our $max_cpu = 1; # Max number of CPU to use for the annotation
 # Recover the current path
 our $pwd = `pwd`;
 chomp($pwd);
+# Recover the filename and the input directory
-# Input file path
+our ($filename, $directories, $suffix)    = fileparse($input, qr/\.[^.]*/);
-our @pathInput = split("/", $input);
 # Output directories
 our ($folderMutAnalysis, $folderAnnovar) = ("", "");
 # File with the list of Annovar databases to use
 our $listAVDB = "";
 # Initialisation of chromosome, position, ref and alt values
 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a");
 ######################################################################################################################################################
 #																																								MAIN 																																 #
 ######################################################################################################################################################
 ## Check the presence of the flags and create the output and temp directories
 CheckFlags();
-## Format the file in the correct format if they are vcf or MuTect output and recover the column positions
+## Format the file correctly:
+##			1) Check the length of the filename (must be <= 31 characters)
+##			2) Recover the input format. If MuTect output consider only "KEEP" variants
+##			3) Recover the column number for chr, start, ref and alt
 FormatingInputFile();
 # Annotate the file with Annovar, add the strand orientation and the sequence context
 FullAnnotation();
 ## Check the presence of the flags and create the output and temp directories
 sub CheckFlags
 {
 	# Check the reference genome
-	if($refGenome eq "empty")   { print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; }
+	if($refGenome eq "empty")
-	if($intervalEnd eq "empty") { print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; exit; }
+	{
+		print STDERR "Missing flag !\n";
+		print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n";
+		exit;
+	}
+	if($intervalEnd eq "empty")
+	{
+		print STDERR "Missing flag !\n";
+		print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n";
+		exit;
+	}
 	# If no output is specified, write the result in the same place as the input file
 	if($output eq "empty")
 	{
-		my $folderRes         = "";
+		my $directory = dirname( $input );
-		for(my $i=0; $i<$#pathInput; $i++) { $folderRes .= "$pathInput[$i]/"; }
+		$folderMutAnalysis = "$directory/Mutational_Analysis";
-		$folderMutAnalysis = "$folderRes/Mutational_Analysis";
 		if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
 	}
 	else
 	{
 		if(!-e $output) { mkdir($output) or die "$!: $output\n"; }
 	# Create the output folder for Annovar
 	$folderAnnovar         = "$folderMutAnalysis/Annovar";
 	if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; }
 	# Verify the access to Annovar databases
-	if($path_AVDB eq "empty") { print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; exit; }
+	if($path_AVDB eq "empty")
-	elsif(!-e $path_AVDB) { print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; exit; }
+	{
+		print STDERR "Missing flag !\n";
+		print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n";
+		exit;
+	}
+	elsif(!-e $path_AVDB)
+	{
+		print STDERR "Error message:\n";
+		print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n";
+	}
 	# Check the file list AV DB
-	if($pathAVDBList eq "empty") { print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; exit; }
+	if($pathAVDBList eq "empty")
+	{
+		print STDERR "Missing flag !\n";
+		print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n";
+		exit;
+	}
 	else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" }
 	# If no temp folder is specified write the result in the current path
-	if($folder_temp eq "empty") { $folder_temp   = "$pwd/TEMP_MutationalAnalysis_$pathInput[$#pathInput]"; }
+	if($folder_temp eq "empty") { $folder_temp   = "$pwd/TEMP_MutationalAnalysis_$filename"; }
 	if(!-e $folder_temp)        { mkdir($folder_temp) or die "$!: $folder_temp\n"; }
+	# Verify listAVDB is not empty
+	if($listAVDB eq "")
+	{
+		print STDERR "Path to the text file containing the list of Annovar databases installed is not specified !!!\n";
+		exit;
+	}
 }
 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions
 sub FormatingInputFile
 {
 	{
 		foreach my $file (`ls $input`)
 		{
 			my $headerOriginalFile = "";
 			chomp($file);
-			my ($filename, $directories, $suffix) = fileparse("$input/$file", qr/\.[^.]*/);
 			CheckLengthFilename("$input/$file");
 			#################################################
 			###						Recover the input format 				###
 	else
 	{
 		my $headerOriginalFile = "";
 		CheckLengthFilename($input);
-		my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/);
 		#################################################
 		###						Recover the input format 				###
 		#################################################
 		RecoverInputFormat($input, \$headerOriginalFile);
 # The name for the Excel sheet can't be longer than 31 characters
 sub CheckLengthFilename
 {
 	my ($inputFile) = @_;
-	## Verify the name of file, must be <= 31 chars for the sheet name
+	my ($filenameInputFile, $directoriesInputFile, $suffixInputFile) = fileparse($inputFile, qr/\.[^.]*/);
-	my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);
+	if(length($filenameInputFile) > 32)
-	if(length($filename) > 32) { print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; exit; }
+	{
-}
+		print STDERR "Error message:\n";
+		print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n";
-# Recover the input format (vcf or txt) and depending on the format convert the input file in a suitable format for Annovar (ex: for MuTect files keep only the confident variants)
+	}
+}
+# Recover the input format. If MuTect output consider only "KEEP" variants
 sub RecoverInputFormat
 {
 	my ($file, $refS_headerOriginalFile) = @_;
 	my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
 # Dictionary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles.
 sub RecoverColNameAuto
 {
 	our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_;
-	$header      =~ s/[\r\n]+$//;
+	$header =~ s/[\r\n]+$//;
 	## Name of the columns
-	my @mutect     = qw(contig position ref_allele alt_allele);
+	my @mutect           = qw(contig position ref_allele alt_allele);
-	my @vcf        = qw(CHROM POS REF ALT);
+	my @vcf              = qw(CHROM POS REF ALT);
-	my @cosmic     = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic);
+	my @cosmic           = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic);
-	my @icgc       = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele);
+	my @icgc             = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele);
-	my @tcga       = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2);
+	my @tcga             = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2);
-	my @ionTorrent = qw(chr Position Ref Alt);
+	my @ionTorrent       = qw(chr Position Ref Alt);
-	my @proton     = qw(Chrom Position Ref Variant);
+	my @proton           = qw(Chrom Position Ref Variant);
-	my @varScan2   = qw(Chrom Position Ref VarAllele);
+	my @varScan2         = qw(Chrom Position Ref VarAllele);
-	my @annovar    = qw(Chr Start Ref Obs);
+	my @varScan2_somatic = qw(chrom position ref var);
-	my @custom     = qw(Chromosome Start Wild_Type Mutant);
+	my @annovar          = qw(Chr Start Ref Obs);
+	my @custom           = qw(Chromosome Start Wild_Type Mutant);
-	my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@annovar, \@custom);
+	my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@varScan2_somatic, \@annovar, \@custom);
 	my $timer  = 0; # For controlling if the names are present on the dictionnary or not
 	foreach my $refTab (@allTab)
 	{
 		my @tab = @$refTab;
 		else { $timer++; }
 	}
 	if($timer == scalar(@allTab))
 	{
+		print STDERR "Error message:\n";
 		print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n";
 		print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n";
 		exit;
 	}
 		foreach my $file (`ls $folder_temp/*.txt`)
 		{
 			chomp($file);
 			# Recover the name of the file without extension, the directory where the file is, and the extension of the file
-			my ($filename, $directories, $suffix)    = fileparse("$folder_temp/$file", qr/\.[^.]*/);
+			my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/);
 			my $filenameOK = "";
 			# For removing the ColumnCorrect for txt files
 			if($filename =~ /(.+)-ColumnCorrect/)
 			{
 				if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; }
 				else { $filenameOK = $1; }
 			}
-			else { print STDERR "Case not considered for $filename!!!\n"; exit; }
+			else { print STDERR "Error message:\n"; print STDERR "Case not considered for $filename!!!\n"; }
 			#################################################
 			###						 Cut the files in n part  		  ###
 			#################################################
-			# Recover the number of variants in the file for deciding the number of CPU to use
+			# Split the file into n parts and set the number of CPUs to use for the annotation, both derived from the number of lines
 			my $cpu = 0;
-			my $nbVariants = `wc -l $file`;
+			# Keep the original header
-			$nbVariants =~ /(\d+).+/;
+			my $headerOriginalFile = "";
+			# Save the number of lines
-			if($1-1 <= 5000)      { $cpu = 1; }
+			my $nbLine  = 0;
-			elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; }
+			splitInputFile($file, \$cpu, \$headerOriginalFile, $filenameOK, \$nbLine);
-			elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; }
-			else { $cpu = $max_cpu; }
-			# If the number predefined can't be used on the machine use the maximum number specify by the administrator
-			if($cpu > $max_cpu) { $cpu = $max_cpu }
-			## Recover the header
-			open(F1, $file) or die "$!: $file\n";
-			my $headerOriginalFile = <F1>;
-			close F1;
-			## Remove the first line of the file
-			my $fileNoHeader = "$folder_temp/${filenameOK}-NoHeader";
-			`sed 1d $file > $fileNoHeader`;
-			if(!-e "$folder_temp/$filenameOK") { mkdir("$folder_temp/$filenameOK") or die "Can't create the directory $folder_temp/$filenameOK\n"; }
-			my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0
-			`split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameOK/$filenameOK-`;
-			if($headerOriginalFile eq "") { print STDERR "No header for the file $file!!!\nPlease check the format of your file\n"; exit; }
-			my @files = <$folder_temp/$filenameOK/$filenameOK-*>;
 			#################################################
 			###							Annotate the n part  		 		  ###
 			#################################################
-			my $pm = Parallel::ForkManager->new($cpu);
+			annotateFile($cpu, $filenameOK, $headerOriginalFile);
-			foreach my $tempFile (@files)
-			{
+			#################################################
-				# Forks and returns the pid for the child:
+			###					Paste the file together 		 		  ###
-			 	my $pid = $pm->start and next;
+			#################################################
+			createOutput($filenameOK, $headerOriginalFile, $nbLine);
-			 	# Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo
-			 	my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\-[^.]*/);
-				my $outFilenameTemp = $filename.$suffix;
-				Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");
-				# Annotate the file with Annovar
-				my $tempFileName_AVOutput = $filename.$suffix.".".${refGenome}."_multianno.txt";
-				if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
-				else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
-				# Check if the annotations worked
-				open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
-				while(<F1>)
-				{
-					if($_ =~ /ERROR/i)
-					{
-						print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
-						print STDERR $_;
-						print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
-						exit;
-					}
-				}
-				close F1;
-				# Recover the strand orientation
-				my $length_AVheader = 0;
-				RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);
-				# Recover the sequence context
-				RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameOK/$outFilenameTemp".".".${refGenome}."_multianno.txt");
-				$pm->finish; # Terminates the child process
-			}
-			# Wait all the child process
-			$pm->wait_all_children;
-			# Paste the file together
-			CombinedTempFile("$folder_temp/$filenameOK", "$folderAnnovar/$filenameOK".".".${refGenome}."_multianno.txt");
 		}
 	}
 	# The input file is one file
 	else
 	{
-		my ($filenameO, $directoriesO, $suffixO) = fileparse($input, qr/\.[^.]*/);
 		#################################################
 		###						 Cut the files in n part  		  ###
 		#################################################
-		# Recover the number of variants in the file for deciding the number of CPU to use
+		# Split the file into n parts and set the number of CPUs to use for the annotation, both derived from the number of lines
 		my $cpu = 0;
-		my $nbVariants = `wc -l $folder_temp/$filenameO-ColumnCorrect.txt`;
+		# Keep the original header
-		$nbVariants =~ /(\d+).+/;
+		my $headerOriginalFile = "";
+		# Save the number of lines
-		if($1-1 <= 5000)      { $cpu = 1; }
+		my $nbLine  = 0;
-		elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; }
+		splitInputFile("$folder_temp/$filename-ColumnCorrect.txt", \$cpu, \$headerOriginalFile, $filename, \$nbLine);
-		elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; }
-		else { $cpu = $max_cpu; }
-		# If the number predefined can't be used on the machine use the maximum number specify by the administrator
-		if($cpu > $max_cpu) { $cpu = $max_cpu }
-		## Recover the header
-		open(F1, "$folder_temp/$filenameO-ColumnCorrect.txt") or die "$!: $folder_temp/$filenameO-ColumnCorrect.txt\n";
-		my $headerOriginalFile = <F1>;
-		close F1;
-		## Remove the first line of the file
-		my $fileNoHeader = "$folder_temp/$filenameO-NoHeader";
-		`sed 1d $folder_temp/$filenameO-ColumnCorrect.txt > $fileNoHeader`;
-		if(!-e "$folder_temp/$filenameO") { mkdir("$folder_temp/$filenameO") or die "Can't create the directory $folder_temp/$filenameO\n"; }
-		my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0
-		`split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameO/$filenameO-`;
-		if($headerOriginalFile eq "") { print STDERR "No header for the file $input!!!\nPlease check the format of your file\n"; exit; }
-		my @files = <$folder_temp/$filenameO/$filenameO-*>;
 		#################################################
 		###							Annotate the n part  		 		  ###
 		#################################################
-		my $pm = Parallel::ForkManager->new($cpu);
+		annotateFile($cpu, $filename, $headerOriginalFile);
-		foreach my $tempFile (@files)
-		{
-			# Forks and returns the pid for the child:
+		#################################################
-			my $pid = $pm->start and next;
+		###					Paste the file together 		 		  ###
+		#################################################
-			# Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo
+		createOutput($filename, $headerOriginalFile, $nbLine);
-			# For recover the name of the file without extension, the directory were the file is and the extension of the file
-			my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\.[^.]*/);
-			my $outFilenameTemp = $filename.$suffix;
-			Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");
-			# Annotate the file with Annovar
-			my $tempFileName_AVOutput = $outFilenameTemp.".".${refGenome}."_multianno.txt";
-			if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
-			else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
-			# Check if the annotations worked
-				open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
-				while(<F1>)
-				{
-					if($_ =~ /ERROR/i)
-					{
-						print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
-						print STDERR $_;
-						print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
-						exit;
-					}
-				}
-				close F1;
-			# Recover the strand orientation
-			my $length_AVheader = 0;
-			RecoverStrand("$folder_temp/$tempFileName_AVOutput",  $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);
-			# Recover the sequence context
-			RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameO/$tempFileName_AVOutput");
-			$pm->finish; # Terminates the child process
-		}
-		# Wait all the child process
-		$pm->wait_all_children;
-		# Paste the file together
-		CombinedTempFile("$folder_temp/$filenameO", "$folderAnnovar/$filenameO".".".${refGenome}."_multianno.txt");
 	}
 	# Remove the temporary directory
 	rmtree($folder_temp);
+}
+sub splitInputFile
+{
+	my ($inputFile, $ref_cpu, $ref_header, $filename, $ref_nbLine) = @_;
+	my $nbVariants = `wc -l $inputFile`;
+	$nbVariants =~ /(\d+).+/;
+	$$ref_nbLine  = $1;
+	if($$ref_nbLine-1 <= 5000)      { $$ref_cpu = 1; }
+	elsif( ($$ref_nbLine-1 > 5000) && ($$ref_nbLine-1 < 25000) )    { $$ref_cpu = 2; }
+	elsif( ($$ref_nbLine-1 >= 25000) && ($$ref_nbLine-1 < 100000) ) { $$ref_cpu = 8; }
+	else { $$ref_cpu = $max_cpu; }
+	# If the predefined number can't be used on the machine, use the maximum number specified by the administrator
+	if($$ref_cpu > $max_cpu) { $$ref_cpu = $max_cpu }
+	## Recover the header
+	open(F1, $inputFile) or die "$!: $inputFile\n";
+	$$ref_header= <F1>;
+	close F1;
+	## Remove the first line of the file
+	my $fileNoHeader = "$folder_temp/${filename}-NoHeader";
+	`sed 1d $inputFile > $fileNoHeader`;
+	if(!-e "$folder_temp/$filename") { mkdir("$folder_temp/$filename") or die "Can't create the directory $folder_temp/$filename\n"; }
+	my $lines_per_temp = int(1+($1 / $$ref_cpu)); # +1 in case of the div == 0
+	`split -l $lines_per_temp $fileNoHeader $folder_temp/$filename/$filename-`;
+	if($$ref_header eq "") { print STDERR "Error message:\n"; print STDERR "No header for the file $inputFile!!!\nPlease check the format of your file\n"; }
+}
+sub annotateFile
+{
+	my ($cpu, $filename, $headerOriginalFile) = @_;
+	my $pm = Parallel::ForkManager->new($cpu);
+	foreach my $tempFile (`ls $folder_temp/$filename/$filename-*`)
+	{
+		chomp($tempFile);
+		# Forks and returns the pid for the child:
+	 	my $pid = $pm->start and next;
+		# Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo
+		my ($filenameTempFile, $directoriesTempFile, $suffixTempFile) = fileparse($tempFile, qr/\-[^.]*/);
+		my $outFilenameTemp = $filenameTempFile.$suffixTempFile;
+		Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");
+		# Annotate the file with Annovar
+		my $tempFileName_AVOutput = $filename.$suffixTempFile.".".${refGenome}."_multianno.txt";
+		if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
+		else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
+		# Check if the annotations worked
+		open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
+		while(<F1>)
+		{
+			if($_ =~ /ERROR/i)
+			{
+				print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
+				print STDERR $_;
+				print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
+			}
+		}
+		close F1;
+		# Recover the strand orientation
+		my $length_AVheader = 0;
+		RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);
+		# Recover the sequence context
+		RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filename/$outFilenameTemp".".".${refGenome}."_multianno.txt");
+		$pm->finish; # Terminates the child process
+	}
+	# Wait for all the child processes
+	$pm->wait_all_children;
+}
+sub createOutput
+{
+	my ($filename, $headerOriginalFile, $nbLine) = @_;
+	## For MuTect and MuTect2 calling, only variants passing MuTect filters are kept, and sometimes no variant passes these filters, which causes an error in Galaxy when using "collection".
+	if($nbLine == 1)
+	{
+		print STDOUT "\nThe sample $filename didn't pass MuTect filters\n";
+		### Print Annovar minimal header + the original header of the input file
+		my $outputFile = "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt";
+		open(OUT, ">", $outputFile) or die "$!: $outputFile\n";
+		if($fullAVDB eq "no")
+		{
+			print OUT "Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tStrand\tcontext";
+			print OUT "\t".$headerOriginalFile;
+		}
+		### Print complete Annovar header (using the database name present in the file listAVDB) + the original header of the input file
+		else
+		{
+			print OUT "Chr\tStart\tEnd\tRef\tAlt";
+			open(F1, $listAVDB) or die "$!: $listAVDB\n";
+			while(<F1>)
+			{
+				if($_ =~ /^#/) { next; }
+				my @tab = split("\t", $_);
+				$tab[0] =~ /$refGenome\_(.+)\.txt/;
+				my $dbName = $1;
+				if($dbName =~ /refGene|knownGene|ensGene/)
+				{
+					print OUT "\t"."Func.$dbName\tGene.$dbName\tGeneDetail.$dbName\tExonicFunc.$dbName\tAAChange.$dbName";
+				}
+				else
+				{
+					print OUT "\t".$dbName;
+				}
+			}
+			print OUT "\tStrand\tcontext\t".$headerOriginalFile;
+			close F1;
+		}
+		close OUT;
+	}
+	else
+	{
+		CombinedTempFile("$folder_temp/$filename", "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt");
+	}
 }
 sub Convert2AV
 {
 	my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_;
 	while(<F1>)
 	{
 		$_ =~ s/[\r\n]+$//;
 		my @tab = split("\t", $_);
 		my $chr = "";
-		# Don't consider chrM and GL
-		if($tab[$chr_value] =~ /M|GL/i) { next; }
 		# Replace chr23 or chr24 by X or Y
 		if($tab[$chr_value] =~ /23/)     { $chr = "chrX"; }
 		elsif($tab[$chr_value] =~ /24/)  { $chr = "chrY"; }
 		elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; }
 		else                             { $chr = "chr".$tab[$chr_value]; }
+		### Consider only "normal" chromosomes for the annotation
+		if( ($chr !~ /chr\d{1,2}$|chrX|chrY/) ) { next; }
+		### Don't consider variants with two or more alt bases
+		if($tab[$alt_value] =~ /\,/) { next; }
 		### Reformat the Indels for Annovar
-			# chr1	85642631	C	    CT  => chr1	85642631	85642631	-	  T   (mm10)
+		# chr1	85642631	C	    CT  => chr1	85642631	85642631	-	  T   (mm10)
-			# chr5	26085724	ACTT	A   => chr5	26085725	26085727	CTT	-   (mm10)
+		# chr5	26085724	ACTT	A   => chr5	26085725	26085727	CTT	-   (mm10)
-			if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) )
+		if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) )
-			{
+		{
-				### First check if the indels in the file are not already correctly formated
+			### First check if the indels in the file are not already correctly formatted
-				if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") )
+			if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") )
-				{
+			{
-					# For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels)
+				# For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels)
-					# Insertion: start = start & end = start
+				# Insertion: start = start & end = start
-					if($tab[$ref_value] =~ /\-/)
+				if($tab[$ref_value] =~ /\-/)
-					{
+				{
-						print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]";
+					print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]";
-					}
+				}
-					## Deletion: start = start & end = start + length(del) -1
+				## Deletion: start = start & end = start + length(del) -1
-					else
-					{
-						my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1);
-						print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]";
-					}
-				}
-				### Indels not correctly formated for Annovar
 				else
 				{
-					my @tabRef = split("", $tab[$ref_value]);
+					my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1);
-					my @tabAlt = split("", $tab[$alt_value]);
+					print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]";
+				}
-					# Remove the first base
+			}
-					my $ref2 = join("", @tabRef[1 .. $#tabRef]);
+			### Indels not correctly formatted for Annovar
-					my $alt2 = join("", @tabAlt[1 .. $#tabAlt]);
-					if(length($alt2) == 0)
-					{
-						my $altOK   = "-";
-						my $startOK = $tab[$start_value] + 1;
-						my $stopOK  = $startOK + length($ref2) - 1;
-						print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK;
-					}
-					if(length($ref2) == 0)
-					{
-						my $refOK = "-";
-						print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2;
-					}
-				}
-			}
-			### SBS
 			else
 			{
-				print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value];
+				my @tabRef = split("", $tab[$ref_value]);
-			}
+				my @tabAlt = split("", $tab[$alt_value]);
-			## Print the original file at the end
+				# Remove the first base
-			foreach  (@tab) {  print OUT "\t$_"; }
+				my $ref2 = join("", @tabRef[1 .. $#tabRef]);
-			print OUT "\n";
+				my $alt2 = join("", @tabAlt[1 .. $#tabAlt]);
+				if(length($alt2) == 0)
+				{
+					my $altOK   = "-";
+					my $startOK = $tab[$start_value] + 1;
+					my $stopOK  = $startOK + length($ref2) - 1;
+					print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK;
+				}
+				if(length($ref2) == 0)
+				{
+					my $refOK = "-";
+					print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2;
+				}
+			}
+		}
+		### SBS
+		else
+		{
+			print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value];
+		}
+		## Print the original file at the end
+		foreach  (@tab) {  print OUT "\t$_"; }
+		print OUT "\n";
 	}
 	close F1; close OUT;
 }
 sub AnnotateAV
 {
 	my ($inputFile, $output) = @_;
-	if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; }
+	if(!-e $path_AVDB)
+	{
+		print STDERR "Error message:\n";
+		print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n";
+		print STDERR "Please install the database for this genome before running Annovar\n";
+	}
 	# Extract the name of the databases
 	my $protocol = ""; my $operation = "";
 	ExtractAVDBName($listAVDB, \$protocol, \$operation);
 			elsif($$refS_month == 8)  { $$refS_month = "aug"; }
 			elsif($$refS_month == 9)  { $$refS_month = "sept"; }
 			elsif($$refS_month == 10) { $$refS_month = "oct"; }
 			elsif($$refS_month == 11) { $$refS_month = "nov"; }
 			elsif($$refS_month == 12) { $$refS_month = "dec"; }
-			else { print STDERR "Month number don't considered\n"; exit; }
 		}
 	}
 }
 ### Add the minimum of annotations (refGene + strand + context)
 sub annotateAV_min
 {
 	my ($inputFile, $output) = @_;
-	if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; }
+	if(!-e $path_AVDB)
+	{
+		print STDERR "Error message:\n";
+		print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n";
+		print STDERR "Please install the database for this genome before running Annovar\n";
+	}
 	# Extract the name of the databases
 	my ($protocol, $operation) = ("refGene", "g");
 	`table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`;
 		elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; }
 		elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; }
 		else { $chr = $tab[$chr_value]; }
 		# Verify if the element exists
-		if($chr eq "")                       { print "Error RecoverStrand: The chromosome value is nor defined for $_\n"; exit; }
+		if($chr eq "")                       { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The chromosome value is nor defined for $_\n"; }
-		if(! exists $tab[$start_value])      { print "Error RecoverStrand: The start value is nor defined for $_\n"; exit; }
+		if(! exists $tab[$start_value])      { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The start value is nor defined for $_\n"; }
-		if(! exists $tab[$ref_value])        { print "Error RecoverStrand: The reference value is nor defined for $_\n"; exit; }
+		if(! exists $tab[$ref_value])        { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The reference value is nor defined for $_\n"; }
-		if(! exists $tab[$alt_value])        { print "Error RecoverStrand: The alternate value is nor defined for $_\n"; exit; }
+		if(! exists $tab[$alt_value])        { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The alternate value is nor defined for $_\n"; }
-		if(! exists $tab[$func_value])       { print "Error RecoverStrand: The functional value is nor defined for $_\n"; exit; }
+		if(! exists $tab[$func_value])       { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The functional value is nor defined for $_\n"; }
-		if(! exists $tab[$geneSymbol_value]) { print "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; exit; }
+		if(! exists $tab[$geneSymbol_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; }
 		my $geneSymbol = "";
 		######## For the splicing annotation we separate the gene symbol from the aa change
 		if($tab[$func_value] eq "splicing")
 		{
 	{
 		$_ =~ s/[\r\n]+$//;
 		my @tab = split("\t", $_);
 		my $strand = "";
 		$strand = $tab[$db_strandInfo_value];
-		if($strand eq "") { print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; exit; }
+		if($strand eq "") { print STDERR "Error message:\n"; print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; }
 		else
 		{
 			# Some genes have several strand orientation, keep the first in the database
 			if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; }
 		}
 		foreach my $kDB (sort keys %h_database)
 		{
 			if("$tab[5]:$tab[0]" eq $kDB)
 			{
-				if($lengthHeader != $lengthLine) { print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; exit; }
+				if($lengthHeader != $lengthLine)
+				{
+					print STDERR "Error message:\n";
+					print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n";
+				}
 				foreach my $line (@{$h_inputFile{$kFile}})
 				{
 					my @tab = split("\t", $line);
 					my $j = 0;
 		my %seqhash  = (); #database sequence for each chromosome
 		my %name_seq = (); #sequence for each region
 		my (%seqlen, %discordlen, %badorf);	#store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon
 		my ($count_success, @failure) = (0);
-		for my $curchr (sort keys $refH_allRegion)
+		for my $curchr (sort keys %{$refH_allRegion})
 		{
 			my ($seqid, $curseq) = ('', '');
 			my $fastafile        = "";
 			if ($curchr =~ m/^chr/)
 			{
 			$header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header);
 			# Print the Annovar header until the column before OtherInfo
 			print OUT "$tabHeaderInput[0]";
 			my $j = 0;
 			for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; }
-			print OUT "\tcontext";
+			print OUT "\tcontext\ttrinucleotide_context";
 			for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; }
 			print OUT "\n";
 		}
-		foreach my $k_hFile (sort keys $refH_InputFile)
+		foreach my $k_hFile (sort keys %{$refH_InputFile})
 		{
-			foreach my $k_allRegonSeqContext (sort keys $refH_allRegionSeqContext)
+			foreach my $k_allRegonSeqContext (sort keys %{$refH_allRegionSeqContext})
 			{
 				if($k_hFile eq $k_allRegonSeqContext)
 				{
 					my $j=0;
 					for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++)
 					{
 						my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]);
+						# Write Annovar annotation + strand orientation
 						for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; }
+						# Write the sequence context with the length defined by the user (default is 10)
 						print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext};
+						# Write the trinucleotide context
+						my $contextSequence           = $refH_allRegionSeqContext->{$k_allRegonSeqContext}; $contextSequence =~ tr/a-z/A-Z/;
+						my @tempContextSequence       = split("", $contextSequence);
+						my $midlle_totalNbBaseContext = (scalar(@tempContextSequence)-1)/2; # For having the middle of the sequence
+						print OUT "\t".$tempContextSequence[$midlle_totalNbBaseContext-1]."x".$tempContextSequence[$midlle_totalNbBaseContext+1];
+						# Write the original columns
 						for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
 						print OUT "\n";
 					}
 					last;
 				}
 my $name_of_column_NB  = "toto";
 for(my $i=0; $i<=$#tab_search_header; $i++)
 {
 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; }
 }
-if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
+if($name_of_column_NB eq "toto")
-else                             { return $name_of_column_NB; }
+{
+	print STDERR "Error message:\n";
+	print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n";
+}
+else { return $name_of_column_NB; }
 }
 -v,        --verbose                     use verbose output
 --refGenome                   the reference genome to use
 --interval <integer>          the number of bases for the sequence context
 -o,        --outfile <string>            output directory for the result. If none is specified, the result will be written in the same directory as the input file
 -AVDB      --pathAnnovarDB <string>      the path to Annovar database and the files with the chromosome size
---pathAVDBList                the path to the list of AV databases installed
+--pathAVDBList                the path to a text file containing the list of Annovar databases installed
 -temp      --pathTemporary <string>      the path for saving the temporary files
 --fullAnnotation <string>     recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no)
+--max_cpu <integer>           number of CPUs to be used for the annotation
 Function: automatically run a pipeline on a list of variants and annotate them using Annovar
 Example: # Annotation only
 mutspecAnnot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input
-Version: 06-2016 (June 2016)
+Version: 03-2017 (March 2017)
 =head1 OPTIONS
 =over 8
 =item B<--verbose>
 use verbose output.
 =item B<--refGenome>
 the reference genome to use, could be hg19 or mm9.
 =item B<--interval>
 the path to the directory containing the Annovar databases and the files with the chromosome size.
 =item B<--pathAVDBList>
-the path to a texte file containing the list of the Annovar databases installed.
+the path to a text file containing the list of Annovar databases installed.
 =item B<--pathTemporary>
 the path for saving temporary files generated by the script.
 If none is specified, a temporary folder is created in the directory where the script is running.
 =item B<--fullAnnotation>
 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for a quicker annotation (for large files with millions of lines).
+=item B<--max_cpu>
+Specify the number of CPUs to be used. This number is used for splitting the file into n parts and running the annotation on each part in parallel.
 =head1 DESCRIPTION
 MutSpec-Annot is a Perl script for adding annotations to a list of genetic variants generated with NGS.
 Functional annotations are added using the ANNOVAR software. Transcript strand orientation is added using the RefSeq database, and the sequence context for x bases flanking the variant positions is also added.
 A text tab delimited file is produced.

Mercurial > repos > iarc > mutspec

comparison mutspecAnnot.pl @ 7:eda59b985b1c draft default tip