Mercurial > repos > iarc > mutspec
comparison mutspecAnnot.pl @ 7:eda59b985b1c draft default tip
Uploaded
| author | iarc |
|---|---|
| date | Mon, 13 Mar 2017 08:21:19 -0400 |
| parents | 46a10309dfe2 |
| children |
comparison
equal
deleted
inserted
replaced
| 6:46a10309dfe2 | 7:eda59b985b1c |
|---|---|
| 1 #!/usr/bin/env perl | 1 #!/usr/bin/env perl |
| 2 | 2 |
| 3 #-----------------------------------# | 3 #-----------------------------------# |
| 4 # Author: Maude # | 4 # Author: Maude # |
| 5 # Script: mutspecAnnot.pl # | 5 # Script: mutspecAnnot.pl # |
| 6 # Last update: 21/06/16 # | 6 # Last update: 02/03/17 # |
| 7 #-----------------------------------# | 7 #-----------------------------------# |
| 8 | 8 |
| 9 use strict; | 9 use strict; |
| 10 use warnings; | 10 use warnings; |
| 11 use Getopt::Long; | 11 use Getopt::Long; |
| 18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. | 18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. |
| 19 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files | 19 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files |
| 20 our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context. | 20 our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context. |
| 21 our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines) | 21 our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines) |
| 22 | 22 |
| 23 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp) or pod2usage(2); | 23 |
| 24 ######################################### | |
| 25 ### SPECIFY THE NUMBER OF CPU ### | |
| 26 ######################################### | |
| 27 our ($max_cpu) = 8; # Max number of CPU to use for the annotation | |
| 28 | |
| 29 | |
| 30 | |
| 31 | |
| 32 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp, 'max_cpu=i' => \$max_cpu) or pod2usage(2); | |
| 24 | 33 |
| 25 our ($input) = @ARGV; | 34 our ($input) = @ARGV; |
| 26 | 35 |
| 27 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); | 36 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); |
| 28 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); | 37 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); |
| 33 | 42 |
| 34 ###################################################################################################################################################### | 43 ###################################################################################################################################################### |
| 35 # GLOBAL VARIABLES # | 44 # GLOBAL VARIABLES # |
| 36 ###################################################################################################################################################### | 45 ###################################################################################################################################################### |
| 37 | 46 |
| 38 ######################################### | |
| 39 ### SPECIFY THE NUMBER OF CPU ### | |
| 40 ######################################### | |
| 41 our $max_cpu = 1; # Max number of CPU to use for the annotation | |
| 42 | |
| 43 | |
| 44 # Recover the current path | 47 # Recover the current path |
| 45 our $pwd = `pwd`; | 48 our $pwd = `pwd`; |
| 46 chomp($pwd); | 49 chomp($pwd); |
| 47 | 50 # Recover the filename and the input directory |
| 48 # Input file path | 51 our ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); |
| 49 our @pathInput = split("/", $input); | |
| 50 # Output directories | 52 # Output directories |
| 51 our ($folderMutAnalysis, $folderAnnovar) = ("", ""); | 53 our ($folderMutAnalysis, $folderAnnovar) = ("", ""); |
| 52 # File with the list of Annovar databases to use | 54 # File with the list of Annovar databases to use |
| 53 our $listAVDB = ""; | 55 our $listAVDB = ""; |
| 54 # Initialisation of chromosome, position, ref and alt values | 56 # Initialisation of chromosome, position, ref and alt values |
| 55 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a"); | 57 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a"); |
| 56 | 58 |
| 57 | 59 |
| 60 | |
| 58 ###################################################################################################################################################### | 61 ###################################################################################################################################################### |
| 59 # MAIN # | 62 # MAIN # |
| 60 ###################################################################################################################################################### | 63 ###################################################################################################################################################### |
| 61 ## Check the presence of the flags and create the output and temp directories | 64 ## Check the presence of the flags and create the output and temp directories |
| 62 CheckFlags(); | 65 CheckFlags(); |
| 63 | 66 |
| 64 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions | 67 ## Format the file correctly: |
| 68 ## 1) Check the length of the filename (must be <= 31 characters) | |
| 69 ## 2) Recover the input format. If MuTect output consider only "KEEP" variants | |
| 70 ## 3) Recover the column number for chr, start, ref and alt | |
| 65 FormatingInputFile(); | 71 FormatingInputFile(); |
| 66 | 72 |
| 67 # Annotate the file with Annovar, add the strand orientation and the sequence context | 73 # Annotate the file with Annovar, add the strand orientation and the sequence context |
| 68 FullAnnotation(); | 74 FullAnnotation(); |
| 69 | 75 |
| 73 | 79 |
| 74 ## Check the presence of the flags and create the output and temp directories | 80 ## Check the presence of the flags and create the output and temp directories |
| 75 sub CheckFlags | 81 sub CheckFlags |
| 76 { | 82 { |
| 77 # Check the reference genome | 83 # Check the reference genome |
| 78 if($refGenome eq "empty") { print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; } | 84 if($refGenome eq "empty") |
| 79 if($intervalEnd eq "empty") { print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; exit; } | 85 { |
| 86 print STDERR "Missing flag !\n"; | |
| 87 print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; | |
| 88 exit; | |
| 89 } | |
| 90 if($intervalEnd eq "empty") | |
| 91 { | |
| 92 print STDERR "Missing flag !\n"; | |
| 93 print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; | |
| 94 exit; | |
| 95 } | |
| 80 # If no output is specified write the result as the same place as the input file | 96 # If no output is specified write the result as the same place as the input file |
| 81 if($output eq "empty") | 97 if($output eq "empty") |
| 82 { | 98 { |
| 83 my $folderRes = ""; | 99 my $directory = dirname( $input ); |
| 84 for(my $i=0; $i<$#pathInput; $i++) { $folderRes .= "$pathInput[$i]/"; } | 100 |
| 85 | 101 $folderMutAnalysis = "$directory/Mutational_Analysis"; |
| 86 $folderMutAnalysis = "$folderRes/Mutational_Analysis"; | |
| 87 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } | 102 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } |
| 88 } | 103 } |
| 89 else | 104 else |
| 90 { | 105 { |
| 91 if(!-e $output) { mkdir($output) or die "$!: $output\n"; } | 106 if(!-e $output) { mkdir($output) or die "$!: $output\n"; } |
| 96 # Create the output folder for Annovar | 111 # Create the output folder for Annovar |
| 97 $folderAnnovar = "$folderMutAnalysis/Annovar"; | 112 $folderAnnovar = "$folderMutAnalysis/Annovar"; |
| 98 if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; } | 113 if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; } |
| 99 | 114 |
| 100 # Verify the access to Annovar databases | 115 # Verify the access to Annovar databases |
| 101 if($path_AVDB eq "empty") { print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; exit; } | 116 if($path_AVDB eq "empty") |
| 102 elsif(!-e $path_AVDB) { print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; exit; } | 117 { |
| 118 print STDERR "Missing flag !\n"; | |
| 119 print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; | |
| 120 exit; | |
| 121 } | |
| 122 elsif(!-e $path_AVDB) | |
| 123 { | |
| 124 print STDERR "Error message:\n"; | |
| 125 print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; | |
| 126 } | |
| 103 | 127 |
| 104 # Check the file list AV DB | 128 # Check the file list AV DB |
| 105 if($pathAVDBList eq "empty") { print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; exit; } | 129 if($pathAVDBList eq "empty") |
| 130 { | |
| 131 print STDERR "Missing flag !\n"; | |
| 132 print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; | |
| 133 exit; | |
| 134 } | |
| 106 else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" } | 135 else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" } |
| 107 | 136 |
| 108 # If no temp folder is specified write the result in the current path | 137 # If no temp folder is specified write the result in the current path |
| 109 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$pathInput[$#pathInput]"; } | 138 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; } |
| 110 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } | 139 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } |
| 140 | |
| 141 # Verify listAVDB is not empty | |
| 142 if($listAVDB eq "") | |
| 143 { | |
| 144 print STDERR "Path to the text file containing the list of Annovar databases installed is not specified !!!\n"; | |
| 145 exit; | |
| 146 } | |
| 111 } | 147 } |
| 112 | 148 |
| 113 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions | 149 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions |
| 114 sub FormatingInputFile | 150 sub FormatingInputFile |
| 115 { | 151 { |
| 118 { | 154 { |
| 119 foreach my $file (`ls $input`) | 155 foreach my $file (`ls $input`) |
| 120 { | 156 { |
| 121 my $headerOriginalFile = ""; | 157 my $headerOriginalFile = ""; |
| 122 chomp($file); | 158 chomp($file); |
| 123 my ($filename, $directories, $suffix) = fileparse("$input/$file", qr/\.[^.]*/); | |
| 124 | 159 |
| 125 CheckLengthFilename("$input/$file"); | 160 CheckLengthFilename("$input/$file"); |
| 126 | 161 |
| 127 ################################################# | 162 ################################################# |
| 128 ### Recover the input format ### | 163 ### Recover the input format ### |
| 134 else | 169 else |
| 135 { | 170 { |
| 136 my $headerOriginalFile = ""; | 171 my $headerOriginalFile = ""; |
| 137 | 172 |
| 138 CheckLengthFilename($input); | 173 CheckLengthFilename($input); |
| 139 my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); | |
| 140 | 174 |
| 141 ################################################# | 175 ################################################# |
| 142 ### Recover the input format ### | 176 ### Recover the input format ### |
| 143 ################################################# | 177 ################################################# |
| 144 RecoverInputFormat($input, \$headerOriginalFile); | 178 RecoverInputFormat($input, \$headerOriginalFile); |
| 148 # The name for the Excel sheet can't be longer than 31 characters | 182 # The name for the Excel sheet can't be longer than 31 characters |
| 149 sub CheckLengthFilename | 183 sub CheckLengthFilename |
| 150 { | 184 { |
| 151 my ($inputFile) = @_; | 185 my ($inputFile) = @_; |
| 152 | 186 |
| 153 ## Verify the name of file, must be <= 31 chars for the sheet name | 187 my ($filenameInputFile, $directoriesInputFile, $suffixInputFile) = fileparse($inputFile, qr/\.[^.]*/); |
| 154 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); | 188 |
| 155 | 189 if(length($filenameInputFile) > 32) |
| 156 if(length($filename) > 32) { print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; exit; } | 190 { |
| 157 } | 191 print STDERR "Error message:\n"; |
| 158 | 192 print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; |
| 159 # Recover the input format (vcf or txt) and depending on the format convert the input file in a suitable format for Annovar (ex: for MuTect files keep only the confident variants) | 193 } |
| 194 } | |
| 195 | |
| 196 # Recover the input format. If MuTect output consider only "KEEP" variants | |
| 160 sub RecoverInputFormat | 197 sub RecoverInputFormat |
| 161 { | 198 { |
| 162 my ($file, $refS_headerOriginalFile) = @_; | 199 my ($file, $refS_headerOriginalFile) = @_; |
| 163 | 200 |
| 164 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); | 201 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); |
| 319 # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. | 356 # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. |
| 320 sub RecoverColNameAuto | 357 sub RecoverColNameAuto |
| 321 { | 358 { |
| 322 our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_; | 359 our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_; |
| 323 | 360 |
| 324 $header =~ s/[\r\n]+$//; | 361 $header =~ s/[\r\n]+$//; |
| 325 | 362 |
| 326 ## Name of the columns | 363 ## Name of the columns |
| 327 my @mutect = qw(contig position ref_allele alt_allele); | 364 my @mutect = qw(contig position ref_allele alt_allele); |
| 328 my @vcf = qw(CHROM POS REF ALT); | 365 my @vcf = qw(CHROM POS REF ALT); |
| 329 my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic); | 366 my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic); |
| 330 my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele); | 367 my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele); |
| 331 my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2); | 368 my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2); |
| 332 my @ionTorrent = qw(chr Position Ref Alt); | 369 my @ionTorrent = qw(chr Position Ref Alt); |
| 333 my @proton = qw(Chrom Position Ref Variant); | 370 my @proton = qw(Chrom Position Ref Variant); |
| 334 my @varScan2 = qw(Chrom Position Ref VarAllele); | 371 my @varScan2 = qw(Chrom Position Ref VarAllele); |
| 335 my @annovar = qw(Chr Start Ref Obs); | 372 my @varScan2_somatic = qw(chrom position ref var); |
| 336 my @custom = qw(Chromosome Start Wild_Type Mutant); | 373 my @annovar = qw(Chr Start Ref Obs); |
| 337 | 374 my @custom = qw(Chromosome Start Wild_Type Mutant); |
| 338 my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@annovar, \@custom); | 375 |
| 376 my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@varScan2_somatic, \@annovar, \@custom); | |
| 339 my $timer = 0; # For controlling if the names are present on the dictionnary or not | 377 my $timer = 0; # For controlling if the names are present on the dictionnary or not |
| 340 | 378 |
| 341 foreach my $refTab (@allTab) | 379 foreach my $refTab (@allTab) |
| 342 { | 380 { |
| 343 my @tab = @$refTab; | 381 my @tab = @$refTab; |
| 350 else { $timer++; } | 388 else { $timer++; } |
| 351 } | 389 } |
| 352 | 390 |
| 353 if($timer == scalar(@allTab)) | 391 if($timer == scalar(@allTab)) |
| 354 { | 392 { |
| 393 print STDERR "Error message:\n"; | |
| 355 print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n"; | 394 print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n"; |
| 356 print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n"; | 395 print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n"; |
| 357 exit; | 396 exit; |
| 358 } | 397 } |
| 359 | 398 |
| 402 foreach my $file (`ls $folder_temp/*.txt`) | 441 foreach my $file (`ls $folder_temp/*.txt`) |
| 403 { | 442 { |
| 404 chomp($file); | 443 chomp($file); |
| 405 | 444 |
| 406 # For recover the name of the file without extension, the directory where the file is and the extension of the file | 445 # For recover the name of the file without extension, the directory where the file is and the extension of the file |
| 407 my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/); | 446 my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/); |
| 408 my $filenameOK = ""; | 447 my $filenameOK = ""; |
| 409 # For removing the ColumnCorrect for txt files | 448 # For removing the ColumnCorrect for txt files |
| 410 if($filename =~ /(.+)-ColumnCorrect/) | 449 if($filename =~ /(.+)-ColumnCorrect/) |
| 411 { | 450 { |
| 412 if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; } | 451 if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; } |
| 413 else { $filenameOK = $1; } | 452 else { $filenameOK = $1; } |
| 414 } | 453 } |
| 415 else { print STDERR "Case not considered for $filename!!!\n"; exit; } | 454 else { print STDERR "Error message:\n"; print STDERR "Case not considered for $filename!!!\n"; } |
| 416 | 455 |
| 417 | 456 |
| 418 ################################################# | 457 ################################################# |
| 419 ### Cut the files in n part ### | 458 ### Cut the files in n part ### |
| 420 ################################################# | 459 ################################################# |
| 421 # Recover the number of variants in the file for deciding the number of CPU to use | 460 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts |
| 422 my $cpu = 0; | 461 my $cpu = 0; |
| 423 my $nbVariants = `wc -l $file`; | 462 # Keep the original header |
| 424 $nbVariants =~ /(\d+).+/; | 463 my $headerOriginalFile = ""; |
| 425 | 464 # Save the numer of lines |
| 426 if($1-1 <= 5000) { $cpu = 1; } | 465 my $nbLine = 0; |
| 427 elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; } | 466 splitInputFile($file, \$cpu, \$headerOriginalFile, $filenameOK, \$nbLine); |
| 428 elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; } | 467 |
| 429 else { $cpu = $max_cpu; } | |
| 430 | |
| 431 # If the number predefined can't be used on the machine use the maximum number specify by the administrator | |
| 432 if($cpu > $max_cpu) { $cpu = $max_cpu } | |
| 433 | |
| 434 ## Recover the header | |
| 435 open(F1, $file) or die "$!: $file\n"; | |
| 436 my $headerOriginalFile = <F1>; | |
| 437 close F1; | |
| 438 | |
| 439 ## Remove the first line of the file | |
| 440 my $fileNoHeader = "$folder_temp/${filenameOK}-NoHeader"; | |
| 441 `sed 1d $file > $fileNoHeader`; | |
| 442 | |
| 443 if(!-e "$folder_temp/$filenameOK") { mkdir("$folder_temp/$filenameOK") or die "Can't create the directory $folder_temp/$filenameOK\n"; } | |
| 444 my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0 | |
| 445 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameOK/$filenameOK-`; | |
| 446 | |
| 447 if($headerOriginalFile eq "") { print STDERR "No header for the file $file!!!\nPlease check the format of your file\n"; exit; } | |
| 448 my @files = <$folder_temp/$filenameOK/$filenameOK-*>; | |
| 449 | 468 |
| 450 ################################################# | 469 ################################################# |
| 451 ### Annotate the n part ### | 470 ### Annotate the n part ### |
| 452 ################################################# | 471 ################################################# |
| 453 my $pm = Parallel::ForkManager->new($cpu); | 472 annotateFile($cpu, $filenameOK, $headerOriginalFile); |
| 454 | 473 |
| 455 foreach my $tempFile (@files) | 474 |
| 456 { | 475 ################################################# |
| 457 # Forks and returns the pid for the child: | 476 ### Paste the file together ### |
| 458 my $pid = $pm->start and next; | 477 ################################################# |
| 459 | 478 createOutput($filenameOK, $headerOriginalFile, $nbLine); |
| 460 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo | |
| 461 my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\-[^.]*/); | |
| 462 my $outFilenameTemp = $filename.$suffix; | |
| 463 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); | |
| 464 | |
| 465 # Annotate the file with Annovar | |
| 466 my $tempFileName_AVOutput = $filename.$suffix.".".${refGenome}."_multianno.txt"; | |
| 467 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
| 468 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
| 469 | |
| 470 # Check if the annotations worked | |
| 471 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; | |
| 472 while(<F1>) | |
| 473 { | |
| 474 if($_ =~ /ERROR/i) | |
| 475 { | |
| 476 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; | |
| 477 print STDERR $_; | |
| 478 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; | |
| 479 exit; | |
| 480 } | |
| 481 } | |
| 482 close F1; | |
| 483 | |
| 484 # Recover the strand orientation | |
| 485 my $length_AVheader = 0; | |
| 486 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); | |
| 487 | |
| 488 # Recover the sequence context | |
| 489 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameOK/$outFilenameTemp".".".${refGenome}."_multianno.txt"); | |
| 490 | |
| 491 $pm->finish; # Terminates the child process | |
| 492 } | |
| 493 # Wait all the child process | |
| 494 $pm->wait_all_children; | |
| 495 | |
| 496 # Paste the file together | |
| 497 CombinedTempFile("$folder_temp/$filenameOK", "$folderAnnovar/$filenameOK".".".${refGenome}."_multianno.txt"); | |
| 498 } | 479 } |
| 499 } | 480 } |
| 500 # The input file is one file | 481 # The input file is one file |
| 501 else | 482 else |
| 502 { | 483 { |
| 503 my ($filenameO, $directoriesO, $suffixO) = fileparse($input, qr/\.[^.]*/); | |
| 504 | |
| 505 ################################################# | 484 ################################################# |
| 506 ### Cut the files in n part ### | 485 ### Cut the files in n part ### |
| 507 ################################################# | 486 ################################################# |
| 508 # Recover the number of variants in the file for deciding the number of CPU to use | 487 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts |
| 509 my $cpu = 0; | 488 my $cpu = 0; |
| 510 my $nbVariants = `wc -l $folder_temp/$filenameO-ColumnCorrect.txt`; | 489 # Keep the original header |
| 511 $nbVariants =~ /(\d+).+/; | 490 my $headerOriginalFile = ""; |
| 512 | 491 # Save the numer of lines |
| 513 if($1-1 <= 5000) { $cpu = 1; } | 492 my $nbLine = 0; |
| 514 elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; } | 493 splitInputFile("$folder_temp/$filename-ColumnCorrect.txt", \$cpu, \$headerOriginalFile, $filename, \$nbLine); |
| 515 elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; } | 494 |
| 516 else { $cpu = $max_cpu; } | |
| 517 | |
| 518 # If the number predefined can't be used on the machine use the maximum number specify by the administrator | |
| 519 if($cpu > $max_cpu) { $cpu = $max_cpu } | |
| 520 | |
| 521 ## Recover the header | |
| 522 open(F1, "$folder_temp/$filenameO-ColumnCorrect.txt") or die "$!: $folder_temp/$filenameO-ColumnCorrect.txt\n"; | |
| 523 my $headerOriginalFile = <F1>; | |
| 524 close F1; | |
| 525 | |
| 526 ## Remove the first line of the file | |
| 527 my $fileNoHeader = "$folder_temp/$filenameO-NoHeader"; | |
| 528 `sed 1d $folder_temp/$filenameO-ColumnCorrect.txt > $fileNoHeader`; | |
| 529 | |
| 530 if(!-e "$folder_temp/$filenameO") { mkdir("$folder_temp/$filenameO") or die "Can't create the directory $folder_temp/$filenameO\n"; } | |
| 531 my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0 | |
| 532 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameO/$filenameO-`; | |
| 533 | |
| 534 if($headerOriginalFile eq "") { print STDERR "No header for the file $input!!!\nPlease check the format of your file\n"; exit; } | |
| 535 my @files = <$folder_temp/$filenameO/$filenameO-*>; | |
| 536 | 495 |
| 537 ################################################# | 496 ################################################# |
| 538 ### Annotate the n part ### | 497 ### Annotate the n part ### |
| 539 ################################################# | 498 ################################################# |
| 540 my $pm = Parallel::ForkManager->new($cpu); | 499 annotateFile($cpu, $filename, $headerOriginalFile); |
| 541 foreach my $tempFile (@files) | 500 |
| 542 { | 501 |
| 543 # Forks and returns the pid for the child: | 502 ################################################# |
| 544 my $pid = $pm->start and next; | 503 ### Paste the file together ### |
| 545 | 504 ################################################# |
| 546 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo | 505 createOutput($filename, $headerOriginalFile, $nbLine); |
| 547 # For recover the name of the file without extension, the directory were the file is and the extension of the file | |
| 548 my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\.[^.]*/); | |
| 549 my $outFilenameTemp = $filename.$suffix; | |
| 550 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); | |
| 551 | |
| 552 # Annotate the file with Annovar | |
| 553 my $tempFileName_AVOutput = $outFilenameTemp.".".${refGenome}."_multianno.txt"; | |
| 554 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
| 555 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
| 556 | |
| 557 # Check if the annotations worked | |
| 558 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; | |
| 559 while(<F1>) | |
| 560 { | |
| 561 if($_ =~ /ERROR/i) | |
| 562 { | |
| 563 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; | |
| 564 print STDERR $_; | |
| 565 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; | |
| 566 exit; | |
| 567 } | |
| 568 } | |
| 569 close F1; | |
| 570 | |
| 571 # Recover the strand orientation | |
| 572 my $length_AVheader = 0; | |
| 573 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); | |
| 574 | |
| 575 # Recover the sequence context | |
| 576 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameO/$tempFileName_AVOutput"); | |
| 577 | |
| 578 $pm->finish; # Terminates the child process | |
| 579 } | |
| 580 # Wait all the child process | |
| 581 $pm->wait_all_children; | |
| 582 | |
| 583 # Paste the file together | |
| 584 CombinedTempFile("$folder_temp/$filenameO", "$folderAnnovar/$filenameO".".".${refGenome}."_multianno.txt"); | |
| 585 } | 506 } |
| 586 # Remove the temporary directory | 507 # Remove the temporary directory |
| 587 rmtree($folder_temp); | 508 rmtree($folder_temp); |
| 509 } | |
| 510 | |
| 511 | |
| 512 | |
| 513 sub splitInputFile | |
| 514 { | |
| 515 my ($inputFile, $ref_cpu, $ref_header, $filename, $ref_nbLine) = @_; | |
| 516 | |
| 517 my $nbVariants = `wc -l $inputFile`; | |
| 518 $nbVariants =~ /(\d+).+/; | |
| 519 $$ref_nbLine = $1; | |
| 520 | |
| 521 if($$ref_nbLine-1 <= 5000) { $$ref_cpu = 1; } | |
| 522 elsif( ($$ref_nbLine-1 > 5000) && ($$ref_nbLine-1 < 25000) ) { $$ref_cpu = 2; } | |
| 523 elsif( ($$ref_nbLine-1 >= 25000) && ($$ref_nbLine-1 < 100000) ) { $$ref_cpu = 8; } | |
| 524 else { $$ref_cpu = $max_cpu; } | |
| 525 | |
| 526 # If the number predefined can't be used on the machine use the maximum number specify by the administrator | |
| 527 if($$ref_cpu > $max_cpu) { $$ref_cpu = $max_cpu } | |
| 528 | |
| 529 | |
| 530 ## Recover the header | |
| 531 open(F1, $inputFile) or die "$!: $inputFile\n"; | |
| 532 $$ref_header= <F1>; | |
| 533 close F1; | |
| 534 | |
| 535 ## Remove the first line of the file | |
| 536 my $fileNoHeader = "$folder_temp/${filename}-NoHeader"; | |
| 537 `sed 1d $inputFile > $fileNoHeader`; | |
| 538 | |
| 539 if(!-e "$folder_temp/$filename") { mkdir("$folder_temp/$filename") or die "Can't create the directory $folder_temp/$filename\n"; } | |
| 540 my $lines_per_temp = int(1+($1 / $$ref_cpu)); # +1 in case of the div == 0 | |
| 541 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filename/$filename-`; | |
| 542 | |
| 543 if($$ref_header eq "") { print STDERR "Error message:\n"; print STDERR "No header for the file $inputFile!!!\nPlease check the format of your file\n"; } | |
| 544 } | |
| 545 | |
| 546 sub annotateFile | |
| 547 { | |
| 548 my ($cpu, $filename, $headerOriginalFile) = @_; | |
| 549 | |
| 550 my $pm = Parallel::ForkManager->new($cpu); | |
| 551 | |
| 552 foreach my $tempFile (`ls $folder_temp/$filename/$filename-*`) | |
| 553 { | |
| 554 chomp($tempFile); | |
| 555 | |
| 556 # Forks and returns the pid for the child: | |
| 557 my $pid = $pm->start and next; | |
| 558 | |
| 559 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo | |
| 560 my ($filenameTempFile, $directoriesTempFile, $suffixTempFile) = fileparse($tempFile, qr/\-[^.]*/); | |
| 561 my $outFilenameTemp = $filenameTempFile.$suffixTempFile; | |
| 562 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); | |
| 563 | |
| 564 # Annotate the file with Annovar | |
| 565 my $tempFileName_AVOutput = $filename.$suffixTempFile.".".${refGenome}."_multianno.txt"; | |
| 566 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
| 567 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
| 568 | |
| 569 # Check if the annotations worked | |
| 570 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; | |
| 571 while(<F1>) | |
| 572 { | |
| 573 if($_ =~ /ERROR/i) | |
| 574 { | |
| 575 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; | |
| 576 print STDERR $_; | |
| 577 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; | |
| 578 } | |
| 579 } | |
| 580 close F1; | |
| 581 | |
| 582 # Recover the strand orientation | |
| 583 my $length_AVheader = 0; | |
| 584 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); | |
| 585 | |
| 586 # Recover the sequence context | |
| 587 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filename/$outFilenameTemp".".".${refGenome}."_multianno.txt"); | |
| 588 | |
| 589 $pm->finish; # Terminates the child process | |
| 590 } | |
| 591 # Wait all the child process | |
| 592 $pm->wait_all_children; | |
| 593 } | |
| 594 | |
| 595 sub createOutput | |
| 596 { | |
| 597 my ($filename, $headerOriginalFile, $nbLine) = @_; | |
| 598 | |
| 599 ## For MuTect and MuTect2 calling only variants passing MuTect filters are kept and sometines there is no variant passing these filters making error in Galaxy when using "collection". | |
| 600 if($nbLine == 1) | |
| 601 { | |
| 602 print STDOUT "\nThe sample $filename didn't pass MuTect filters\n"; | |
| 603 | |
| 604 ### Print Annovar minimal header + the original header of the input file | |
| 605 my $outputFile = "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt"; | |
| 606 open(OUT, ">", $outputFile) or die "$!: $outputFile\n"; | |
| 607 | |
| 608 if($fullAVDB eq "no") | |
| 609 { | |
| 610 print OUT "Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tStrand\tcontext"; | |
| 611 print OUT "\t".$headerOriginalFile; | |
| 612 } | |
| 613 ### Print complete Annovar header (using the database name present in the file listAVDB) + the original header of the input file | |
| 614 else | |
| 615 { | |
| 616 print OUT "Chr\tStart\tEnd\tRef\tAlt"; | |
| 617 open(F1, $listAVDB) or die "$!: $listAVDB\n"; | |
| 618 while(<F1>) | |
| 619 { | |
| 620 if($_ =~ /^#/) { next; } | |
| 621 | |
| 622 my @tab = split("\t", $_); | |
| 623 $tab[0] =~ /$refGenome\_(.+)\.txt/; | |
| 624 my $dbName = $1; | |
| 625 | |
| 626 if($dbName =~ /refGene|knownGene|ensGene/) | |
| 627 { | |
| 628 print OUT "\t"."Func.$dbName\tGene.$dbName\tGeneDetail.$dbName\tExonicFunc.$dbName\tAAChange.$dbName"; | |
| 629 } | |
| 630 else | |
| 631 { | |
| 632 print OUT "\t".$dbName; | |
| 633 } | |
| 634 } | |
| 635 print OUT "\tStrand\tcontext\t".$headerOriginalFile; | |
| 636 | |
| 637 close F1; | |
| 638 } | |
| 639 close OUT; | |
| 640 } | |
| 641 else | |
| 642 { | |
| 643 CombinedTempFile("$folder_temp/$filename", "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt"); | |
| 644 } | |
| 588 } | 645 } |
| 589 | 646 |
| 590 sub Convert2AV | 647 sub Convert2AV |
| 591 { | 648 { |
| 592 my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_; | 649 my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_; |
| 599 while(<F1>) | 656 while(<F1>) |
| 600 { | 657 { |
| 601 $_ =~ s/[\r\n]+$//; | 658 $_ =~ s/[\r\n]+$//; |
| 602 my @tab = split("\t", $_); | 659 my @tab = split("\t", $_); |
| 603 my $chr = ""; | 660 my $chr = ""; |
| 604 | |
| 605 # Don't consider chrM and GL | |
| 606 if($tab[$chr_value] =~ /M|GL/i) { next; } | |
| 607 | 661 |
| 608 # Replace chr23 or chr24 by X or Y | 662 # Replace chr23 or chr24 by X or Y |
| 609 if($tab[$chr_value] =~ /23/) { $chr = "chrX"; } | 663 if($tab[$chr_value] =~ /23/) { $chr = "chrX"; } |
| 610 elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; } | 664 elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; } |
| 611 elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; } | 665 elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; } |
| 612 else { $chr = "chr".$tab[$chr_value]; } | 666 else { $chr = "chr".$tab[$chr_value]; } |
| 613 | 667 |
| 668 | |
| 669 ### Consider only "normal" chromosomes for the annotation | |
| 670 if( ($chr !~ /chr\d{1,2}$|chrX|chrY/) ) { next; } | |
| 671 | |
| 672 | |
| 673 ### Don't consider variants with two or more alt bases | |
| 674 if($tab[$alt_value] =~ /\,/) { next; } | |
| 675 | |
| 614 ### Reformat the Indels for Annovar | 676 ### Reformat the Indels for Annovar |
| 615 # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10) | 677 # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10) |
| 616 # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10) | 678 # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10) |
| 617 if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) ) | 679 if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) ) |
| 618 { | 680 { |
| 619 ### First check if the indels in the file are not already correctly formated | 681 ### First check if the indels in the file are not already correctly formated |
| 620 if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) | 682 if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) |
| 621 { | 683 { |
| 622 # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels) | 684 # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels) |
| 623 # Insertion: start = start & end = start | 685 # Insertion: start = start & end = start |
| 624 if($tab[$ref_value] =~ /\-/) | 686 if($tab[$ref_value] =~ /\-/) |
| 625 { | 687 { |
| 626 print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]"; | 688 print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]"; |
| 627 } | 689 } |
| 628 ## Deletion: start = start & end = start + length(del) -1 | 690 ## Deletion: start = start & end = start + length(del) -1 |
| 629 else | |
| 630 { | |
| 631 my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1); | |
| 632 print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]"; | |
| 633 } | |
| 634 } | |
| 635 ### Indels not correctly formated for Annovar | |
| 636 else | 691 else |
| 637 { | 692 { |
| 638 my @tabRef = split("", $tab[$ref_value]); | 693 my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1); |
| 639 my @tabAlt = split("", $tab[$alt_value]); | 694 print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]"; |
| 640 | 695 } |
| 641 # Remove the first base | 696 } |
| 642 my $ref2 = join("", @tabRef[1 .. $#tabRef]); | 697 ### Indels not correctly formated for Annovar |
| 643 my $alt2 = join("", @tabAlt[1 .. $#tabAlt]); | |
| 644 | |
| 645 if(length($alt2) == 0) | |
| 646 { | |
| 647 my $altOK = "-"; | |
| 648 my $startOK = $tab[$start_value] + 1; | |
| 649 my $stopOK = $startOK + length($ref2) - 1; | |
| 650 print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK; | |
| 651 } | |
| 652 | |
| 653 if(length($ref2) == 0) | |
| 654 { | |
| 655 my $refOK = "-"; | |
| 656 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2; | |
| 657 } | |
| 658 } | |
| 659 } | |
| 660 ### SBS | |
| 661 else | 698 else |
| 662 { | 699 { |
| 663 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value]; | 700 my @tabRef = split("", $tab[$ref_value]); |
| 664 } | 701 my @tabAlt = split("", $tab[$alt_value]); |
| 665 | 702 |
| 666 ## Print the original file at the end | 703 # Remove the first base |
| 667 foreach (@tab) { print OUT "\t$_"; } | 704 my $ref2 = join("", @tabRef[1 .. $#tabRef]); |
| 668 print OUT "\n"; | 705 my $alt2 = join("", @tabAlt[1 .. $#tabAlt]); |
| 706 | |
| 707 if(length($alt2) == 0) | |
| 708 { | |
| 709 my $altOK = "-"; | |
| 710 my $startOK = $tab[$start_value] + 1; | |
| 711 my $stopOK = $startOK + length($ref2) - 1; | |
| 712 print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK; | |
| 713 } | |
| 714 | |
| 715 if(length($ref2) == 0) | |
| 716 { | |
| 717 my $refOK = "-"; | |
| 718 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2; | |
| 719 } | |
| 720 } | |
| 721 } | |
| 722 ### SBS | |
| 723 else | |
| 724 { | |
| 725 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value]; | |
| 726 } | |
| 727 | |
| 728 ## Print the original file at the end | |
| 729 foreach (@tab) { print OUT "\t$_"; } | |
| 730 print OUT "\n"; | |
| 669 } | 731 } |
| 670 close F1; close OUT; | 732 close F1; close OUT; |
| 671 } | 733 } |
| 672 | 734 |
| 673 sub AnnotateAV | 735 sub AnnotateAV |
| 674 { | 736 { |
| 675 my ($inputFile, $output) = @_; | 737 my ($inputFile, $output) = @_; |
| 676 | 738 |
| 677 if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; } | 739 if(!-e $path_AVDB) |
| 740 { | |
| 741 print STDERR "Error message:\n"; | |
| 742 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; | |
| 743 print STDERR "Please install the database for this genome before running Annovar\n"; | |
| 744 } | |
| 678 | 745 |
| 679 # Extract the name of the databases | 746 # Extract the name of the databases |
| 680 my $protocol = ""; my $operation = ""; | 747 my $protocol = ""; my $operation = ""; |
| 681 ExtractAVDBName($listAVDB, \$protocol, \$operation); | 748 ExtractAVDBName($listAVDB, \$protocol, \$operation); |
| 682 | 749 |
| 736 elsif($$refS_month == 8) { $$refS_month = "aug"; } | 803 elsif($$refS_month == 8) { $$refS_month = "aug"; } |
| 737 elsif($$refS_month == 9) { $$refS_month = "sept"; } | 804 elsif($$refS_month == 9) { $$refS_month = "sept"; } |
| 738 elsif($$refS_month == 10) { $$refS_month = "oct"; } | 805 elsif($$refS_month == 10) { $$refS_month = "oct"; } |
| 739 elsif($$refS_month == 11) { $$refS_month = "nov"; } | 806 elsif($$refS_month == 11) { $$refS_month = "nov"; } |
| 740 elsif($$refS_month == 12) { $$refS_month = "dec"; } | 807 elsif($$refS_month == 12) { $$refS_month = "dec"; } |
| 741 else { print STDERR "Month number don't considered\n"; exit; } | |
| 742 } | 808 } |
| 743 } | 809 } |
| 744 } | 810 } |
| 745 | 811 |
| 746 ### Add the minimum of annotations (refGene + strand + context) | 812 ### Add the minimum of annotations (refGene + strand + context) |
| 747 sub annotateAV_min | 813 sub annotateAV_min |
| 748 { | 814 { |
| 749 my ($inputFile, $output) = @_; | 815 my ($inputFile, $output) = @_; |
| 750 | 816 |
| 751 if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; } | 817 if(!-e $path_AVDB) |
| 818 { | |
| 819 print STDERR "Error message:\n"; | |
| 820 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; | |
| 821 print STDERR "Please install the database for this genome before running Annovar\n"; | |
| 822 } | |
| 752 | 823 |
| 753 # Extract the name of the databases | 824 # Extract the name of the databases |
| 754 my ($protocol, $operation) = ("refGene", "g"); | 825 my ($protocol, $operation) = ("refGene", "g"); |
| 755 | 826 |
| 756 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; | 827 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; |
| 785 elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; } | 856 elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; } |
| 786 elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; } | 857 elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; } |
| 787 else { $chr = $tab[$chr_value]; } | 858 else { $chr = $tab[$chr_value]; } |
| 788 | 859 |
| 789 # Verify if the element exists | 860 # Verify if the element exists |
| 790 if($chr eq "") { print "Error RecoverStrand: The chromosome value is nor defined for $_\n"; exit; } | 861 if($chr eq "") { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The chromosome value is nor defined for $_\n"; } |
| 791 if(! exists $tab[$start_value]) { print "Error RecoverStrand: The start value is nor defined for $_\n"; exit; } | 862 if(! exists $tab[$start_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The start value is nor defined for $_\n"; } |
| 792 if(! exists $tab[$ref_value]) { print "Error RecoverStrand: The reference value is nor defined for $_\n"; exit; } | 863 if(! exists $tab[$ref_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The reference value is nor defined for $_\n"; } |
| 793 if(! exists $tab[$alt_value]) { print "Error RecoverStrand: The alternate value is nor defined for $_\n"; exit; } | 864 if(! exists $tab[$alt_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The alternate value is nor defined for $_\n"; } |
| 794 if(! exists $tab[$func_value]) { print "Error RecoverStrand: The functional value is nor defined for $_\n"; exit; } | 865 if(! exists $tab[$func_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The functional value is nor defined for $_\n"; } |
| 795 if(! exists $tab[$geneSymbol_value]) { print "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; exit; } | 866 if(! exists $tab[$geneSymbol_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; } |
| 796 | 867 |
| 797 my $geneSymbol = ""; | 868 my $geneSymbol = ""; |
| 798 ######## For the splicing annotation we separate the gene symbol from the aa change | 869 ######## For the splicing annotation we separate the gene symbol from the aa change |
| 799 if($tab[$func_value] eq "splicing") | 870 if($tab[$func_value] eq "splicing") |
| 800 { | 871 { |
| 821 { | 892 { |
| 822 $_ =~ s/[\r\n]+$//; | 893 $_ =~ s/[\r\n]+$//; |
| 823 my @tab = split("\t", $_); | 894 my @tab = split("\t", $_); |
| 824 my $strand = ""; | 895 my $strand = ""; |
| 825 $strand = $tab[$db_strandInfo_value]; | 896 $strand = $tab[$db_strandInfo_value]; |
| 826 if($strand eq "") { print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; exit; } | 897 if($strand eq "") { print STDERR "Error message:\n"; print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; } |
| 827 else | 898 else |
| 828 { | 899 { |
| 829 # Some genes have several strand orientation, keep the first in the database | 900 # Some genes have several strand orientation, keep the first in the database |
| 830 if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; } | 901 if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; } |
| 831 } | 902 } |
| 872 | 943 |
| 873 foreach my $kDB (sort keys %h_database) | 944 foreach my $kDB (sort keys %h_database) |
| 874 { | 945 { |
| 875 if("$tab[5]:$tab[0]" eq $kDB) | 946 if("$tab[5]:$tab[0]" eq $kDB) |
| 876 { | 947 { |
| 877 if($lengthHeader != $lengthLine) { print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; exit; } | 948 if($lengthHeader != $lengthLine) |
| 949 { | |
| 950 print STDERR "Error message:\n"; | |
| 951 print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; | |
| 952 } | |
| 878 | 953 |
| 879 foreach my $line (@{$h_inputFile{$kFile}}) | 954 foreach my $line (@{$h_inputFile{$kFile}}) |
| 880 { | 955 { |
| 881 my @tab = split("\t", $line); | 956 my @tab = split("\t", $line); |
| 882 my $j = 0; | 957 my $j = 0; |
| 969 my %seqhash = (); #database sequence for each chromosome | 1044 my %seqhash = (); #database sequence for each chromosome |
| 970 my %name_seq = (); #sequence for each region | 1045 my %name_seq = (); #sequence for each region |
| 971 my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon | 1046 my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon |
| 972 my ($count_success, @failure) = (0); | 1047 my ($count_success, @failure) = (0); |
| 973 | 1048 |
| 974 for my $curchr (sort keys $refH_allRegion) | 1049 for my $curchr (sort keys %{$refH_allRegion}) |
| 975 { | 1050 { |
| 976 my ($seqid, $curseq) = ('', ''); | 1051 my ($seqid, $curseq) = ('', ''); |
| 977 my $fastafile = ""; | 1052 my $fastafile = ""; |
| 978 if ($curchr =~ m/^chr/) | 1053 if ($curchr =~ m/^chr/) |
| 979 { | 1054 { |
| 1078 $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); | 1153 $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); |
| 1079 # Print the Annovar header until the column before OtherInfo | 1154 # Print the Annovar header until the column before OtherInfo |
| 1080 print OUT "$tabHeaderInput[0]"; | 1155 print OUT "$tabHeaderInput[0]"; |
| 1081 my $j = 0; | 1156 my $j = 0; |
| 1082 for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } | 1157 for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } |
| 1083 print OUT "\tcontext"; | 1158 print OUT "\tcontext\ttrinucleotide_context"; |
| 1084 for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } | 1159 for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } |
| 1085 print OUT "\n"; | 1160 print OUT "\n"; |
| 1086 } | 1161 } |
| 1087 | 1162 |
| 1088 foreach my $k_hFile (sort keys $refH_InputFile) | 1163 foreach my $k_hFile (sort keys %{$refH_InputFile}) |
| 1089 { | 1164 { |
| 1090 foreach my $k_allRegonSeqContext (sort keys $refH_allRegionSeqContext) | 1165 foreach my $k_allRegonSeqContext (sort keys %{$refH_allRegionSeqContext}) |
| 1091 { | 1166 { |
| 1092 if($k_hFile eq $k_allRegonSeqContext) | 1167 if($k_hFile eq $k_allRegonSeqContext) |
| 1093 { | 1168 { |
| 1094 my $j=0; | 1169 my $j=0; |
| 1095 | 1170 |
| 1096 for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) | 1171 for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) |
| 1097 { | 1172 { |
| 1098 my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); | 1173 my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); |
| 1099 | 1174 |
| 1175 # Write Annovar annotation + strand orientation | |
| 1100 for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } | 1176 for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } |
| 1177 # Write the sequence context with the length defined by the user (default is 10) | |
| 1101 print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; | 1178 print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; |
| 1179 # Write the trinucleotide context | |
| 1180 my $contextSequence = $refH_allRegionSeqContext->{$k_allRegonSeqContext}; $contextSequence =~ tr/a-z/A-Z/; | |
| 1181 my @tempContextSequence = split("", $contextSequence); | |
| 1182 my $midlle_totalNbBaseContext = (scalar(@tempContextSequence)-1)/2; # For having the middle of the sequence | |
| 1183 print OUT "\t".$tempContextSequence[$midlle_totalNbBaseContext-1]."x".$tempContextSequence[$midlle_totalNbBaseContext+1]; | |
| 1184 # Write the original columns | |
| 1102 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } | 1185 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } |
| 1103 print OUT "\n"; | 1186 print OUT "\n"; |
| 1104 } | 1187 } |
| 1105 last; | 1188 last; |
| 1106 } | 1189 } |
| 1138 my $name_of_column_NB = "toto"; | 1221 my $name_of_column_NB = "toto"; |
| 1139 for(my $i=0; $i<=$#tab_search_header; $i++) | 1222 for(my $i=0; $i<=$#tab_search_header; $i++) |
| 1140 { | 1223 { |
| 1141 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } | 1224 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } |
| 1142 } | 1225 } |
| 1143 if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } | 1226 if($name_of_column_NB eq "toto") |
| 1144 else { return $name_of_column_NB; } | 1227 { |
| 1228 print STDERR "Error message:\n"; | |
| 1229 print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; | |
| 1230 } | |
| 1231 else { return $name_of_column_NB; } | |
| 1145 } | 1232 } |
| 1146 | 1233 |
| 1147 | 1234 |
| 1148 | 1235 |
| 1149 | 1236 |
| 1163 -v, --verbose use verbose output | 1250 -v, --verbose use verbose output |
| 1164 --refGenome the reference genome to use | 1251 --refGenome the reference genome to use |
| 1165 --interval <interger> the number of bases for the sequence context | 1252 --interval <interger> the number of bases for the sequence context |
| 1166 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file | 1253 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file |
| 1167 -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size | 1254 -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size |
| 1168 --pathAVDBList the path to the list of AV databases installed | 1255 --pathAVDBList the path to a text file containing the list of Annovar databases installed |
| 1169 -temp --pathTemporary <string> the path for saving the temporary files | 1256 -temp --pathTemporary <string> the path for saving the temporary files |
| 1170 --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) | 1257 --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) |
| 1258 --max_cpu <integer> number of CPUs to be used for the annotation | |
| 1171 | 1259 |
| 1172 | 1260 |
| 1173 Function: automatically run a pipeline on a list of variants and annote them using Annovar | 1261 Function: automatically run a pipeline on a list of variants and annote them using Annovar |
| 1174 | 1262 |
| 1175 Example: # Annotation only | 1263 Example: # Annotation only |
| 1176 mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input | 1264 mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input |
| 1177 | 1265 |
| 1178 | 1266 |
| 1179 Version: 06-2016 (June 2016) | 1267 Version: 03-2017 (March 2017) |
| 1180 | 1268 |
| 1181 | 1269 |
| 1182 =head1 OPTIONS | 1270 =head1 OPTIONS |
| 1183 | 1271 |
| 1184 =over 8 | 1272 =over 8 |
| 1193 | 1281 |
| 1194 =item B<--verbose> | 1282 =item B<--verbose> |
| 1195 | 1283 |
| 1196 use verbose output. | 1284 use verbose output. |
| 1197 | 1285 |
| 1198 | |
| 1199 =item B<--refGenome> | 1286 =item B<--refGenome> |
| 1200 | 1287 |
| 1201 the reference genome to use, could be hg19 or mm9. | 1288 the reference genome to use, could be hg19 or mm9. |
| 1202 | 1289 |
| 1203 =item B<--interval> | 1290 =item B<--interval> |
| 1212 | 1299 |
| 1213 the path to the directory containing the Annovar databases and the files with the chromosome size. | 1300 the path to the directory containing the Annovar databases and the files with the chromosome size. |
| 1214 | 1301 |
| 1215 =item B<--pathAVDBList> | 1302 =item B<--pathAVDBList> |
| 1216 | 1303 |
| 1217 the path to a texte file containing the list of the Annovar databases installed. | 1304 the path to a text file containing the list of Annovar databases installed. |
| 1218 | 1305 |
| 1219 =item B<--pathTemporary> | 1306 =item B<--pathTemporary> |
| 1220 | 1307 |
| 1221 the path for saving temporary files generated by the script. | 1308 the path for saving temporary files generated by the script. |
| 1222 If any is specify a temporary folder is created in the same directory where the script is running. | 1309 If any is specify a temporary folder is created in the same directory where the script is running. |
| 1224 | 1311 |
| 1225 =item B<--fullAnnotation> | 1312 =item B<--fullAnnotation> |
| 1226 | 1313 |
| 1227 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines) | 1314 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines) |
| 1228 | 1315 |
| 1316 =item B<--max_cpu> | |
| 1317 | |
| 1318 Specify the number of CPUs to be used. This number is used for spliting the file in n part and running the annotations in each part in parallel. | |
| 1319 | |
| 1320 | |
| 1229 =head1 DESCRIPTION | 1321 =head1 DESCRIPTION |
| 1230 | 1322 |
| 1231 MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS. | 1323 MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS. |
| 1232 Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. | 1324 Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. |
| 1233 A text tab delimited file is produced. | 1325 A text tab delimited file is produced. |
