comparison mutspecAnnot.pl @ 7:eda59b985b1c draft default tip

Uploaded
author iarc
date Mon, 13 Mar 2017 08:21:19 -0400
parents 46a10309dfe2
children
comparison
equal deleted inserted replaced
6:46a10309dfe2 7:eda59b985b1c
1 #!/usr/bin/env perl 1 #!/usr/bin/env perl
2 2
3 #-----------------------------------# 3 #-----------------------------------#
4 # Author: Maude # 4 # Author: Maude #
5 # Script: mutspecAnnot.pl # 5 # Script: mutspecAnnot.pl #
6 # Last update: 21/06/16 # 6 # Last update: 02/03/17 #
7 #-----------------------------------# 7 #-----------------------------------#
8 8
9 use strict; 9 use strict;
10 use warnings; 10 use warnings;
11 use Getopt::Long; 11 use Getopt::Long;
18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. 18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
19 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files 19 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files
20 our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context. 20 our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context.
21 our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines) 21 our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines)
22 22
23 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp) or pod2usage(2); 23
24 #########################################
25 ### SPECIFY THE NUMBER OF CPU ###
26 #########################################
27 our ($max_cpu) = 8; # Max number of CPU to use for the annotation
28
29
30
31
32 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp, 'max_cpu=i' => \$max_cpu) or pod2usage(2);
24 33
25 our ($input) = @ARGV; 34 our ($input) = @ARGV;
26 35
27 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); 36 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
28 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); 37 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
33 42
34 ###################################################################################################################################################### 43 ######################################################################################################################################################
35 # GLOBAL VARIABLES # 44 # GLOBAL VARIABLES #
36 ###################################################################################################################################################### 45 ######################################################################################################################################################
37 46
38 #########################################
39 ### SPECIFY THE NUMBER OF CPU ###
40 #########################################
41 our $max_cpu = 1; # Max number of CPU to use for the annotation
42
43
44 # Recover the current path 47 # Recover the current path
45 our $pwd = `pwd`; 48 our $pwd = `pwd`;
46 chomp($pwd); 49 chomp($pwd);
47 50 # Recover the filename and the input directory
48 # Input file path 51 our ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/);
49 our @pathInput = split("/", $input);
50 # Output directories 52 # Output directories
51 our ($folderMutAnalysis, $folderAnnovar) = ("", ""); 53 our ($folderMutAnalysis, $folderAnnovar) = ("", "");
52 # File with the list of Annovar databases to use 54 # File with the list of Annovar databases to use
53 our $listAVDB = ""; 55 our $listAVDB = "";
54 # Initialisation of chromosome, position, ref and alt values 56 # Initialisation of chromosome, position, ref and alt values
55 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a"); 57 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a");
56 58
57 59
60
58 ###################################################################################################################################################### 61 ######################################################################################################################################################
59 # MAIN # 62 # MAIN #
60 ###################################################################################################################################################### 63 ######################################################################################################################################################
61 ## Check the presence of the flags and create the output and temp directories 64 ## Check the presence of the flags and create the output and temp directories
62 CheckFlags(); 65 CheckFlags();
63 66
64 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions 67 ## Format the file correctly:
68 ## 1) Check the length of the filename (must be <= 31 characters)
69 ## 2) Recover the input format. If MuTect output consider only "KEEP" variants
70 ## 3) Recover the column number for chr, start, ref and alt
65 FormatingInputFile(); 71 FormatingInputFile();
66 72
67 # Annotate the file with Annovar, add the strand orientation and the sequence context 73 # Annotate the file with Annovar, add the strand orientation and the sequence context
68 FullAnnotation(); 74 FullAnnotation();
69 75
73 79
74 ## Check the presence of the flags and create the output and temp directories 80 ## Check the presence of the flags and create the output and temp directories
75 sub CheckFlags 81 sub CheckFlags
76 { 82 {
77 # Check the reference genome 83 # Check the reference genome
78 if($refGenome eq "empty") { print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; } 84 if($refGenome eq "empty")
79 if($intervalEnd eq "empty") { print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; exit; } 85 {
86 print STDERR "Missing flag !\n";
87 print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n";
88 exit;
89 }
90 if($intervalEnd eq "empty")
91 {
92 print STDERR "Missing flag !\n";
93 print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n";
94 exit;
95 }
80 # If no output is specified write the result as the same place as the input file 96 # If no output is specified write the result as the same place as the input file
81 if($output eq "empty") 97 if($output eq "empty")
82 { 98 {
83 my $folderRes = ""; 99 my $directory = dirname( $input );
84 for(my $i=0; $i<$#pathInput; $i++) { $folderRes .= "$pathInput[$i]/"; } 100
85 101 $folderMutAnalysis = "$directory/Mutational_Analysis";
86 $folderMutAnalysis = "$folderRes/Mutational_Analysis";
87 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } 102 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
88 } 103 }
89 else 104 else
90 { 105 {
91 if(!-e $output) { mkdir($output) or die "$!: $output\n"; } 106 if(!-e $output) { mkdir($output) or die "$!: $output\n"; }
96 # Create the output folder for Annovar 111 # Create the output folder for Annovar
97 $folderAnnovar = "$folderMutAnalysis/Annovar"; 112 $folderAnnovar = "$folderMutAnalysis/Annovar";
98 if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; } 113 if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; }
99 114
100 # Verify the access to Annovar databases 115 # Verify the access to Annovar databases
101 if($path_AVDB eq "empty") { print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; exit; } 116 if($path_AVDB eq "empty")
102 elsif(!-e $path_AVDB) { print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; exit; } 117 {
118 print STDERR "Missing flag !\n";
119 print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n";
120 exit;
121 }
122 elsif(!-e $path_AVDB)
123 {
124 print STDERR "Error message:\n";
125 print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n";
126 }
103 127
104 # Check the file list AV DB 128 # Check the file list AV DB
105 if($pathAVDBList eq "empty") { print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; exit; } 129 if($pathAVDBList eq "empty")
130 {
131 print STDERR "Missing flag !\n";
132 print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n";
133 exit;
134 }
106 else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" } 135 else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" }
107 136
108 # If no temp folder is specified write the result in the current path 137 # If no temp folder is specified write the result in the current path
109 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$pathInput[$#pathInput]"; } 138 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; }
110 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } 139 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; }
140
141 # Verify listAVDB is not empty
142 if($listAVDB eq "")
143 {
144 print STDERR "Path to the text file containing the list of Annovar databases installed is not specified !!!\n";
145 exit;
146 }
111 } 147 }
112 148
113 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions 149 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions
114 sub FormatingInputFile 150 sub FormatingInputFile
115 { 151 {
118 { 154 {
119 foreach my $file (`ls $input`) 155 foreach my $file (`ls $input`)
120 { 156 {
121 my $headerOriginalFile = ""; 157 my $headerOriginalFile = "";
122 chomp($file); 158 chomp($file);
123 my ($filename, $directories, $suffix) = fileparse("$input/$file", qr/\.[^.]*/);
124 159
125 CheckLengthFilename("$input/$file"); 160 CheckLengthFilename("$input/$file");
126 161
127 ################################################# 162 #################################################
128 ### Recover the input format ### 163 ### Recover the input format ###
134 else 169 else
135 { 170 {
136 my $headerOriginalFile = ""; 171 my $headerOriginalFile = "";
137 172
138 CheckLengthFilename($input); 173 CheckLengthFilename($input);
139 my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/);
140 174
141 ################################################# 175 #################################################
142 ### Recover the input format ### 176 ### Recover the input format ###
143 ################################################# 177 #################################################
144 RecoverInputFormat($input, \$headerOriginalFile); 178 RecoverInputFormat($input, \$headerOriginalFile);
148 # The name for the Excel sheet can't be longer than 31 characters 182 # The name for the Excel sheet can't be longer than 31 characters
149 sub CheckLengthFilename 183 sub CheckLengthFilename
150 { 184 {
151 my ($inputFile) = @_; 185 my ($inputFile) = @_;
152 186
153 ## Verify the name of file, must be <= 31 chars for the sheet name 187 my ($filenameInputFile, $directoriesInputFile, $suffixInputFile) = fileparse($inputFile, qr/\.[^.]*/);
154 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); 188
155 189 if(length($filenameInputFile) > 32)
156 if(length($filename) > 32) { print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; exit; } 190 {
157 } 191 print STDERR "Error message:\n";
158 192 print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n";
159 # Recover the input format (vcf or txt) and depending on the format convert the input file in a suitable format for Annovar (ex: for MuTect files keep only the confident variants) 193 }
194 }
195
196 # Recover the input format. If MuTect output consider only "KEEP" variants
160 sub RecoverInputFormat 197 sub RecoverInputFormat
161 { 198 {
162 my ($file, $refS_headerOriginalFile) = @_; 199 my ($file, $refS_headerOriginalFile) = @_;
163 200
164 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); 201 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
319 # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. 356 # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles.
320 sub RecoverColNameAuto 357 sub RecoverColNameAuto
321 { 358 {
322 our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_; 359 our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_;
323 360
324 $header =~ s/[\r\n]+$//; 361 $header =~ s/[\r\n]+$//;
325 362
326 ## Name of the columns 363 ## Name of the columns
327 my @mutect = qw(contig position ref_allele alt_allele); 364 my @mutect = qw(contig position ref_allele alt_allele);
328 my @vcf = qw(CHROM POS REF ALT); 365 my @vcf = qw(CHROM POS REF ALT);
329 my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic); 366 my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic);
330 my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele); 367 my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele);
331 my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2); 368 my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2);
332 my @ionTorrent = qw(chr Position Ref Alt); 369 my @ionTorrent = qw(chr Position Ref Alt);
333 my @proton = qw(Chrom Position Ref Variant); 370 my @proton = qw(Chrom Position Ref Variant);
334 my @varScan2 = qw(Chrom Position Ref VarAllele); 371 my @varScan2 = qw(Chrom Position Ref VarAllele);
335 my @annovar = qw(Chr Start Ref Obs); 372 my @varScan2_somatic = qw(chrom position ref var);
336 my @custom = qw(Chromosome Start Wild_Type Mutant); 373 my @annovar = qw(Chr Start Ref Obs);
337 374 my @custom = qw(Chromosome Start Wild_Type Mutant);
338 my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@annovar, \@custom); 375
376 my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@varScan2_somatic, \@annovar, \@custom);
339 my $timer = 0; # For controlling if the names are present on the dictionnary or not 377 my $timer = 0; # For controlling if the names are present on the dictionnary or not
340 378
341 foreach my $refTab (@allTab) 379 foreach my $refTab (@allTab)
342 { 380 {
343 my @tab = @$refTab; 381 my @tab = @$refTab;
350 else { $timer++; } 388 else { $timer++; }
351 } 389 }
352 390
353 if($timer == scalar(@allTab)) 391 if($timer == scalar(@allTab))
354 { 392 {
393 print STDERR "Error message:\n";
355 print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n"; 394 print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n";
356 print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n"; 395 print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n";
357 exit; 396 exit;
358 } 397 }
359 398
402 foreach my $file (`ls $folder_temp/*.txt`) 441 foreach my $file (`ls $folder_temp/*.txt`)
403 { 442 {
404 chomp($file); 443 chomp($file);
405 444
406 # For recover the name of the file without extension, the directory where the file is and the extension of the file 445 # For recover the name of the file without extension, the directory where the file is and the extension of the file
407 my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/); 446 my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/);
408 my $filenameOK = ""; 447 my $filenameOK = "";
409 # For removing the ColumnCorrect for txt files 448 # For removing the ColumnCorrect for txt files
410 if($filename =~ /(.+)-ColumnCorrect/) 449 if($filename =~ /(.+)-ColumnCorrect/)
411 { 450 {
412 if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; } 451 if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; }
413 else { $filenameOK = $1; } 452 else { $filenameOK = $1; }
414 } 453 }
415 else { print STDERR "Case not considered for $filename!!!\n"; exit; } 454 else { print STDERR "Error message:\n"; print STDERR "Case not considered for $filename!!!\n"; }
416 455
417 456
418 ################################################# 457 #################################################
419 ### Cut the files in n part ### 458 ### Cut the files in n part ###
420 ################################################# 459 #################################################
421 # Recover the number of variants in the file for deciding the number of CPU to use 460 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts
422 my $cpu = 0; 461 my $cpu = 0;
423 my $nbVariants = `wc -l $file`; 462 # Keep the original header
424 $nbVariants =~ /(\d+).+/; 463 my $headerOriginalFile = "";
425 464 # Save the numer of lines
426 if($1-1 <= 5000) { $cpu = 1; } 465 my $nbLine = 0;
427 elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; } 466 splitInputFile($file, \$cpu, \$headerOriginalFile, $filenameOK, \$nbLine);
428 elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; } 467
429 else { $cpu = $max_cpu; }
430
431 # If the number predefined can't be used on the machine use the maximum number specify by the administrator
432 if($cpu > $max_cpu) { $cpu = $max_cpu }
433
434 ## Recover the header
435 open(F1, $file) or die "$!: $file\n";
436 my $headerOriginalFile = <F1>;
437 close F1;
438
439 ## Remove the first line of the file
440 my $fileNoHeader = "$folder_temp/${filenameOK}-NoHeader";
441 `sed 1d $file > $fileNoHeader`;
442
443 if(!-e "$folder_temp/$filenameOK") { mkdir("$folder_temp/$filenameOK") or die "Can't create the directory $folder_temp/$filenameOK\n"; }
444 my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0
445 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameOK/$filenameOK-`;
446
447 if($headerOriginalFile eq "") { print STDERR "No header for the file $file!!!\nPlease check the format of your file\n"; exit; }
448 my @files = <$folder_temp/$filenameOK/$filenameOK-*>;
449 468
450 ################################################# 469 #################################################
451 ### Annotate the n part ### 470 ### Annotate the n part ###
452 ################################################# 471 #################################################
453 my $pm = Parallel::ForkManager->new($cpu); 472 annotateFile($cpu, $filenameOK, $headerOriginalFile);
454 473
455 foreach my $tempFile (@files) 474
456 { 475 #################################################
457 # Forks and returns the pid for the child: 476 ### Paste the file together ###
458 my $pid = $pm->start and next; 477 #################################################
459 478 createOutput($filenameOK, $headerOriginalFile, $nbLine);
460 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo
461 my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\-[^.]*/);
462 my $outFilenameTemp = $filename.$suffix;
463 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");
464
465 # Annotate the file with Annovar
466 my $tempFileName_AVOutput = $filename.$suffix.".".${refGenome}."_multianno.txt";
467 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
468 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
469
470 # Check if the annotations worked
471 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
472 while(<F1>)
473 {
474 if($_ =~ /ERROR/i)
475 {
476 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
477 print STDERR $_;
478 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
479 exit;
480 }
481 }
482 close F1;
483
484 # Recover the strand orientation
485 my $length_AVheader = 0;
486 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);
487
488 # Recover the sequence context
489 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameOK/$outFilenameTemp".".".${refGenome}."_multianno.txt");
490
491 $pm->finish; # Terminates the child process
492 }
493 # Wait all the child process
494 $pm->wait_all_children;
495
496 # Paste the file together
497 CombinedTempFile("$folder_temp/$filenameOK", "$folderAnnovar/$filenameOK".".".${refGenome}."_multianno.txt");
498 } 479 }
499 } 480 }
500 # The input file is one file 481 # The input file is one file
501 else 482 else
502 { 483 {
503 my ($filenameO, $directoriesO, $suffixO) = fileparse($input, qr/\.[^.]*/);
504
505 ################################################# 484 #################################################
506 ### Cut the files in n part ### 485 ### Cut the files in n part ###
507 ################################################# 486 #################################################
508 # Recover the number of variants in the file for deciding the number of CPU to use 487 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts
509 my $cpu = 0; 488 my $cpu = 0;
510 my $nbVariants = `wc -l $folder_temp/$filenameO-ColumnCorrect.txt`; 489 # Keep the original header
511 $nbVariants =~ /(\d+).+/; 490 my $headerOriginalFile = "";
512 491 # Save the numer of lines
513 if($1-1 <= 5000) { $cpu = 1; } 492 my $nbLine = 0;
514 elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; } 493 splitInputFile("$folder_temp/$filename-ColumnCorrect.txt", \$cpu, \$headerOriginalFile, $filename, \$nbLine);
515 elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; } 494
516 else { $cpu = $max_cpu; }
517
518 # If the number predefined can't be used on the machine use the maximum number specify by the administrator
519 if($cpu > $max_cpu) { $cpu = $max_cpu }
520
521 ## Recover the header
522 open(F1, "$folder_temp/$filenameO-ColumnCorrect.txt") or die "$!: $folder_temp/$filenameO-ColumnCorrect.txt\n";
523 my $headerOriginalFile = <F1>;
524 close F1;
525
526 ## Remove the first line of the file
527 my $fileNoHeader = "$folder_temp/$filenameO-NoHeader";
528 `sed 1d $folder_temp/$filenameO-ColumnCorrect.txt > $fileNoHeader`;
529
530 if(!-e "$folder_temp/$filenameO") { mkdir("$folder_temp/$filenameO") or die "Can't create the directory $folder_temp/$filenameO\n"; }
531 my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0
532 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameO/$filenameO-`;
533
534 if($headerOriginalFile eq "") { print STDERR "No header for the file $input!!!\nPlease check the format of your file\n"; exit; }
535 my @files = <$folder_temp/$filenameO/$filenameO-*>;
536 495
537 ################################################# 496 #################################################
538 ### Annotate the n part ### 497 ### Annotate the n part ###
539 ################################################# 498 #################################################
540 my $pm = Parallel::ForkManager->new($cpu); 499 annotateFile($cpu, $filename, $headerOriginalFile);
541 foreach my $tempFile (@files) 500
542 { 501
543 # Forks and returns the pid for the child: 502 #################################################
544 my $pid = $pm->start and next; 503 ### Paste the file together ###
545 504 #################################################
546 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo 505 createOutput($filename, $headerOriginalFile, $nbLine);
547 # For recover the name of the file without extension, the directory were the file is and the extension of the file
548 my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\.[^.]*/);
549 my $outFilenameTemp = $filename.$suffix;
550 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");
551
552 # Annotate the file with Annovar
553 my $tempFileName_AVOutput = $outFilenameTemp.".".${refGenome}."_multianno.txt";
554 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
555 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
556
557 # Check if the annotations worked
558 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
559 while(<F1>)
560 {
561 if($_ =~ /ERROR/i)
562 {
563 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
564 print STDERR $_;
565 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
566 exit;
567 }
568 }
569 close F1;
570
571 # Recover the strand orientation
572 my $length_AVheader = 0;
573 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);
574
575 # Recover the sequence context
576 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameO/$tempFileName_AVOutput");
577
578 $pm->finish; # Terminates the child process
579 }
580 # Wait all the child process
581 $pm->wait_all_children;
582
583 # Paste the file together
584 CombinedTempFile("$folder_temp/$filenameO", "$folderAnnovar/$filenameO".".".${refGenome}."_multianno.txt");
585 } 506 }
586 # Remove the temporary directory 507 # Remove the temporary directory
587 rmtree($folder_temp); 508 rmtree($folder_temp);
509 }
510
511
512
513 sub splitInputFile
514 {
515 my ($inputFile, $ref_cpu, $ref_header, $filename, $ref_nbLine) = @_;
516
517 my $nbVariants = `wc -l $inputFile`;
518 $nbVariants =~ /(\d+).+/;
519 $$ref_nbLine = $1;
520
521 if($$ref_nbLine-1 <= 5000) { $$ref_cpu = 1; }
522 elsif( ($$ref_nbLine-1 > 5000) && ($$ref_nbLine-1 < 25000) ) { $$ref_cpu = 2; }
523 elsif( ($$ref_nbLine-1 >= 25000) && ($$ref_nbLine-1 < 100000) ) { $$ref_cpu = 8; }
524 else { $$ref_cpu = $max_cpu; }
525
526 # If the number predefined can't be used on the machine use the maximum number specify by the administrator
527 if($$ref_cpu > $max_cpu) { $$ref_cpu = $max_cpu }
528
529
530 ## Recover the header
531 open(F1, $inputFile) or die "$!: $inputFile\n";
532 $$ref_header= <F1>;
533 close F1;
534
535 ## Remove the first line of the file
536 my $fileNoHeader = "$folder_temp/${filename}-NoHeader";
537 `sed 1d $inputFile > $fileNoHeader`;
538
539 if(!-e "$folder_temp/$filename") { mkdir("$folder_temp/$filename") or die "Can't create the directory $folder_temp/$filename\n"; }
540 my $lines_per_temp = int(1+($1 / $$ref_cpu)); # +1 in case of the div == 0
541 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filename/$filename-`;
542
543 if($$ref_header eq "") { print STDERR "Error message:\n"; print STDERR "No header for the file $inputFile!!!\nPlease check the format of your file\n"; }
544 }
545
546 sub annotateFile
547 {
548 my ($cpu, $filename, $headerOriginalFile) = @_;
549
550 my $pm = Parallel::ForkManager->new($cpu);
551
552 foreach my $tempFile (`ls $folder_temp/$filename/$filename-*`)
553 {
554 chomp($tempFile);
555
556 # Forks and returns the pid for the child:
557 my $pid = $pm->start and next;
558
559 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo
560 my ($filenameTempFile, $directoriesTempFile, $suffixTempFile) = fileparse($tempFile, qr/\-[^.]*/);
561 my $outFilenameTemp = $filenameTempFile.$suffixTempFile;
562 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");
563
564 # Annotate the file with Annovar
565 my $tempFileName_AVOutput = $filename.$suffixTempFile.".".${refGenome}."_multianno.txt";
566 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
567 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
568
569 # Check if the annotations worked
570 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
571 while(<F1>)
572 {
573 if($_ =~ /ERROR/i)
574 {
575 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
576 print STDERR $_;
577 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
578 }
579 }
580 close F1;
581
582 # Recover the strand orientation
583 my $length_AVheader = 0;
584 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);
585
586 # Recover the sequence context
587 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filename/$outFilenameTemp".".".${refGenome}."_multianno.txt");
588
589 $pm->finish; # Terminates the child process
590 }
591 # Wait all the child process
592 $pm->wait_all_children;
593 }
594
595 sub createOutput
596 {
597 my ($filename, $headerOriginalFile, $nbLine) = @_;
598
599 ## For MuTect and MuTect2 calling only variants passing MuTect filters are kept and sometines there is no variant passing these filters making error in Galaxy when using "collection".
600 if($nbLine == 1)
601 {
602 print STDOUT "\nThe sample $filename didn't pass MuTect filters\n";
603
604 ### Print Annovar minimal header + the original header of the input file
605 my $outputFile = "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt";
606 open(OUT, ">", $outputFile) or die "$!: $outputFile\n";
607
608 if($fullAVDB eq "no")
609 {
610 print OUT "Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tStrand\tcontext";
611 print OUT "\t".$headerOriginalFile;
612 }
613 ### Print complete Annovar header (using the database name present in the file listAVDB) + the original header of the input file
614 else
615 {
616 print OUT "Chr\tStart\tEnd\tRef\tAlt";
617 open(F1, $listAVDB) or die "$!: $listAVDB\n";
618 while(<F1>)
619 {
620 if($_ =~ /^#/) { next; }
621
622 my @tab = split("\t", $_);
623 $tab[0] =~ /$refGenome\_(.+)\.txt/;
624 my $dbName = $1;
625
626 if($dbName =~ /refGene|knownGene|ensGene/)
627 {
628 print OUT "\t"."Func.$dbName\tGene.$dbName\tGeneDetail.$dbName\tExonicFunc.$dbName\tAAChange.$dbName";
629 }
630 else
631 {
632 print OUT "\t".$dbName;
633 }
634 }
635 print OUT "\tStrand\tcontext\t".$headerOriginalFile;
636
637 close F1;
638 }
639 close OUT;
640 }
641 else
642 {
643 CombinedTempFile("$folder_temp/$filename", "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt");
644 }
588 } 645 }
589 646
590 sub Convert2AV 647 sub Convert2AV
591 { 648 {
592 my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_; 649 my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_;
599 while(<F1>) 656 while(<F1>)
600 { 657 {
601 $_ =~ s/[\r\n]+$//; 658 $_ =~ s/[\r\n]+$//;
602 my @tab = split("\t", $_); 659 my @tab = split("\t", $_);
603 my $chr = ""; 660 my $chr = "";
604
605 # Don't consider chrM and GL
606 if($tab[$chr_value] =~ /M|GL/i) { next; }
607 661
608 # Replace chr23 or chr24 by X or Y 662 # Replace chr23 or chr24 by X or Y
609 if($tab[$chr_value] =~ /23/) { $chr = "chrX"; } 663 if($tab[$chr_value] =~ /23/) { $chr = "chrX"; }
610 elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; } 664 elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; }
611 elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; } 665 elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; }
612 else { $chr = "chr".$tab[$chr_value]; } 666 else { $chr = "chr".$tab[$chr_value]; }
613 667
668
669 ### Consider only "normal" chromosomes for the annotation
670 if( ($chr !~ /chr\d{1,2}$|chrX|chrY/) ) { next; }
671
672
673 ### Don't consider variants with two or more alt bases
674 if($tab[$alt_value] =~ /\,/) { next; }
675
614 ### Reformat the Indels for Annovar 676 ### Reformat the Indels for Annovar
615 # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10) 677 # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10)
616 # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10) 678 # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10)
617 if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) ) 679 if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) )
618 { 680 {
619 ### First check if the indels in the file are not already correctly formated 681 ### First check if the indels in the file are not already correctly formated
620 if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) 682 if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") )
621 { 683 {
622 # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels) 684 # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels)
623 # Insertion: start = start & end = start 685 # Insertion: start = start & end = start
624 if($tab[$ref_value] =~ /\-/) 686 if($tab[$ref_value] =~ /\-/)
625 { 687 {
626 print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]"; 688 print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]";
627 } 689 }
628 ## Deletion: start = start & end = start + length(del) -1 690 ## Deletion: start = start & end = start + length(del) -1
629 else
630 {
631 my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1);
632 print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]";
633 }
634 }
635 ### Indels not correctly formated for Annovar
636 else 691 else
637 { 692 {
638 my @tabRef = split("", $tab[$ref_value]); 693 my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1);
639 my @tabAlt = split("", $tab[$alt_value]); 694 print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]";
640 695 }
641 # Remove the first base 696 }
642 my $ref2 = join("", @tabRef[1 .. $#tabRef]); 697 ### Indels not correctly formated for Annovar
643 my $alt2 = join("", @tabAlt[1 .. $#tabAlt]);
644
645 if(length($alt2) == 0)
646 {
647 my $altOK = "-";
648 my $startOK = $tab[$start_value] + 1;
649 my $stopOK = $startOK + length($ref2) - 1;
650 print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK;
651 }
652
653 if(length($ref2) == 0)
654 {
655 my $refOK = "-";
656 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2;
657 }
658 }
659 }
660 ### SBS
661 else 698 else
662 { 699 {
663 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value]; 700 my @tabRef = split("", $tab[$ref_value]);
664 } 701 my @tabAlt = split("", $tab[$alt_value]);
665 702
666 ## Print the original file at the end 703 # Remove the first base
667 foreach (@tab) { print OUT "\t$_"; } 704 my $ref2 = join("", @tabRef[1 .. $#tabRef]);
668 print OUT "\n"; 705 my $alt2 = join("", @tabAlt[1 .. $#tabAlt]);
706
707 if(length($alt2) == 0)
708 {
709 my $altOK = "-";
710 my $startOK = $tab[$start_value] + 1;
711 my $stopOK = $startOK + length($ref2) - 1;
712 print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK;
713 }
714
715 if(length($ref2) == 0)
716 {
717 my $refOK = "-";
718 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2;
719 }
720 }
721 }
722 ### SBS
723 else
724 {
725 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value];
726 }
727
728 ## Print the original file at the end
729 foreach (@tab) { print OUT "\t$_"; }
730 print OUT "\n";
669 } 731 }
670 close F1; close OUT; 732 close F1; close OUT;
671 } 733 }
672 734
673 sub AnnotateAV 735 sub AnnotateAV
674 { 736 {
675 my ($inputFile, $output) = @_; 737 my ($inputFile, $output) = @_;
676 738
677 if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; } 739 if(!-e $path_AVDB)
740 {
741 print STDERR "Error message:\n";
742 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n";
743 print STDERR "Please install the database for this genome before running Annovar\n";
744 }
678 745
679 # Extract the name of the databases 746 # Extract the name of the databases
680 my $protocol = ""; my $operation = ""; 747 my $protocol = ""; my $operation = "";
681 ExtractAVDBName($listAVDB, \$protocol, \$operation); 748 ExtractAVDBName($listAVDB, \$protocol, \$operation);
682 749
736 elsif($$refS_month == 8) { $$refS_month = "aug"; } 803 elsif($$refS_month == 8) { $$refS_month = "aug"; }
737 elsif($$refS_month == 9) { $$refS_month = "sept"; } 804 elsif($$refS_month == 9) { $$refS_month = "sept"; }
738 elsif($$refS_month == 10) { $$refS_month = "oct"; } 805 elsif($$refS_month == 10) { $$refS_month = "oct"; }
739 elsif($$refS_month == 11) { $$refS_month = "nov"; } 806 elsif($$refS_month == 11) { $$refS_month = "nov"; }
740 elsif($$refS_month == 12) { $$refS_month = "dec"; } 807 elsif($$refS_month == 12) { $$refS_month = "dec"; }
741 else { print STDERR "Month number don't considered\n"; exit; }
742 } 808 }
743 } 809 }
744 } 810 }
745 811
746 ### Add the minimum of annotations (refGene + strand + context) 812 ### Add the minimum of annotations (refGene + strand + context)
747 sub annotateAV_min 813 sub annotateAV_min
748 { 814 {
749 my ($inputFile, $output) = @_; 815 my ($inputFile, $output) = @_;
750 816
751 if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; } 817 if(!-e $path_AVDB)
818 {
819 print STDERR "Error message:\n";
820 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n";
821 print STDERR "Please install the database for this genome before running Annovar\n";
822 }
752 823
753 # Extract the name of the databases 824 # Extract the name of the databases
754 my ($protocol, $operation) = ("refGene", "g"); 825 my ($protocol, $operation) = ("refGene", "g");
755 826
756 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; 827 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`;
785 elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; } 856 elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; }
786 elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; } 857 elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; }
787 else { $chr = $tab[$chr_value]; } 858 else { $chr = $tab[$chr_value]; }
788 859
789 # Verify if the element exists 860 # Verify if the element exists
790 if($chr eq "") { print "Error RecoverStrand: The chromosome value is nor defined for $_\n"; exit; } 861 if($chr eq "") { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The chromosome value is nor defined for $_\n"; }
791 if(! exists $tab[$start_value]) { print "Error RecoverStrand: The start value is nor defined for $_\n"; exit; } 862 if(! exists $tab[$start_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The start value is nor defined for $_\n"; }
792 if(! exists $tab[$ref_value]) { print "Error RecoverStrand: The reference value is nor defined for $_\n"; exit; } 863 if(! exists $tab[$ref_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The reference value is nor defined for $_\n"; }
793 if(! exists $tab[$alt_value]) { print "Error RecoverStrand: The alternate value is nor defined for $_\n"; exit; } 864 if(! exists $tab[$alt_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The alternate value is nor defined for $_\n"; }
794 if(! exists $tab[$func_value]) { print "Error RecoverStrand: The functional value is nor defined for $_\n"; exit; } 865 if(! exists $tab[$func_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The functional value is nor defined for $_\n"; }
795 if(! exists $tab[$geneSymbol_value]) { print "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; exit; } 866 if(! exists $tab[$geneSymbol_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; }
796 867
797 my $geneSymbol = ""; 868 my $geneSymbol = "";
798 ######## For the splicing annotation we separate the gene symbol from the aa change 869 ######## For the splicing annotation we separate the gene symbol from the aa change
799 if($tab[$func_value] eq "splicing") 870 if($tab[$func_value] eq "splicing")
800 { 871 {
821 { 892 {
822 $_ =~ s/[\r\n]+$//; 893 $_ =~ s/[\r\n]+$//;
823 my @tab = split("\t", $_); 894 my @tab = split("\t", $_);
824 my $strand = ""; 895 my $strand = "";
825 $strand = $tab[$db_strandInfo_value]; 896 $strand = $tab[$db_strandInfo_value];
826 if($strand eq "") { print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; exit; } 897 if($strand eq "") { print STDERR "Error message:\n"; print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; }
827 else 898 else
828 { 899 {
829 # Some genes have several strand orientation, keep the first in the database 900 # Some genes have several strand orientation, keep the first in the database
830 if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; } 901 if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; }
831 } 902 }
872 943
873 foreach my $kDB (sort keys %h_database) 944 foreach my $kDB (sort keys %h_database)
874 { 945 {
875 if("$tab[5]:$tab[0]" eq $kDB) 946 if("$tab[5]:$tab[0]" eq $kDB)
876 { 947 {
877 if($lengthHeader != $lengthLine) { print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; exit; } 948 if($lengthHeader != $lengthLine)
949 {
950 print STDERR "Error message:\n";
951 print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n";
952 }
878 953
879 foreach my $line (@{$h_inputFile{$kFile}}) 954 foreach my $line (@{$h_inputFile{$kFile}})
880 { 955 {
881 my @tab = split("\t", $line); 956 my @tab = split("\t", $line);
882 my $j = 0; 957 my $j = 0;
969 my %seqhash = (); #database sequence for each chromosome 1044 my %seqhash = (); #database sequence for each chromosome
970 my %name_seq = (); #sequence for each region 1045 my %name_seq = (); #sequence for each region
971 my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon 1046 my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon
972 my ($count_success, @failure) = (0); 1047 my ($count_success, @failure) = (0);
973 1048
974 for my $curchr (sort keys $refH_allRegion) 1049 for my $curchr (sort keys %{$refH_allRegion})
975 { 1050 {
976 my ($seqid, $curseq) = ('', ''); 1051 my ($seqid, $curseq) = ('', '');
977 my $fastafile = ""; 1052 my $fastafile = "";
978 if ($curchr =~ m/^chr/) 1053 if ($curchr =~ m/^chr/)
979 { 1054 {
1078 $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); 1153 $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header);
1079 # Print the Annovar header until the column before OtherInfo 1154 # Print the Annovar header until the column before OtherInfo
1080 print OUT "$tabHeaderInput[0]"; 1155 print OUT "$tabHeaderInput[0]";
1081 my $j = 0; 1156 my $j = 0;
1082 for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } 1157 for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; }
1083 print OUT "\tcontext"; 1158 print OUT "\tcontext\ttrinucleotide_context";
1084 for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } 1159 for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; }
1085 print OUT "\n"; 1160 print OUT "\n";
1086 } 1161 }
1087 1162
1088 foreach my $k_hFile (sort keys $refH_InputFile) 1163 foreach my $k_hFile (sort keys %{$refH_InputFile})
1089 { 1164 {
1090 foreach my $k_allRegonSeqContext (sort keys $refH_allRegionSeqContext) 1165 foreach my $k_allRegonSeqContext (sort keys %{$refH_allRegionSeqContext})
1091 { 1166 {
1092 if($k_hFile eq $k_allRegonSeqContext) 1167 if($k_hFile eq $k_allRegonSeqContext)
1093 { 1168 {
1094 my $j=0; 1169 my $j=0;
1095 1170
1096 for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) 1171 for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++)
1097 { 1172 {
1098 my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); 1173 my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]);
1099 1174
1175 # Write Annovar annotation + strand orientation
1100 for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } 1176 for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; }
1177 # Write the sequence context with the length defined by the user (default is 10)
1101 print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; 1178 print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext};
1179 # Write the trinucleotide context
1180 my $contextSequence = $refH_allRegionSeqContext->{$k_allRegonSeqContext}; $contextSequence =~ tr/a-z/A-Z/;
1181 my @tempContextSequence = split("", $contextSequence);
1182 my $midlle_totalNbBaseContext = (scalar(@tempContextSequence)-1)/2; # For having the middle of the sequence
1183 print OUT "\t".$tempContextSequence[$midlle_totalNbBaseContext-1]."x".$tempContextSequence[$midlle_totalNbBaseContext+1];
1184 # Write the original columns
1102 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } 1185 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
1103 print OUT "\n"; 1186 print OUT "\n";
1104 } 1187 }
1105 last; 1188 last;
1106 } 1189 }
1138 my $name_of_column_NB = "toto"; 1221 my $name_of_column_NB = "toto";
1139 for(my $i=0; $i<=$#tab_search_header; $i++) 1222 for(my $i=0; $i<=$#tab_search_header; $i++)
1140 { 1223 {
1141 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } 1224 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; }
1142 } 1225 }
1143 if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } 1226 if($name_of_column_NB eq "toto")
1144 else { return $name_of_column_NB; } 1227 {
1228 print STDERR "Error message:\n";
1229 print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n";
1230 }
1231 else { return $name_of_column_NB; }
1145 } 1232 }
1146 1233
1147 1234
1148 1235
1149 1236
1163 -v, --verbose use verbose output 1250 -v, --verbose use verbose output
1164 --refGenome the reference genome to use 1251 --refGenome the reference genome to use
1165 --interval <interger> the number of bases for the sequence context 1252 --interval <interger> the number of bases for the sequence context
1166 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file 1253 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file
1167 -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size 1254 -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size
1168 --pathAVDBList the path to the list of AV databases installed 1255 --pathAVDBList the path to a text file containing the list of Annovar databases installed
1169 -temp --pathTemporary <string> the path for saving the temporary files 1256 -temp --pathTemporary <string> the path for saving the temporary files
1170 --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) 1257 --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no)
1258 --max_cpu <integer> number of CPUs to be used for the annotation
1171 1259
1172 1260
1173 Function: automatically run a pipeline on a list of variants and annote them using Annovar 1261 Function: automatically run a pipeline on a list of variants and annote them using Annovar
1174 1262
1175 Example: # Annotation only 1263 Example: # Annotation only
1176 mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input 1264 mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input
1177 1265
1178 1266
1179 Version: 06-2016 (June 2016) 1267 Version: 03-2017 (March 2017)
1180 1268
1181 1269
1182 =head1 OPTIONS 1270 =head1 OPTIONS
1183 1271
1184 =over 8 1272 =over 8
1193 1281
1194 =item B<--verbose> 1282 =item B<--verbose>
1195 1283
1196 use verbose output. 1284 use verbose output.
1197 1285
1198
1199 =item B<--refGenome> 1286 =item B<--refGenome>
1200 1287
1201 the reference genome to use, could be hg19 or mm9. 1288 the reference genome to use, could be hg19 or mm9.
1202 1289
1203 =item B<--interval> 1290 =item B<--interval>
1212 1299
1213 the path to the directory containing the Annovar databases and the files with the chromosome size. 1300 the path to the directory containing the Annovar databases and the files with the chromosome size.
1214 1301
1215 =item B<--pathAVDBList> 1302 =item B<--pathAVDBList>
1216 1303
1217 the path to a texte file containing the list of the Annovar databases installed. 1304 the path to a text file containing the list of Annovar databases installed.
1218 1305
1219 =item B<--pathTemporary> 1306 =item B<--pathTemporary>
1220 1307
1221 the path for saving temporary files generated by the script. 1308 the path for saving temporary files generated by the script.
1222 If any is specify a temporary folder is created in the same directory where the script is running. 1309 If any is specify a temporary folder is created in the same directory where the script is running.
1224 1311
1225 =item B<--fullAnnotation> 1312 =item B<--fullAnnotation>
1226 1313
1227 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines) 1314 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines)
1228 1315
1316 =item B<--max_cpu>
1317
1318 Specify the number of CPUs to be used. This number is used for spliting the file in n part and running the annotations in each part in parallel.
1319
1320
1229 =head1 DESCRIPTION 1321 =head1 DESCRIPTION
1230 1322
1231 MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS. 1323 MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS.
1232 Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. 1324 Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added.
1233 A text tab delimited file is produced. 1325 A text tab delimited file is produced.