Mercurial > repos > iarc > mutspec
comparison mutspecAnnot.pl @ 7:eda59b985b1c draft default tip
Uploaded
author | iarc |
---|---|
date | Mon, 13 Mar 2017 08:21:19 -0400 |
parents | 46a10309dfe2 |
children |
comparison
equal
deleted
inserted
replaced
6:46a10309dfe2 | 7:eda59b985b1c |
---|---|
1 #!/usr/bin/env perl | 1 #!/usr/bin/env perl |
2 | 2 |
3 #-----------------------------------# | 3 #-----------------------------------# |
4 # Author: Maude # | 4 # Author: Maude # |
5 # Script: mutspecAnnot.pl # | 5 # Script: mutspecAnnot.pl # |
6 # Last update: 21/06/16 # | 6 # Last update: 02/03/17 # |
7 #-----------------------------------# | 7 #-----------------------------------# |
8 | 8 |
9 use strict; | 9 use strict; |
10 use warnings; | 10 use warnings; |
11 use Getopt::Long; | 11 use Getopt::Long; |
18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. | 18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. |
19 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files | 19 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files |
20 our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context. | 20 our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context. |
21 our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines) | 21 our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines) |
22 | 22 |
23 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp) or pod2usage(2); | 23 |
24 ######################################### | |
25 ### SPECIFY THE NUMBER OF CPU ### | |
26 ######################################### | |
27 our ($max_cpu) = 8; # Max number of CPU to use for the annotation | |
28 | |
29 | |
30 | |
31 | |
32 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp, 'max_cpu=i' => \$max_cpu) or pod2usage(2); | |
24 | 33 |
25 our ($input) = @ARGV; | 34 our ($input) = @ARGV; |
26 | 35 |
27 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); | 36 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); |
28 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); | 37 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); |
33 | 42 |
34 ###################################################################################################################################################### | 43 ###################################################################################################################################################### |
35 # GLOBAL VARIABLES # | 44 # GLOBAL VARIABLES # |
36 ###################################################################################################################################################### | 45 ###################################################################################################################################################### |
37 | 46 |
38 ######################################### | |
39 ### SPECIFY THE NUMBER OF CPU ### | |
40 ######################################### | |
41 our $max_cpu = 1; # Max number of CPU to use for the annotation | |
42 | |
43 | |
44 # Recover the current path | 47 # Recover the current path |
45 our $pwd = `pwd`; | 48 our $pwd = `pwd`; |
46 chomp($pwd); | 49 chomp($pwd); |
47 | 50 # Recover the filename and the input directory |
48 # Input file path | 51 our ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); |
49 our @pathInput = split("/", $input); | |
50 # Output directories | 52 # Output directories |
51 our ($folderMutAnalysis, $folderAnnovar) = ("", ""); | 53 our ($folderMutAnalysis, $folderAnnovar) = ("", ""); |
52 # File with the list of Annovar databases to use | 54 # File with the list of Annovar databases to use |
53 our $listAVDB = ""; | 55 our $listAVDB = ""; |
54 # Initialisation of chromosome, position, ref and alt values | 56 # Initialisation of chromosome, position, ref and alt values |
55 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a"); | 57 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a"); |
56 | 58 |
57 | 59 |
60 | |
58 ###################################################################################################################################################### | 61 ###################################################################################################################################################### |
59 # MAIN # | 62 # MAIN # |
60 ###################################################################################################################################################### | 63 ###################################################################################################################################################### |
61 ## Check the presence of the flags and create the output and temp directories | 64 ## Check the presence of the flags and create the output and temp directories |
62 CheckFlags(); | 65 CheckFlags(); |
63 | 66 |
64 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions | 67 ## Format the file correctly: |
68 ## 1) Check the length of the filename (must be <= 31 characters) | |
69 ## 2) Recover the input format. If MuTect output consider only "KEEP" variants | |
70 ## 3) Recover the column number for chr, start, ref and alt | |
65 FormatingInputFile(); | 71 FormatingInputFile(); |
66 | 72 |
67 # Annotate the file with Annovar, add the strand orientation and the sequence context | 73 # Annotate the file with Annovar, add the strand orientation and the sequence context |
68 FullAnnotation(); | 74 FullAnnotation(); |
69 | 75 |
73 | 79 |
74 ## Check the presence of the flags and create the output and temp directories | 80 ## Check the presence of the flags and create the output and temp directories |
75 sub CheckFlags | 81 sub CheckFlags |
76 { | 82 { |
77 # Check the reference genome | 83 # Check the reference genome |
78 if($refGenome eq "empty") { print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; } | 84 if($refGenome eq "empty") |
79 if($intervalEnd eq "empty") { print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; exit; } | 85 { |
86 print STDERR "Missing flag !\n"; | |
87 print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; | |
88 exit; | |
89 } | |
90 if($intervalEnd eq "empty") | |
91 { | |
92 print STDERR "Missing flag !\n"; | |
93 print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n"; | |
94 exit; | |
95 } | |
80 # If no output is specified write the result as the same place as the input file | 96 # If no output is specified write the result as the same place as the input file |
81 if($output eq "empty") | 97 if($output eq "empty") |
82 { | 98 { |
83 my $folderRes = ""; | 99 my $directory = dirname( $input ); |
84 for(my $i=0; $i<$#pathInput; $i++) { $folderRes .= "$pathInput[$i]/"; } | 100 |
85 | 101 $folderMutAnalysis = "$directory/Mutational_Analysis"; |
86 $folderMutAnalysis = "$folderRes/Mutational_Analysis"; | |
87 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } | 102 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } |
88 } | 103 } |
89 else | 104 else |
90 { | 105 { |
91 if(!-e $output) { mkdir($output) or die "$!: $output\n"; } | 106 if(!-e $output) { mkdir($output) or die "$!: $output\n"; } |
96 # Create the output folder for Annovar | 111 # Create the output folder for Annovar |
97 $folderAnnovar = "$folderMutAnalysis/Annovar"; | 112 $folderAnnovar = "$folderMutAnalysis/Annovar"; |
98 if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; } | 113 if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; } |
99 | 114 |
100 # Verify the access to Annovar databases | 115 # Verify the access to Annovar databases |
101 if($path_AVDB eq "empty") { print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; exit; } | 116 if($path_AVDB eq "empty") |
102 elsif(!-e $path_AVDB) { print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; exit; } | 117 { |
118 print STDERR "Missing flag !\n"; | |
119 print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; | |
120 exit; | |
121 } | |
122 elsif(!-e $path_AVDB) | |
123 { | |
124 print STDERR "Error message:\n"; | |
125 print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; | |
126 } | |
103 | 127 |
104 # Check the file list AV DB | 128 # Check the file list AV DB |
105 if($pathAVDBList eq "empty") { print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; exit; } | 129 if($pathAVDBList eq "empty") |
130 { | |
131 print STDERR "Missing flag !\n"; | |
132 print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; | |
133 exit; | |
134 } | |
106 else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" } | 135 else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" } |
107 | 136 |
108 # If no temp folder is specified write the result in the current path | 137 # If no temp folder is specified write the result in the current path |
109 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$pathInput[$#pathInput]"; } | 138 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; } |
110 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } | 139 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } |
140 | |
141 # Verify listAVDB is not empty | |
142 if($listAVDB eq "") | |
143 { | |
144 print STDERR "Path to the text file containing the list of Annovar databases installed is not specified !!!\n"; | |
145 exit; | |
146 } | |
111 } | 147 } |
112 | 148 |
113 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions | 149 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions |
114 sub FormatingInputFile | 150 sub FormatingInputFile |
115 { | 151 { |
118 { | 154 { |
119 foreach my $file (`ls $input`) | 155 foreach my $file (`ls $input`) |
120 { | 156 { |
121 my $headerOriginalFile = ""; | 157 my $headerOriginalFile = ""; |
122 chomp($file); | 158 chomp($file); |
123 my ($filename, $directories, $suffix) = fileparse("$input/$file", qr/\.[^.]*/); | |
124 | 159 |
125 CheckLengthFilename("$input/$file"); | 160 CheckLengthFilename("$input/$file"); |
126 | 161 |
127 ################################################# | 162 ################################################# |
128 ### Recover the input format ### | 163 ### Recover the input format ### |
134 else | 169 else |
135 { | 170 { |
136 my $headerOriginalFile = ""; | 171 my $headerOriginalFile = ""; |
137 | 172 |
138 CheckLengthFilename($input); | 173 CheckLengthFilename($input); |
139 my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); | |
140 | 174 |
141 ################################################# | 175 ################################################# |
142 ### Recover the input format ### | 176 ### Recover the input format ### |
143 ################################################# | 177 ################################################# |
144 RecoverInputFormat($input, \$headerOriginalFile); | 178 RecoverInputFormat($input, \$headerOriginalFile); |
148 # The name for the Excel sheet can't be longer than 31 characters | 182 # The name for the Excel sheet can't be longer than 31 characters |
149 sub CheckLengthFilename | 183 sub CheckLengthFilename |
150 { | 184 { |
151 my ($inputFile) = @_; | 185 my ($inputFile) = @_; |
152 | 186 |
153 ## Verify the name of file, must be <= 31 chars for the sheet name | 187 my ($filenameInputFile, $directoriesInputFile, $suffixInputFile) = fileparse($inputFile, qr/\.[^.]*/); |
154 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); | 188 |
155 | 189 if(length($filenameInputFile) > 32) |
156 if(length($filename) > 32) { print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; exit; } | 190 { |
157 } | 191 print STDERR "Error message:\n"; |
158 | 192 print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; |
159 # Recover the input format (vcf or txt) and depending on the format convert the input file in a suitable format for Annovar (ex: for MuTect files keep only the confident variants) | 193 } |
194 } | |
195 | |
196 # Recover the input format. If MuTect output consider only "KEEP" variants | |
160 sub RecoverInputFormat | 197 sub RecoverInputFormat |
161 { | 198 { |
162 my ($file, $refS_headerOriginalFile) = @_; | 199 my ($file, $refS_headerOriginalFile) = @_; |
163 | 200 |
164 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); | 201 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); |
319 # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. | 356 # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. |
320 sub RecoverColNameAuto | 357 sub RecoverColNameAuto |
321 { | 358 { |
322 our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_; | 359 our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_; |
323 | 360 |
324 $header =~ s/[\r\n]+$//; | 361 $header =~ s/[\r\n]+$//; |
325 | 362 |
326 ## Name of the columns | 363 ## Name of the columns |
327 my @mutect = qw(contig position ref_allele alt_allele); | 364 my @mutect = qw(contig position ref_allele alt_allele); |
328 my @vcf = qw(CHROM POS REF ALT); | 365 my @vcf = qw(CHROM POS REF ALT); |
329 my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic); | 366 my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic); |
330 my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele); | 367 my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele); |
331 my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2); | 368 my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2); |
332 my @ionTorrent = qw(chr Position Ref Alt); | 369 my @ionTorrent = qw(chr Position Ref Alt); |
333 my @proton = qw(Chrom Position Ref Variant); | 370 my @proton = qw(Chrom Position Ref Variant); |
334 my @varScan2 = qw(Chrom Position Ref VarAllele); | 371 my @varScan2 = qw(Chrom Position Ref VarAllele); |
335 my @annovar = qw(Chr Start Ref Obs); | 372 my @varScan2_somatic = qw(chrom position ref var); |
336 my @custom = qw(Chromosome Start Wild_Type Mutant); | 373 my @annovar = qw(Chr Start Ref Obs); |
337 | 374 my @custom = qw(Chromosome Start Wild_Type Mutant); |
338 my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@annovar, \@custom); | 375 |
376 my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@varScan2_somatic, \@annovar, \@custom); | |
339 my $timer = 0; # For controlling if the names are present on the dictionnary or not | 377 my $timer = 0; # For controlling if the names are present on the dictionnary or not |
340 | 378 |
341 foreach my $refTab (@allTab) | 379 foreach my $refTab (@allTab) |
342 { | 380 { |
343 my @tab = @$refTab; | 381 my @tab = @$refTab; |
350 else { $timer++; } | 388 else { $timer++; } |
351 } | 389 } |
352 | 390 |
353 if($timer == scalar(@allTab)) | 391 if($timer == scalar(@allTab)) |
354 { | 392 { |
393 print STDERR "Error message:\n"; | |
355 print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n"; | 394 print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n"; |
356 print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n"; | 395 print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n"; |
357 exit; | 396 exit; |
358 } | 397 } |
359 | 398 |
402 foreach my $file (`ls $folder_temp/*.txt`) | 441 foreach my $file (`ls $folder_temp/*.txt`) |
403 { | 442 { |
404 chomp($file); | 443 chomp($file); |
405 | 444 |
406 # For recover the name of the file without extension, the directory where the file is and the extension of the file | 445 # For recover the name of the file without extension, the directory where the file is and the extension of the file |
407 my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/); | 446 my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/); |
408 my $filenameOK = ""; | 447 my $filenameOK = ""; |
409 # For removing the ColumnCorrect for txt files | 448 # For removing the ColumnCorrect for txt files |
410 if($filename =~ /(.+)-ColumnCorrect/) | 449 if($filename =~ /(.+)-ColumnCorrect/) |
411 { | 450 { |
412 if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; } | 451 if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; } |
413 else { $filenameOK = $1; } | 452 else { $filenameOK = $1; } |
414 } | 453 } |
415 else { print STDERR "Case not considered for $filename!!!\n"; exit; } | 454 else { print STDERR "Error message:\n"; print STDERR "Case not considered for $filename!!!\n"; } |
416 | 455 |
417 | 456 |
418 ################################################# | 457 ################################################# |
419 ### Cut the files in n part ### | 458 ### Cut the files in n part ### |
420 ################################################# | 459 ################################################# |
421 # Recover the number of variants in the file for deciding the number of CPU to use | 460 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts |
422 my $cpu = 0; | 461 my $cpu = 0; |
423 my $nbVariants = `wc -l $file`; | 462 # Keep the original header |
424 $nbVariants =~ /(\d+).+/; | 463 my $headerOriginalFile = ""; |
425 | 464 # Save the numer of lines |
426 if($1-1 <= 5000) { $cpu = 1; } | 465 my $nbLine = 0; |
427 elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; } | 466 splitInputFile($file, \$cpu, \$headerOriginalFile, $filenameOK, \$nbLine); |
428 elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; } | 467 |
429 else { $cpu = $max_cpu; } | |
430 | |
431 # If the number predefined can't be used on the machine use the maximum number specify by the administrator | |
432 if($cpu > $max_cpu) { $cpu = $max_cpu } | |
433 | |
434 ## Recover the header | |
435 open(F1, $file) or die "$!: $file\n"; | |
436 my $headerOriginalFile = <F1>; | |
437 close F1; | |
438 | |
439 ## Remove the first line of the file | |
440 my $fileNoHeader = "$folder_temp/${filenameOK}-NoHeader"; | |
441 `sed 1d $file > $fileNoHeader`; | |
442 | |
443 if(!-e "$folder_temp/$filenameOK") { mkdir("$folder_temp/$filenameOK") or die "Can't create the directory $folder_temp/$filenameOK\n"; } | |
444 my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0 | |
445 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameOK/$filenameOK-`; | |
446 | |
447 if($headerOriginalFile eq "") { print STDERR "No header for the file $file!!!\nPlease check the format of your file\n"; exit; } | |
448 my @files = <$folder_temp/$filenameOK/$filenameOK-*>; | |
449 | 468 |
450 ################################################# | 469 ################################################# |
451 ### Annotate the n part ### | 470 ### Annotate the n part ### |
452 ################################################# | 471 ################################################# |
453 my $pm = Parallel::ForkManager->new($cpu); | 472 annotateFile($cpu, $filenameOK, $headerOriginalFile); |
454 | 473 |
455 foreach my $tempFile (@files) | 474 |
456 { | 475 ################################################# |
457 # Forks and returns the pid for the child: | 476 ### Paste the file together ### |
458 my $pid = $pm->start and next; | 477 ################################################# |
459 | 478 createOutput($filenameOK, $headerOriginalFile, $nbLine); |
460 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo | |
461 my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\-[^.]*/); | |
462 my $outFilenameTemp = $filename.$suffix; | |
463 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); | |
464 | |
465 # Annotate the file with Annovar | |
466 my $tempFileName_AVOutput = $filename.$suffix.".".${refGenome}."_multianno.txt"; | |
467 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
468 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
469 | |
470 # Check if the annotations worked | |
471 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; | |
472 while(<F1>) | |
473 { | |
474 if($_ =~ /ERROR/i) | |
475 { | |
476 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; | |
477 print STDERR $_; | |
478 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; | |
479 exit; | |
480 } | |
481 } | |
482 close F1; | |
483 | |
484 # Recover the strand orientation | |
485 my $length_AVheader = 0; | |
486 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); | |
487 | |
488 # Recover the sequence context | |
489 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameOK/$outFilenameTemp".".".${refGenome}."_multianno.txt"); | |
490 | |
491 $pm->finish; # Terminates the child process | |
492 } | |
493 # Wait all the child process | |
494 $pm->wait_all_children; | |
495 | |
496 # Paste the file together | |
497 CombinedTempFile("$folder_temp/$filenameOK", "$folderAnnovar/$filenameOK".".".${refGenome}."_multianno.txt"); | |
498 } | 479 } |
499 } | 480 } |
500 # The input file is one file | 481 # The input file is one file |
501 else | 482 else |
502 { | 483 { |
503 my ($filenameO, $directoriesO, $suffixO) = fileparse($input, qr/\.[^.]*/); | |
504 | |
505 ################################################# | 484 ################################################# |
506 ### Cut the files in n part ### | 485 ### Cut the files in n part ### |
507 ################################################# | 486 ################################################# |
508 # Recover the number of variants in the file for deciding the number of CPU to use | 487 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts |
509 my $cpu = 0; | 488 my $cpu = 0; |
510 my $nbVariants = `wc -l $folder_temp/$filenameO-ColumnCorrect.txt`; | 489 # Keep the original header |
511 $nbVariants =~ /(\d+).+/; | 490 my $headerOriginalFile = ""; |
512 | 491 # Save the numer of lines |
513 if($1-1 <= 5000) { $cpu = 1; } | 492 my $nbLine = 0; |
514 elsif( ($1-1 > 5000) && ($1-1 < 25000) ) { $cpu = 2; } | 493 splitInputFile("$folder_temp/$filename-ColumnCorrect.txt", \$cpu, \$headerOriginalFile, $filename, \$nbLine); |
515 elsif( ($1-1 >= 25000) && ($1-1 < 100000) ) { $cpu = 8; } | 494 |
516 else { $cpu = $max_cpu; } | |
517 | |
518 # If the number predefined can't be used on the machine use the maximum number specify by the administrator | |
519 if($cpu > $max_cpu) { $cpu = $max_cpu } | |
520 | |
521 ## Recover the header | |
522 open(F1, "$folder_temp/$filenameO-ColumnCorrect.txt") or die "$!: $folder_temp/$filenameO-ColumnCorrect.txt\n"; | |
523 my $headerOriginalFile = <F1>; | |
524 close F1; | |
525 | |
526 ## Remove the first line of the file | |
527 my $fileNoHeader = "$folder_temp/$filenameO-NoHeader"; | |
528 `sed 1d $folder_temp/$filenameO-ColumnCorrect.txt > $fileNoHeader`; | |
529 | |
530 if(!-e "$folder_temp/$filenameO") { mkdir("$folder_temp/$filenameO") or die "Can't create the directory $folder_temp/$filenameO\n"; } | |
531 my $lines_per_temp = int(1+($1 / $cpu)); # +1 in case of the div == 0 | |
532 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameO/$filenameO-`; | |
533 | |
534 if($headerOriginalFile eq "") { print STDERR "No header for the file $input!!!\nPlease check the format of your file\n"; exit; } | |
535 my @files = <$folder_temp/$filenameO/$filenameO-*>; | |
536 | 495 |
537 ################################################# | 496 ################################################# |
538 ### Annotate the n part ### | 497 ### Annotate the n part ### |
539 ################################################# | 498 ################################################# |
540 my $pm = Parallel::ForkManager->new($cpu); | 499 annotateFile($cpu, $filename, $headerOriginalFile); |
541 foreach my $tempFile (@files) | 500 |
542 { | 501 |
543 # Forks and returns the pid for the child: | 502 ################################################# |
544 my $pid = $pm->start and next; | 503 ### Paste the file together ### |
545 | 504 ################################################# |
546 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo | 505 createOutput($filename, $headerOriginalFile, $nbLine); |
547 # For recover the name of the file without extension, the directory were the file is and the extension of the file | |
548 my ($filename, $directories, $suffix) = fileparse($tempFile, qr/\.[^.]*/); | |
549 my $outFilenameTemp = $filename.$suffix; | |
550 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); | |
551 | |
552 # Annotate the file with Annovar | |
553 my $tempFileName_AVOutput = $outFilenameTemp.".".${refGenome}."_multianno.txt"; | |
554 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
555 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
556 | |
557 # Check if the annotations worked | |
558 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; | |
559 while(<F1>) | |
560 { | |
561 if($_ =~ /ERROR/i) | |
562 { | |
563 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; | |
564 print STDERR $_; | |
565 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; | |
566 exit; | |
567 } | |
568 } | |
569 close F1; | |
570 | |
571 # Recover the strand orientation | |
572 my $length_AVheader = 0; | |
573 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); | |
574 | |
575 # Recover the sequence context | |
576 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameO/$tempFileName_AVOutput"); | |
577 | |
578 $pm->finish; # Terminates the child process | |
579 } | |
580 # Wait all the child process | |
581 $pm->wait_all_children; | |
582 | |
583 # Paste the file together | |
584 CombinedTempFile("$folder_temp/$filenameO", "$folderAnnovar/$filenameO".".".${refGenome}."_multianno.txt"); | |
585 } | 506 } |
586 # Remove the temporary directory | 507 # Remove the temporary directory |
587 rmtree($folder_temp); | 508 rmtree($folder_temp); |
509 } | |
510 | |
511 | |
512 | |
513 sub splitInputFile | |
514 { | |
515 my ($inputFile, $ref_cpu, $ref_header, $filename, $ref_nbLine) = @_; | |
516 | |
517 my $nbVariants = `wc -l $inputFile`; | |
518 $nbVariants =~ /(\d+).+/; | |
519 $$ref_nbLine = $1; | |
520 | |
521 if($$ref_nbLine-1 <= 5000) { $$ref_cpu = 1; } | |
522 elsif( ($$ref_nbLine-1 > 5000) && ($$ref_nbLine-1 < 25000) ) { $$ref_cpu = 2; } | |
523 elsif( ($$ref_nbLine-1 >= 25000) && ($$ref_nbLine-1 < 100000) ) { $$ref_cpu = 8; } | |
524 else { $$ref_cpu = $max_cpu; } | |
525 | |
526 # If the number predefined can't be used on the machine use the maximum number specify by the administrator | |
527 if($$ref_cpu > $max_cpu) { $$ref_cpu = $max_cpu } | |
528 | |
529 | |
530 ## Recover the header | |
531 open(F1, $inputFile) or die "$!: $inputFile\n"; | |
532 $$ref_header= <F1>; | |
533 close F1; | |
534 | |
535 ## Remove the first line of the file | |
536 my $fileNoHeader = "$folder_temp/${filename}-NoHeader"; | |
537 `sed 1d $inputFile > $fileNoHeader`; | |
538 | |
539 if(!-e "$folder_temp/$filename") { mkdir("$folder_temp/$filename") or die "Can't create the directory $folder_temp/$filename\n"; } | |
540 my $lines_per_temp = int(1+($1 / $$ref_cpu)); # +1 in case of the div == 0 | |
541 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filename/$filename-`; | |
542 | |
543 if($$ref_header eq "") { print STDERR "Error message:\n"; print STDERR "No header for the file $inputFile!!!\nPlease check the format of your file\n"; } | |
544 } | |
545 | |
546 sub annotateFile | |
547 { | |
548 my ($cpu, $filename, $headerOriginalFile) = @_; | |
549 | |
550 my $pm = Parallel::ForkManager->new($cpu); | |
551 | |
552 foreach my $tempFile (`ls $folder_temp/$filename/$filename-*`) | |
553 { | |
554 chomp($tempFile); | |
555 | |
556 # Forks and returns the pid for the child: | |
557 my $pid = $pm->start and next; | |
558 | |
559 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo | |
560 my ($filenameTempFile, $directoriesTempFile, $suffixTempFile) = fileparse($tempFile, qr/\-[^.]*/); | |
561 my $outFilenameTemp = $filenameTempFile.$suffixTempFile; | |
562 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput"); | |
563 | |
564 # Annotate the file with Annovar | |
565 my $tempFileName_AVOutput = $filename.$suffixTempFile.".".${refGenome}."_multianno.txt"; | |
566 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
567 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); } | |
568 | |
569 # Check if the annotations worked | |
570 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n"; | |
571 while(<F1>) | |
572 { | |
573 if($_ =~ /ERROR/i) | |
574 { | |
575 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n"; | |
576 print STDERR $_; | |
577 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n"; | |
578 } | |
579 } | |
580 close F1; | |
581 | |
582 # Recover the strand orientation | |
583 my $length_AVheader = 0; | |
584 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader); | |
585 | |
586 # Recover the sequence context | |
587 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filename/$outFilenameTemp".".".${refGenome}."_multianno.txt"); | |
588 | |
589 $pm->finish; # Terminates the child process | |
590 } | |
591 # Wait all the child process | |
592 $pm->wait_all_children; | |
593 } | |
594 | |
595 sub createOutput | |
596 { | |
597 my ($filename, $headerOriginalFile, $nbLine) = @_; | |
598 | |
599 ## For MuTect and MuTect2 calling only variants passing MuTect filters are kept and sometines there is no variant passing these filters making error in Galaxy when using "collection". | |
600 if($nbLine == 1) | |
601 { | |
602 print STDOUT "\nThe sample $filename didn't pass MuTect filters\n"; | |
603 | |
604 ### Print Annovar minimal header + the original header of the input file | |
605 my $outputFile = "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt"; | |
606 open(OUT, ">", $outputFile) or die "$!: $outputFile\n"; | |
607 | |
608 if($fullAVDB eq "no") | |
609 { | |
610 print OUT "Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tStrand\tcontext"; | |
611 print OUT "\t".$headerOriginalFile; | |
612 } | |
613 ### Print complete Annovar header (using the database name present in the file listAVDB) + the original header of the input file | |
614 else | |
615 { | |
616 print OUT "Chr\tStart\tEnd\tRef\tAlt"; | |
617 open(F1, $listAVDB) or die "$!: $listAVDB\n"; | |
618 while(<F1>) | |
619 { | |
620 if($_ =~ /^#/) { next; } | |
621 | |
622 my @tab = split("\t", $_); | |
623 $tab[0] =~ /$refGenome\_(.+)\.txt/; | |
624 my $dbName = $1; | |
625 | |
626 if($dbName =~ /refGene|knownGene|ensGene/) | |
627 { | |
628 print OUT "\t"."Func.$dbName\tGene.$dbName\tGeneDetail.$dbName\tExonicFunc.$dbName\tAAChange.$dbName"; | |
629 } | |
630 else | |
631 { | |
632 print OUT "\t".$dbName; | |
633 } | |
634 } | |
635 print OUT "\tStrand\tcontext\t".$headerOriginalFile; | |
636 | |
637 close F1; | |
638 } | |
639 close OUT; | |
640 } | |
641 else | |
642 { | |
643 CombinedTempFile("$folder_temp/$filename", "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt"); | |
644 } | |
588 } | 645 } |
589 | 646 |
590 sub Convert2AV | 647 sub Convert2AV |
591 { | 648 { |
592 my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_; | 649 my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_; |
599 while(<F1>) | 656 while(<F1>) |
600 { | 657 { |
601 $_ =~ s/[\r\n]+$//; | 658 $_ =~ s/[\r\n]+$//; |
602 my @tab = split("\t", $_); | 659 my @tab = split("\t", $_); |
603 my $chr = ""; | 660 my $chr = ""; |
604 | |
605 # Don't consider chrM and GL | |
606 if($tab[$chr_value] =~ /M|GL/i) { next; } | |
607 | 661 |
608 # Replace chr23 or chr24 by X or Y | 662 # Replace chr23 or chr24 by X or Y |
609 if($tab[$chr_value] =~ /23/) { $chr = "chrX"; } | 663 if($tab[$chr_value] =~ /23/) { $chr = "chrX"; } |
610 elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; } | 664 elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; } |
611 elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; } | 665 elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; } |
612 else { $chr = "chr".$tab[$chr_value]; } | 666 else { $chr = "chr".$tab[$chr_value]; } |
613 | 667 |
668 | |
669 ### Consider only "normal" chromosomes for the annotation | |
670 if( ($chr !~ /chr\d{1,2}$|chrX|chrY/) ) { next; } | |
671 | |
672 | |
673 ### Don't consider variants with two or more alt bases | |
674 if($tab[$alt_value] =~ /\,/) { next; } | |
675 | |
614 ### Reformat the Indels for Annovar | 676 ### Reformat the Indels for Annovar |
615 # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10) | 677 # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10) |
616 # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10) | 678 # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10) |
617 if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) ) | 679 if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) ) |
618 { | 680 { |
619 ### First check if the indels in the file are not already correctly formated | 681 ### First check if the indels in the file are not already correctly formated |
620 if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) | 682 if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) |
621 { | 683 { |
622 # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels) | 684 # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels) |
623 # Insertion: start = start & end = start | 685 # Insertion: start = start & end = start |
624 if($tab[$ref_value] =~ /\-/) | 686 if($tab[$ref_value] =~ /\-/) |
625 { | 687 { |
626 print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]"; | 688 print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]"; |
627 } | 689 } |
628 ## Deletion: start = start & end = start + length(del) -1 | 690 ## Deletion: start = start & end = start + length(del) -1 |
629 else | |
630 { | |
631 my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1); | |
632 print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]"; | |
633 } | |
634 } | |
635 ### Indels not correctly formated for Annovar | |
636 else | 691 else |
637 { | 692 { |
638 my @tabRef = split("", $tab[$ref_value]); | 693 my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1); |
639 my @tabAlt = split("", $tab[$alt_value]); | 694 print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]"; |
640 | 695 } |
641 # Remove the first base | 696 } |
642 my $ref2 = join("", @tabRef[1 .. $#tabRef]); | 697 ### Indels not correctly formated for Annovar |
643 my $alt2 = join("", @tabAlt[1 .. $#tabAlt]); | |
644 | |
645 if(length($alt2) == 0) | |
646 { | |
647 my $altOK = "-"; | |
648 my $startOK = $tab[$start_value] + 1; | |
649 my $stopOK = $startOK + length($ref2) - 1; | |
650 print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK; | |
651 } | |
652 | |
653 if(length($ref2) == 0) | |
654 { | |
655 my $refOK = "-"; | |
656 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2; | |
657 } | |
658 } | |
659 } | |
660 ### SBS | |
661 else | 698 else |
662 { | 699 { |
663 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value]; | 700 my @tabRef = split("", $tab[$ref_value]); |
664 } | 701 my @tabAlt = split("", $tab[$alt_value]); |
665 | 702 |
666 ## Print the original file at the end | 703 # Remove the first base |
667 foreach (@tab) { print OUT "\t$_"; } | 704 my $ref2 = join("", @tabRef[1 .. $#tabRef]); |
668 print OUT "\n"; | 705 my $alt2 = join("", @tabAlt[1 .. $#tabAlt]); |
706 | |
707 if(length($alt2) == 0) | |
708 { | |
709 my $altOK = "-"; | |
710 my $startOK = $tab[$start_value] + 1; | |
711 my $stopOK = $startOK + length($ref2) - 1; | |
712 print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK; | |
713 } | |
714 | |
715 if(length($ref2) == 0) | |
716 { | |
717 my $refOK = "-"; | |
718 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2; | |
719 } | |
720 } | |
721 } | |
722 ### SBS | |
723 else | |
724 { | |
725 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value]; | |
726 } | |
727 | |
728 ## Print the original file at the end | |
729 foreach (@tab) { print OUT "\t$_"; } | |
730 print OUT "\n"; | |
669 } | 731 } |
670 close F1; close OUT; | 732 close F1; close OUT; |
671 } | 733 } |
672 | 734 |
673 sub AnnotateAV | 735 sub AnnotateAV |
674 { | 736 { |
675 my ($inputFile, $output) = @_; | 737 my ($inputFile, $output) = @_; |
676 | 738 |
677 if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; } | 739 if(!-e $path_AVDB) |
740 { | |
741 print STDERR "Error message:\n"; | |
742 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; | |
743 print STDERR "Please install the database for this genome before running Annovar\n"; | |
744 } | |
678 | 745 |
679 # Extract the name of the databases | 746 # Extract the name of the databases |
680 my $protocol = ""; my $operation = ""; | 747 my $protocol = ""; my $operation = ""; |
681 ExtractAVDBName($listAVDB, \$protocol, \$operation); | 748 ExtractAVDBName($listAVDB, \$protocol, \$operation); |
682 | 749 |
736 elsif($$refS_month == 8) { $$refS_month = "aug"; } | 803 elsif($$refS_month == 8) { $$refS_month = "aug"; } |
737 elsif($$refS_month == 9) { $$refS_month = "sept"; } | 804 elsif($$refS_month == 9) { $$refS_month = "sept"; } |
738 elsif($$refS_month == 10) { $$refS_month = "oct"; } | 805 elsif($$refS_month == 10) { $$refS_month = "oct"; } |
739 elsif($$refS_month == 11) { $$refS_month = "nov"; } | 806 elsif($$refS_month == 11) { $$refS_month = "nov"; } |
740 elsif($$refS_month == 12) { $$refS_month = "dec"; } | 807 elsif($$refS_month == 12) { $$refS_month = "dec"; } |
741 else { print STDERR "Month number don't considered\n"; exit; } | |
742 } | 808 } |
743 } | 809 } |
744 } | 810 } |
745 | 811 |
746 ### Add the minimum of annotations (refGene + strand + context) | 812 ### Add the minimum of annotations (refGene + strand + context) |
747 sub annotateAV_min | 813 sub annotateAV_min |
748 { | 814 { |
749 my ($inputFile, $output) = @_; | 815 my ($inputFile, $output) = @_; |
750 | 816 |
751 if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; } | 817 if(!-e $path_AVDB) |
818 { | |
819 print STDERR "Error message:\n"; | |
820 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; | |
821 print STDERR "Please install the database for this genome before running Annovar\n"; | |
822 } | |
752 | 823 |
753 # Extract the name of the databases | 824 # Extract the name of the databases |
754 my ($protocol, $operation) = ("refGene", "g"); | 825 my ($protocol, $operation) = ("refGene", "g"); |
755 | 826 |
756 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; | 827 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`; |
785 elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; } | 856 elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; } |
786 elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; } | 857 elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; } |
787 else { $chr = $tab[$chr_value]; } | 858 else { $chr = $tab[$chr_value]; } |
788 | 859 |
789 # Verify if the element exists | 860 # Verify if the element exists |
790 if($chr eq "") { print "Error RecoverStrand: The chromosome value is nor defined for $_\n"; exit; } | 861 if($chr eq "") { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The chromosome value is nor defined for $_\n"; } |
791 if(! exists $tab[$start_value]) { print "Error RecoverStrand: The start value is nor defined for $_\n"; exit; } | 862 if(! exists $tab[$start_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The start value is nor defined for $_\n"; } |
792 if(! exists $tab[$ref_value]) { print "Error RecoverStrand: The reference value is nor defined for $_\n"; exit; } | 863 if(! exists $tab[$ref_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The reference value is nor defined for $_\n"; } |
793 if(! exists $tab[$alt_value]) { print "Error RecoverStrand: The alternate value is nor defined for $_\n"; exit; } | 864 if(! exists $tab[$alt_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The alternate value is nor defined for $_\n"; } |
794 if(! exists $tab[$func_value]) { print "Error RecoverStrand: The functional value is nor defined for $_\n"; exit; } | 865 if(! exists $tab[$func_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The functional value is nor defined for $_\n"; } |
795 if(! exists $tab[$geneSymbol_value]) { print "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; exit; } | 866 if(! exists $tab[$geneSymbol_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; } |
796 | 867 |
797 my $geneSymbol = ""; | 868 my $geneSymbol = ""; |
798 ######## For the splicing annotation we separate the gene symbol from the aa change | 869 ######## For the splicing annotation we separate the gene symbol from the aa change |
799 if($tab[$func_value] eq "splicing") | 870 if($tab[$func_value] eq "splicing") |
800 { | 871 { |
821 { | 892 { |
822 $_ =~ s/[\r\n]+$//; | 893 $_ =~ s/[\r\n]+$//; |
823 my @tab = split("\t", $_); | 894 my @tab = split("\t", $_); |
824 my $strand = ""; | 895 my $strand = ""; |
825 $strand = $tab[$db_strandInfo_value]; | 896 $strand = $tab[$db_strandInfo_value]; |
826 if($strand eq "") { print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; exit; } | 897 if($strand eq "") { print STDERR "Error message:\n"; print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; } |
827 else | 898 else |
828 { | 899 { |
829 # Some genes have several strand orientation, keep the first in the database | 900 # Some genes have several strand orientation, keep the first in the database |
830 if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; } | 901 if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; } |
831 } | 902 } |
872 | 943 |
873 foreach my $kDB (sort keys %h_database) | 944 foreach my $kDB (sort keys %h_database) |
874 { | 945 { |
875 if("$tab[5]:$tab[0]" eq $kDB) | 946 if("$tab[5]:$tab[0]" eq $kDB) |
876 { | 947 { |
877 if($lengthHeader != $lengthLine) { print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; exit; } | 948 if($lengthHeader != $lengthLine) |
949 { | |
950 print STDERR "Error message:\n"; | |
951 print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; | |
952 } | |
878 | 953 |
879 foreach my $line (@{$h_inputFile{$kFile}}) | 954 foreach my $line (@{$h_inputFile{$kFile}}) |
880 { | 955 { |
881 my @tab = split("\t", $line); | 956 my @tab = split("\t", $line); |
882 my $j = 0; | 957 my $j = 0; |
969 my %seqhash = (); #database sequence for each chromosome | 1044 my %seqhash = (); #database sequence for each chromosome |
970 my %name_seq = (); #sequence for each region | 1045 my %name_seq = (); #sequence for each region |
971 my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon | 1046 my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon |
972 my ($count_success, @failure) = (0); | 1047 my ($count_success, @failure) = (0); |
973 | 1048 |
974 for my $curchr (sort keys $refH_allRegion) | 1049 for my $curchr (sort keys %{$refH_allRegion}) |
975 { | 1050 { |
976 my ($seqid, $curseq) = ('', ''); | 1051 my ($seqid, $curseq) = ('', ''); |
977 my $fastafile = ""; | 1052 my $fastafile = ""; |
978 if ($curchr =~ m/^chr/) | 1053 if ($curchr =~ m/^chr/) |
979 { | 1054 { |
1078 $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); | 1153 $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); |
1079 # Print the Annovar header until the column before OtherInfo | 1154 # Print the Annovar header until the column before OtherInfo |
1080 print OUT "$tabHeaderInput[0]"; | 1155 print OUT "$tabHeaderInput[0]"; |
1081 my $j = 0; | 1156 my $j = 0; |
1082 for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } | 1157 for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } |
1083 print OUT "\tcontext"; | 1158 print OUT "\tcontext\ttrinucleotide_context"; |
1084 for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } | 1159 for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } |
1085 print OUT "\n"; | 1160 print OUT "\n"; |
1086 } | 1161 } |
1087 | 1162 |
1088 foreach my $k_hFile (sort keys $refH_InputFile) | 1163 foreach my $k_hFile (sort keys %{$refH_InputFile}) |
1089 { | 1164 { |
1090 foreach my $k_allRegonSeqContext (sort keys $refH_allRegionSeqContext) | 1165 foreach my $k_allRegonSeqContext (sort keys %{$refH_allRegionSeqContext}) |
1091 { | 1166 { |
1092 if($k_hFile eq $k_allRegonSeqContext) | 1167 if($k_hFile eq $k_allRegonSeqContext) |
1093 { | 1168 { |
1094 my $j=0; | 1169 my $j=0; |
1095 | 1170 |
1096 for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) | 1171 for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) |
1097 { | 1172 { |
1098 my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); | 1173 my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); |
1099 | 1174 |
1175 # Write Annovar annotation + strand orientation | |
1100 for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } | 1176 for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } |
1177 # Write the sequence context with the length defined by the user (default is 10) | |
1101 print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; | 1178 print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; |
1179 # Write the trinucleotide context | |
1180 my $contextSequence = $refH_allRegionSeqContext->{$k_allRegonSeqContext}; $contextSequence =~ tr/a-z/A-Z/; | |
1181 my @tempContextSequence = split("", $contextSequence); | |
1182 my $midlle_totalNbBaseContext = (scalar(@tempContextSequence)-1)/2; # For having the middle of the sequence | |
1183 print OUT "\t".$tempContextSequence[$midlle_totalNbBaseContext-1]."x".$tempContextSequence[$midlle_totalNbBaseContext+1]; | |
1184 # Write the original columns | |
1102 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } | 1185 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } |
1103 print OUT "\n"; | 1186 print OUT "\n"; |
1104 } | 1187 } |
1105 last; | 1188 last; |
1106 } | 1189 } |
1138 my $name_of_column_NB = "toto"; | 1221 my $name_of_column_NB = "toto"; |
1139 for(my $i=0; $i<=$#tab_search_header; $i++) | 1222 for(my $i=0; $i<=$#tab_search_header; $i++) |
1140 { | 1223 { |
1141 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } | 1224 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } |
1142 } | 1225 } |
1143 if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } | 1226 if($name_of_column_NB eq "toto") |
1144 else { return $name_of_column_NB; } | 1227 { |
1228 print STDERR "Error message:\n"; | |
1229 print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; | |
1230 } | |
1231 else { return $name_of_column_NB; } | |
1145 } | 1232 } |
1146 | 1233 |
1147 | 1234 |
1148 | 1235 |
1149 | 1236 |
1163 -v, --verbose use verbose output | 1250 -v, --verbose use verbose output |
1164 --refGenome the reference genome to use | 1251 --refGenome the reference genome to use |
1165 --interval <interger> the number of bases for the sequence context | 1252 --interval <interger> the number of bases for the sequence context |
1166 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file | 1253 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file |
1167 -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size | 1254 -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size |
1168 --pathAVDBList the path to the list of AV databases installed | 1255 --pathAVDBList the path to a text file containing the list of Annovar databases installed |
1169 -temp --pathTemporary <string> the path for saving the temporary files | 1256 -temp --pathTemporary <string> the path for saving the temporary files |
1170 --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) | 1257 --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) |
1258 --max_cpu <integer> number of CPUs to be used for the annotation | |
1171 | 1259 |
1172 | 1260 |
1173 Function: automatically run a pipeline on a list of variants and annote them using Annovar | 1261 Function: automatically run a pipeline on a list of variants and annote them using Annovar |
1174 | 1262 |
1175 Example: # Annotation only | 1263 Example: # Annotation only |
1176 mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input | 1264 mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input |
1177 | 1265 |
1178 | 1266 |
1179 Version: 06-2016 (June 2016) | 1267 Version: 03-2017 (March 2017) |
1180 | 1268 |
1181 | 1269 |
1182 =head1 OPTIONS | 1270 =head1 OPTIONS |
1183 | 1271 |
1184 =over 8 | 1272 =over 8 |
1193 | 1281 |
1194 =item B<--verbose> | 1282 =item B<--verbose> |
1195 | 1283 |
1196 use verbose output. | 1284 use verbose output. |
1197 | 1285 |
1198 | |
1199 =item B<--refGenome> | 1286 =item B<--refGenome> |
1200 | 1287 |
1201 the reference genome to use, could be hg19 or mm9. | 1288 the reference genome to use, could be hg19 or mm9. |
1202 | 1289 |
1203 =item B<--interval> | 1290 =item B<--interval> |
1212 | 1299 |
1213 the path to the directory containing the Annovar databases and the files with the chromosome size. | 1300 the path to the directory containing the Annovar databases and the files with the chromosome size. |
1214 | 1301 |
1215 =item B<--pathAVDBList> | 1302 =item B<--pathAVDBList> |
1216 | 1303 |
1217 the path to a texte file containing the list of the Annovar databases installed. | 1304 the path to a text file containing the list of Annovar databases installed. |
1218 | 1305 |
1219 =item B<--pathTemporary> | 1306 =item B<--pathTemporary> |
1220 | 1307 |
1221 the path for saving temporary files generated by the script. | 1308 the path for saving temporary files generated by the script. |
1222 If any is specify a temporary folder is created in the same directory where the script is running. | 1309 If any is specify a temporary folder is created in the same directory where the script is running. |
1224 | 1311 |
1225 =item B<--fullAnnotation> | 1312 =item B<--fullAnnotation> |
1226 | 1313 |
1227 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines) | 1314 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines) |
1228 | 1315 |
1316 =item B<--max_cpu> | |
1317 | |
1318 Specify the number of CPUs to be used. This number is used for spliting the file in n part and running the annotations in each part in parallel. | |
1319 | |
1320 | |
1229 =head1 DESCRIPTION | 1321 =head1 DESCRIPTION |
1230 | 1322 |
1231 MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS. | 1323 MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS. |
1232 Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. | 1324 Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. |
1233 A text tab delimited file is produced. | 1325 A text tab delimited file is produced. |