7
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 #-----------------------------------#
|
|
4 # Author: Maude #
|
|
5 # Script: mutspecAnnot.pl #
|
|
6 # Last update: 02/03/17 #
|
|
7 #-----------------------------------#
|
|
8
|
|
9 use strict;
|
|
10 use warnings;
|
|
11 use Getopt::Long;
|
|
12 use Pod::Usage;
|
|
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
|
|
14 use File::Path;
|
|
15 use Parallel::ForkManager;
|
|
16
|
|
17
|
|
18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
|
|
19 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files
|
|
20 our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context.
|
|
21 our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines)
|
|
22
|
|
23
|
|
24 #########################################
|
|
25 ### SPECIFY THE NUMBER OF CPU ###
|
|
26 #########################################
|
|
27 our ($max_cpu) = 8; # Max number of CPU to use for the annotation
|
|
28
|
|
29
|
|
30
|
|
31
|
|
32 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp, 'max_cpu=i' => \$max_cpu) or pod2usage(2);
|
|
33
|
|
34 our ($input) = @ARGV;
|
|
35
|
|
36 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
|
|
37 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
|
|
38 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
|
|
39 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
|
|
40
|
|
41
|
|
42
|
|
43 ######################################################################################################################################################
|
|
44 # GLOBAL VARIABLES #
|
|
45 ######################################################################################################################################################
|
|
46
|
|
47 # Recover the current path
|
|
48 our $pwd = `pwd`;
|
|
49 chomp($pwd);
|
|
50 # Recover the filename and the input directory
|
|
51 our ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/);
|
|
52 # Output directories
|
|
53 our ($folderMutAnalysis, $folderAnnovar) = ("", "");
|
|
54 # File with the list of Annovar databases to use
|
|
55 our $listAVDB = "";
|
|
56 # Initialisation of chromosome, position, ref and alt values
|
|
57 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a");
|
|
58
|
|
59
|
|
60
|
|
61 ######################################################################################################################################################
|
|
62 # MAIN #
|
|
63 ######################################################################################################################################################
|
|
64 ## Check the presence of the flags and create the output and temp directories
|
|
65 CheckFlags();
|
|
66
|
|
67 ## Format the file correctly:
|
|
68 ## 1) Check the length of the filename (must be <= 31 characters)
|
|
69 ## 2) Recover the input format. If MuTect output consider only "KEEP" variants
|
|
70 ## 3) Recover the column number for chr, start, ref and alt
|
|
71 FormatingInputFile();
|
|
72
|
|
73 # Annotate the file with Annovar, add the strand orientation and the sequence context
|
|
74 FullAnnotation();
|
|
75
|
|
76 ######################################################################################################################################################
|
|
77 # FUNCTIONS #
|
|
78 ######################################################################################################################################################
|
|
79
|
|
80 ## Check the presence of the flags and create the output and temp directories
|
|
81 sub CheckFlags
|
|
82 {
|
|
83 # Check the reference genome
|
|
84 if($refGenome eq "empty")
|
|
85 {
|
|
86 print STDERR "Missing flag !\n";
|
|
87 print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n";
|
|
88 exit;
|
|
89 }
|
|
90 if($intervalEnd eq "empty")
|
|
91 {
|
|
92 print STDERR "Missing flag !\n";
|
|
93 print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n";
|
|
94 exit;
|
|
95 }
|
|
96 # If no output is specified write the result as the same place as the input file
|
|
97 if($output eq "empty")
|
|
98 {
|
|
99 my $directory = dirname( $input );
|
|
100
|
|
101 $folderMutAnalysis = "$directory/Mutational_Analysis";
|
|
102 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
|
|
103 }
|
|
104 else
|
|
105 {
|
|
106 if(!-e $output) { mkdir($output) or die "$!: $output\n"; }
|
|
107
|
|
108 $folderMutAnalysis = "$output/Mutational_Analysis";
|
|
109 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
|
|
110 }
|
|
111 # Create the output folder for Annovar
|
|
112 $folderAnnovar = "$folderMutAnalysis/Annovar";
|
|
113 if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; }
|
|
114
|
|
115 # Verify the access to Annovar databases
|
|
116 if($path_AVDB eq "empty")
|
|
117 {
|
|
118 print STDERR "Missing flag !\n";
|
|
119 print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n";
|
|
120 exit;
|
|
121 }
|
|
122 elsif(!-e $path_AVDB)
|
|
123 {
|
|
124 print STDERR "Error message:\n";
|
|
125 print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n";
|
|
126 }
|
|
127
|
|
128 # Check the file list AV DB
|
|
129 if($pathAVDBList eq "empty")
|
|
130 {
|
|
131 print STDERR "Missing flag !\n";
|
|
132 print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n";
|
|
133 exit;
|
|
134 }
|
|
135 else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" }
|
|
136
|
|
137 # If no temp folder is specified write the result in the current path
|
|
138 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; }
|
|
139 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; }
|
|
140
|
|
141 # Verify listAVDB is not empty
|
|
142 if($listAVDB eq "")
|
|
143 {
|
|
144 print STDERR "Path to the text file containing the list of Annovar databases installed is not specified !!!\n";
|
|
145 exit;
|
|
146 }
|
|
147 }
|
|
148
|
|
149 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions
|
|
150 sub FormatingInputFile
|
|
151 {
|
|
152 # The input is a folder
|
|
153 if(-d $input)
|
|
154 {
|
|
155 foreach my $file (`ls $input`)
|
|
156 {
|
|
157 my $headerOriginalFile = "";
|
|
158 chomp($file);
|
|
159
|
|
160 CheckLengthFilename("$input/$file");
|
|
161
|
|
162 #################################################
|
|
163 ### Recover the input format ###
|
|
164 #################################################
|
|
165 RecoverInputFormat("$input/$file", \$headerOriginalFile);
|
|
166 }
|
|
167 }
|
|
168 # The input is one file
|
|
169 else
|
|
170 {
|
|
171 my $headerOriginalFile = "";
|
|
172
|
|
173 CheckLengthFilename($input);
|
|
174
|
|
175 #################################################
|
|
176 ### Recover the input format ###
|
|
177 #################################################
|
|
178 RecoverInputFormat($input, \$headerOriginalFile);
|
|
179 }
|
|
180 }
|
|
181
|
|
182 # The name for the Excel sheet can't be longer than 31 characters
|
|
183 sub CheckLengthFilename
|
|
184 {
|
|
185 my ($inputFile) = @_;
|
|
186
|
|
187 my ($filenameInputFile, $directoriesInputFile, $suffixInputFile) = fileparse($inputFile, qr/\.[^.]*/);
|
|
188
|
|
189 if(length($filenameInputFile) > 32)
|
|
190 {
|
|
191 print STDERR "Error message:\n";
|
|
192 print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n";
|
|
193 }
|
|
194 }
|
|
195
|
|
196 # Recover the input format. If MuTect output consider only "KEEP" variants
|
|
197 sub RecoverInputFormat
|
|
198 {
|
|
199 my ($file, $refS_headerOriginalFile) = @_;
|
|
200
|
|
201 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
|
|
202
|
|
203 my $inputFormat = "";
|
|
204
|
|
205 open(F1, $file) or die "$!: $file\n";
|
|
206 my $header = <F1>;
|
|
207 close F1;
|
|
208
|
|
209 ### VCF and MuTect files have in their first line the type of the file
|
|
210 if($header =~ /fileformat=VCF/i) { $inputFormat = "vcf"; }
|
|
211 elsif($header =~ /mutect/i) { $inputFormat = "mutect"; }
|
|
212 else { $inputFormat = "unknown"; }
|
|
213
|
|
214
|
|
215 ### VCF files
|
|
216 if($inputFormat eq "vcf")
|
|
217 {
|
|
218 ### MuTect2 output VCFs
|
|
219 my $testVC = `grep MuTect2 $file`;
|
|
220 if($testVC =~ /MuTect2/)
|
|
221 {
|
|
222 # Keep only the variants passing MuTect2 filters
|
|
223 `grep PASS $file > $folder_temp/$filename-PASS.txt`;
|
|
224
|
|
225 # Recover the header
|
|
226 $$refS_headerOriginalFile = `grep '#CHROM' $file`;
|
|
227
|
|
228 # Add the header
|
|
229 # Sed command doesn't work... sed 's/^/some text\n/' file > res
|
|
230 open(OUT, ">", "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n";
|
|
231 print OUT $$refS_headerOriginalFile;
|
|
232 open(F1, "$folder_temp/$filename-PASS.txt") or die "$!: $folder_temp/$filename-PASS.txt\n";
|
|
233 while(<F1>) { print OUT $_; }
|
|
234 close F1; close OUT;
|
|
235
|
|
236 `rm $folder_temp/$filename-PASS.txt`;
|
|
237
|
|
238 # Check if there if no empty column
|
|
239 CheckEmptyColumn("$folder_temp/$filename-KEEP.txt");
|
|
240 `rm $folder_temp/$filename-KEEP.txt`;
|
|
241
|
|
242 # Set the col number for the chr,start,ref and alt
|
|
243 ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4);
|
|
244 }
|
|
245 else
|
|
246 {
|
|
247 open(F1, $file) or die "$!: $file\n";
|
|
248 open(OUT, ">", "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n";
|
|
249 while (<F1>)
|
|
250 {
|
|
251 $_ =~ s/[\r\n]+$//;
|
|
252 my @tab = split("\t", $_);
|
|
253 # Print the VCF header
|
|
254 if($tab[0] eq "#CHROM")
|
|
255 {
|
|
256 $tab[0] =~ /#(.+)/;
|
|
257 print OUT "$1";
|
|
258 for(my $i=1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
|
|
259 print OUT "\n";
|
|
260 }
|
|
261 elsif($tab[0] !~ /##/)
|
|
262 {
|
|
263 # Don't consider chromosome random, GL and MT
|
|
264 if( ($tab[0] =~ /random/) || ($tab[0] =~ /GL/i) ) { next; }
|
|
265 print OUT "$_\n";
|
|
266 }
|
|
267 }
|
|
268 close F1; close OUT;
|
|
269
|
|
270 ## Recover the header
|
|
271 open(F1, "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n";
|
|
272 $$refS_headerOriginalFile = <F1>;
|
|
273 close F1;
|
|
274
|
|
275 # Check if there if no empty column
|
|
276 CheckEmptyColumn("$folder_temp/$filename.txt");
|
|
277 `rm $folder_temp/$filename.txt`;
|
|
278
|
|
279
|
|
280 # Set the col number for the chr,start,ref and alt
|
|
281 ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4);
|
|
282 }
|
|
283 }
|
|
284 ### MuTect files
|
|
285 elsif($inputFormat eq "mutect")
|
|
286 {
|
|
287 `sed '1d' $file > $folder_temp/$filename-HeaderOK`;
|
|
288 # Keep only the SNVs of good quality
|
|
289 `grep -v REJECT $folder_temp/$filename-HeaderOK > $folder_temp/$filename-KEEP.txt`;
|
|
290 `rm $folder_temp/$filename-HeaderOK`;
|
|
291
|
|
292 # Recover the header
|
|
293 open(F1, "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n";
|
|
294 $$refS_headerOriginalFile = <F1>;
|
|
295 close F1;
|
|
296
|
|
297 # Check if there if no empty column
|
|
298 CheckEmptyColumn("$folder_temp/$filename-KEEP.txt");
|
|
299 `rm $folder_temp/$filename-KEEP.txt`;
|
|
300
|
|
301 # Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles.
|
|
302 # Use the dictionary for recover the names and the position of the columns
|
|
303 RecoverColNameAuto("$folder_temp/$filename-KEEPColumnCorrect.txt", $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue);
|
|
304 }
|
|
305 ### Unknown type
|
|
306 else
|
|
307 {
|
|
308 ## Recover the header
|
|
309 open(F1, $file) or die "$!: $file\n";
|
|
310 $$refS_headerOriginalFile = <F1>;
|
|
311 close F1;
|
|
312
|
|
313 # Check if there if no empty column
|
|
314 CheckEmptyColumn($file);
|
|
315
|
|
316 ## Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles.
|
|
317 # Use the dictionary for recover the names and the position of the columns
|
|
318 RecoverColNameAuto($file, $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue);
|
|
319 }
|
|
320 }
|
|
321
|
|
322 # Some files can have empty column with no information
|
|
323 sub CheckEmptyColumn
|
|
324 {
|
|
325 my ($inputFile) = @_;
|
|
326
|
|
327 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);
|
|
328
|
|
329 if($filename =~ /(.+)-KEEP/) { $filename = $1; }
|
|
330
|
|
331 open(OUT, ">", "$folder_temp/$filename-ColumnCorrect.txt") or die "$!: $folder_temp/$filename-ColumnCorrect.txt\n";
|
|
332
|
|
333 open(F1, $inputFile) or die "$!: $inputFile\n";
|
|
334 my $header = <F1>; $header =~ s/[\r\n]+$//; my @tabHeader = split("\t", $header);
|
|
335 print OUT $header, "\n";
|
|
336 while(<F1>)
|
|
337 {
|
|
338 $_ =~ s/[\r\n]+$//;
|
|
339 my @tab = split("\t", $_);
|
|
340
|
|
341 if(scalar(@tab) != scalar(@tabHeader))
|
|
342 {
|
|
343 print OUT $tab[0];
|
|
344 for(my $i=1; $i<=$#tabHeader; $i++)
|
|
345 {
|
|
346 if(defined($tab[$i])) { print OUT "\t$tab[$i]"; }
|
|
347 else { print OUT "\tNA"; }
|
|
348 }
|
|
349 print OUT "\n";
|
|
350 }
|
|
351 else { print OUT "$_\n"; }
|
|
352 }
|
|
353 close F1; close OUT;
|
|
354 }
|
|
355
|
|
356 # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles.
|
|
357 sub RecoverColNameAuto
|
|
358 {
|
|
359 our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_;
|
|
360
|
|
361 $header =~ s/[\r\n]+$//;
|
|
362
|
|
363 ## Name of the columns
|
|
364 my @mutect = qw(contig position ref_allele alt_allele);
|
|
365 my @vcf = qw(CHROM POS REF ALT);
|
|
366 my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic);
|
|
367 my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele);
|
|
368 my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2);
|
|
369 my @ionTorrent = qw(chr Position Ref Alt);
|
|
370 my @proton = qw(Chrom Position Ref Variant);
|
|
371 my @varScan2 = qw(Chrom Position Ref VarAllele);
|
|
372 my @varScan2_somatic = qw(chrom position ref var);
|
|
373 my @annovar = qw(Chr Start Ref Obs);
|
|
374 my @custom = qw(Chromosome Start Wild_Type Mutant);
|
|
375
|
|
376 my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@varScan2_somatic, \@annovar, \@custom);
|
|
377 my $timer = 0; # For controlling if the names are present on the dictionnary or not
|
|
378
|
|
379 foreach my $refTab (@allTab)
|
|
380 {
|
|
381 my @tab = @$refTab;
|
|
382
|
|
383 SearchCol(\@tab);
|
|
384
|
|
385 # The columns names were find
|
|
386 if( ($$ref_chrValue ne "c") && ($$ref_positionValue ne "s") && ($$ref_refValue ne "r") && ($$ref_altValue ne "a") ) { last; }
|
|
387 # The names of the columns are not present in the dictionnary
|
|
388 else { $timer++; }
|
|
389 }
|
|
390
|
|
391 if($timer == scalar(@allTab))
|
|
392 {
|
|
393 print STDERR "Error message:\n";
|
|
394 print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n";
|
|
395 print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n";
|
|
396 exit;
|
|
397 }
|
|
398
|
|
399 # Extract the number of the column that contain the information
|
|
400 sub SearchCol
|
|
401 {
|
|
402 my ($refTab) = @_;
|
|
403
|
|
404 my @tabNames = @$refTab;
|
|
405 my @tabHeader = split("\t", $header);
|
|
406
|
|
407 # For VCF
|
|
408 if($tabHeader[0] eq "#CHROM") { ($$ref_chrValue, $$ref_positionValue, $$ref_refValue, $$ref_altValue) = (0, 1, 3, 4); }
|
|
409 # For tabular files
|
|
410 else
|
|
411 {
|
|
412 for(my $i=0; $i<=$#tabNames; $i++)
|
|
413 {
|
|
414 for(my $j=0; $j<=$#tabHeader; $j++)
|
|
415 {
|
|
416 if($tabHeader[$j] eq $tabNames[$i])
|
|
417 {
|
|
418 if($i == 0) { $$ref_chrValue = $j; }
|
|
419 if($i == 1) { $$ref_positionValue = $j; }
|
|
420 if($i == 2) { $$ref_refValue = $j; }
|
|
421 if($i == 3) { $$ref_altValue = $j; }
|
|
422 last; # Once find pass to the next name
|
|
423 }
|
|
424 }
|
|
425 }
|
|
426 }
|
|
427 }
|
|
428 }
|
|
429
|
|
430 # Annotate the file with Annovar, add the strand orientation and the sequence context
|
|
431 sub FullAnnotation
|
|
432 {
|
|
433 print "-----------------------------------------------------------------\n";
|
|
434 print "---------------------------Annotation----------------------------\n";
|
|
435 print "-----------------------------------------------------------------\n";
|
|
436
|
|
437
|
|
438 # If the input is a folder
|
|
439 if(-d $input)
|
|
440 {
|
|
441 foreach my $file (`ls $folder_temp/*.txt`)
|
|
442 {
|
|
443 chomp($file);
|
|
444
|
|
445 # For recover the name of the file without extension, the directory where the file is and the extension of the file
|
|
446 my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/);
|
|
447 my $filenameOK = "";
|
|
448 # For removing the ColumnCorrect for txt files
|
|
449 if($filename =~ /(.+)-ColumnCorrect/)
|
|
450 {
|
|
451 if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; }
|
|
452 else { $filenameOK = $1; }
|
|
453 }
|
|
454 else { print STDERR "Error message:\n"; print STDERR "Case not considered for $filename!!!\n"; }
|
|
455
|
|
456
|
|
457 #################################################
|
|
458 ### Cut the files in n part ###
|
|
459 #################################################
|
|
460 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts
|
|
461 my $cpu = 0;
|
|
462 # Keep the original header
|
|
463 my $headerOriginalFile = "";
|
|
464 # Save the numer of lines
|
|
465 my $nbLine = 0;
|
|
466 splitInputFile($file, \$cpu, \$headerOriginalFile, $filenameOK, \$nbLine);
|
|
467
|
|
468
|
|
469 #################################################
|
|
470 ### Annotate the n part ###
|
|
471 #################################################
|
|
472 annotateFile($cpu, $filenameOK, $headerOriginalFile);
|
|
473
|
|
474
|
|
475 #################################################
|
|
476 ### Paste the file together ###
|
|
477 #################################################
|
|
478 createOutput($filenameOK, $headerOriginalFile, $nbLine);
|
|
479 }
|
|
480 }
|
|
481 # The input file is one file
|
|
482 else
|
|
483 {
|
|
484 #################################################
|
|
485 ### Cut the files in n part ###
|
|
486 #################################################
|
|
487 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts
|
|
488 my $cpu = 0;
|
|
489 # Keep the original header
|
|
490 my $headerOriginalFile = "";
|
|
491 # Save the numer of lines
|
|
492 my $nbLine = 0;
|
|
493 splitInputFile("$folder_temp/$filename-ColumnCorrect.txt", \$cpu, \$headerOriginalFile, $filename, \$nbLine);
|
|
494
|
|
495
|
|
496 #################################################
|
|
497 ### Annotate the n part ###
|
|
498 #################################################
|
|
499 annotateFile($cpu, $filename, $headerOriginalFile);
|
|
500
|
|
501
|
|
502 #################################################
|
|
503 ### Paste the file together ###
|
|
504 #################################################
|
|
505 createOutput($filename, $headerOriginalFile, $nbLine);
|
|
506 }
|
|
507 # Remove the temporary directory
|
|
508 rmtree($folder_temp);
|
|
509 }
|
|
510
|
|
511
|
|
512
|
|
513 sub splitInputFile
|
|
514 {
|
|
515 my ($inputFile, $ref_cpu, $ref_header, $filename, $ref_nbLine) = @_;
|
|
516
|
|
517 my $nbVariants = `wc -l $inputFile`;
|
|
518 $nbVariants =~ /(\d+).+/;
|
|
519 $$ref_nbLine = $1;
|
|
520
|
|
521 if($$ref_nbLine-1 <= 5000) { $$ref_cpu = 1; }
|
|
522 elsif( ($$ref_nbLine-1 > 5000) && ($$ref_nbLine-1 < 25000) ) { $$ref_cpu = 2; }
|
|
523 elsif( ($$ref_nbLine-1 >= 25000) && ($$ref_nbLine-1 < 100000) ) { $$ref_cpu = 8; }
|
|
524 else { $$ref_cpu = $max_cpu; }
|
|
525
|
|
526 # If the number predefined can't be used on the machine use the maximum number specify by the administrator
|
|
527 if($$ref_cpu > $max_cpu) { $$ref_cpu = $max_cpu }
|
|
528
|
|
529
|
|
530 ## Recover the header
|
|
531 open(F1, $inputFile) or die "$!: $inputFile\n";
|
|
532 $$ref_header= <F1>;
|
|
533 close F1;
|
|
534
|
|
535 ## Remove the first line of the file
|
|
536 my $fileNoHeader = "$folder_temp/${filename}-NoHeader";
|
|
537 `sed 1d $inputFile > $fileNoHeader`;
|
|
538
|
|
539 if(!-e "$folder_temp/$filename") { mkdir("$folder_temp/$filename") or die "Can't create the directory $folder_temp/$filename\n"; }
|
|
540 my $lines_per_temp = int(1+($1 / $$ref_cpu)); # +1 in case of the div == 0
|
|
541 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filename/$filename-`;
|
|
542
|
|
543 if($$ref_header eq "") { print STDERR "Error message:\n"; print STDERR "No header for the file $inputFile!!!\nPlease check the format of your file\n"; }
|
|
544 }
|
|
545
|
|
546 sub annotateFile
|
|
547 {
|
|
548 my ($cpu, $filename, $headerOriginalFile) = @_;
|
|
549
|
|
550 my $pm = Parallel::ForkManager->new($cpu);
|
|
551
|
|
552 foreach my $tempFile (`ls $folder_temp/$filename/$filename-*`)
|
|
553 {
|
|
554 chomp($tempFile);
|
|
555
|
|
556 # Forks and returns the pid for the child:
|
|
557 my $pid = $pm->start and next;
|
|
558
|
|
559 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo
|
|
560 my ($filenameTempFile, $directoriesTempFile, $suffixTempFile) = fileparse($tempFile, qr/\-[^.]*/);
|
|
561 my $outFilenameTemp = $filenameTempFile.$suffixTempFile;
|
|
562 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");
|
|
563
|
|
564 # Annotate the file with Annovar
|
|
565 my $tempFileName_AVOutput = $filename.$suffixTempFile.".".${refGenome}."_multianno.txt";
|
|
566 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
|
|
567 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
|
|
568
|
|
569 # Check if the annotations worked
|
|
570 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
|
|
571 while(<F1>)
|
|
572 {
|
|
573 if($_ =~ /ERROR/i)
|
|
574 {
|
|
575 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
|
|
576 print STDERR $_;
|
|
577 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
|
|
578 }
|
|
579 }
|
|
580 close F1;
|
|
581
|
|
582 # Recover the strand orientation
|
|
583 my $length_AVheader = 0;
|
|
584 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);
|
|
585
|
|
586 # Recover the sequence context
|
|
587 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filename/$outFilenameTemp".".".${refGenome}."_multianno.txt");
|
|
588
|
|
589 $pm->finish; # Terminates the child process
|
|
590 }
|
|
591 # Wait all the child process
|
|
592 $pm->wait_all_children;
|
|
593 }
|
|
594
|
|
595 sub createOutput
|
|
596 {
|
|
597 my ($filename, $headerOriginalFile, $nbLine) = @_;
|
|
598
|
|
599 ## For MuTect and MuTect2 calling only variants passing MuTect filters are kept and sometines there is no variant passing these filters making error in Galaxy when using "collection".
|
|
600 if($nbLine == 1)
|
|
601 {
|
|
602 print STDOUT "\nThe sample $filename didn't pass MuTect filters\n";
|
|
603
|
|
604 ### Print Annovar minimal header + the original header of the input file
|
|
605 my $outputFile = "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt";
|
|
606 open(OUT, ">", $outputFile) or die "$!: $outputFile\n";
|
|
607
|
|
608 if($fullAVDB eq "no")
|
|
609 {
|
|
610 print OUT "Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tStrand\tcontext";
|
|
611 print OUT "\t".$headerOriginalFile;
|
|
612 }
|
|
613 ### Print complete Annovar header (using the database name present in the file listAVDB) + the original header of the input file
|
|
614 else
|
|
615 {
|
|
616 print OUT "Chr\tStart\tEnd\tRef\tAlt";
|
|
617 open(F1, $listAVDB) or die "$!: $listAVDB\n";
|
|
618 while(<F1>)
|
|
619 {
|
|
620 if($_ =~ /^#/) { next; }
|
|
621
|
|
622 my @tab = split("\t", $_);
|
|
623 $tab[0] =~ /$refGenome\_(.+)\.txt/;
|
|
624 my $dbName = $1;
|
|
625
|
|
626 if($dbName =~ /refGene|knownGene|ensGene/)
|
|
627 {
|
|
628 print OUT "\t"."Func.$dbName\tGene.$dbName\tGeneDetail.$dbName\tExonicFunc.$dbName\tAAChange.$dbName";
|
|
629 }
|
|
630 else
|
|
631 {
|
|
632 print OUT "\t".$dbName;
|
|
633 }
|
|
634 }
|
|
635 print OUT "\tStrand\tcontext\t".$headerOriginalFile;
|
|
636
|
|
637 close F1;
|
|
638 }
|
|
639 close OUT;
|
|
640 }
|
|
641 else
|
|
642 {
|
|
643 CombinedTempFile("$folder_temp/$filename", "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt");
|
|
644 }
|
|
645 }
|
|
646
|
|
647 sub Convert2AV
|
|
648 {
|
|
649 my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_;
|
|
650
|
|
651 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);
|
|
652
|
|
653 open(F1, $inputFile) or die "$!: $inputFile\n";
|
|
654
|
|
655 open(OUT, ">", $output) or die "$!: $output\n";
|
|
656 while(<F1>)
|
|
657 {
|
|
658 $_ =~ s/[\r\n]+$//;
|
|
659 my @tab = split("\t", $_);
|
|
660 my $chr = "";
|
|
661
|
|
662 # Replace chr23 or chr24 by X or Y
|
|
663 if($tab[$chr_value] =~ /23/) { $chr = "chrX"; }
|
|
664 elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; }
|
|
665 elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; }
|
|
666 else { $chr = "chr".$tab[$chr_value]; }
|
|
667
|
|
668
|
|
669 ### Consider only "normal" chromosomes for the annotation
|
|
670 if( ($chr !~ /chr\d{1,2}$|chrX|chrY/) ) { next; }
|
|
671
|
|
672
|
|
673 ### Don't consider variants with two or more alt bases
|
|
674 if($tab[$alt_value] =~ /\,/) { next; }
|
|
675
|
|
676 ### Reformat the Indels for Annovar
|
|
677 # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10)
|
|
678 # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10)
|
|
679 if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) )
|
|
680 {
|
|
681 ### First check if the indels in the file are not already correctly formated
|
|
682 if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") )
|
|
683 {
|
|
684 # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels)
|
|
685 # Insertion: start = start & end = start
|
|
686 if($tab[$ref_value] =~ /\-/)
|
|
687 {
|
|
688 print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]";
|
|
689 }
|
|
690 ## Deletion: start = start & end = start + length(del) -1
|
|
691 else
|
|
692 {
|
|
693 my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1);
|
|
694 print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]";
|
|
695 }
|
|
696 }
|
|
697 ### Indels not correctly formated for Annovar
|
|
698 else
|
|
699 {
|
|
700 my @tabRef = split("", $tab[$ref_value]);
|
|
701 my @tabAlt = split("", $tab[$alt_value]);
|
|
702
|
|
703 # Remove the first base
|
|
704 my $ref2 = join("", @tabRef[1 .. $#tabRef]);
|
|
705 my $alt2 = join("", @tabAlt[1 .. $#tabAlt]);
|
|
706
|
|
707 if(length($alt2) == 0)
|
|
708 {
|
|
709 my $altOK = "-";
|
|
710 my $startOK = $tab[$start_value] + 1;
|
|
711 my $stopOK = $startOK + length($ref2) - 1;
|
|
712 print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK;
|
|
713 }
|
|
714
|
|
715 if(length($ref2) == 0)
|
|
716 {
|
|
717 my $refOK = "-";
|
|
718 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2;
|
|
719 }
|
|
720 }
|
|
721 }
|
|
722 ### SBS
|
|
723 else
|
|
724 {
|
|
725 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value];
|
|
726 }
|
|
727
|
|
728 ## Print the original file at the end
|
|
729 foreach (@tab) { print OUT "\t$_"; }
|
|
730 print OUT "\n";
|
|
731 }
|
|
732 close F1; close OUT;
|
|
733 }
|
|
734
|
|
735 sub AnnotateAV
|
|
736 {
|
|
737 my ($inputFile, $output) = @_;
|
|
738
|
|
739 if(!-e $path_AVDB)
|
|
740 {
|
|
741 print STDERR "Error message:\n";
|
|
742 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n";
|
|
743 print STDERR "Please install the database for this genome before running Annovar\n";
|
|
744 }
|
|
745
|
|
746 # Extract the name of the databases
|
|
747 my $protocol = ""; my $operation = "";
|
|
748 ExtractAVDBName($listAVDB, \$protocol, \$operation);
|
|
749
|
|
750 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`;
|
|
751
|
|
752 sub ExtractAVDBName
|
|
753 {
|
|
754 my ($listAVDB, $refS_protocol, $refS_operation) = @_;
|
|
755
|
|
756 open(F1, $listAVDB) or die "$!: $listAVDB\n";
|
|
757 while(<F1>)
|
|
758 {
|
|
759 if ($_ =~ /^#/) { next; }
|
|
760
|
|
761 $_ =~ s/[\r\n]+$//;
|
|
762 my @tab = split("\t", $_);
|
|
763
|
|
764 # db name like refGenome_dbName.txt
|
|
765 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /ljb26/) )
|
|
766 {
|
|
767 $$refS_protocol .= $1.","; $$refS_operation .= $tab[1].",";
|
|
768 }
|
|
769 # 1000 genome
|
|
770 if($tab[0] =~ /sites/)
|
|
771 {
|
|
772 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
|
|
773 my ($dbName, $year, $month) = ($1, $2, $3);
|
|
774 $dbName =~ tr/A-Z/a-z/;
|
|
775
|
|
776 # convert the month number into the month name
|
|
777 ConvertMonth(\$month);
|
|
778
|
|
779 my $AVdbName_final = "1000g".$year.$month."_".$dbName;
|
|
780 $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].",";
|
|
781 }
|
|
782 # ESP
|
|
783 if( ($tab[0] =~ /esp/) || ($tab[0] =~ /ljb26/) )
|
|
784 {
|
|
785 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
|
|
786 my $AVdbName_final = $1."_".$2;
|
|
787 $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].",";
|
|
788 }
|
|
789 }
|
|
790 close F1;
|
|
791
|
|
792 sub ConvertMonth
|
|
793 {
|
|
794 my ($refS_month) = @_;
|
|
795
|
|
796 if($$refS_month == 1) { $$refS_month = "janv"; }
|
|
797 elsif($$refS_month == 2) { $$refS_month = "feb"; }
|
|
798 elsif($$refS_month == 3) { $$refS_month = "mar"; }
|
|
799 elsif($$refS_month == 4) { $$refS_month = "apr"; }
|
|
800 elsif($$refS_month == 5) { $$refS_month = "may"; }
|
|
801 elsif($$refS_month == 6) { $$refS_month = "jun"; }
|
|
802 elsif($$refS_month == 7) { $$refS_month = "jul"; }
|
|
803 elsif($$refS_month == 8) { $$refS_month = "aug"; }
|
|
804 elsif($$refS_month == 9) { $$refS_month = "sept"; }
|
|
805 elsif($$refS_month == 10) { $$refS_month = "oct"; }
|
|
806 elsif($$refS_month == 11) { $$refS_month = "nov"; }
|
|
807 elsif($$refS_month == 12) { $$refS_month = "dec"; }
|
|
808 }
|
|
809 }
|
|
810 }
|
|
811
|
|
812 ### Add the minimum of annotations (refGene + strand + context)
|
|
813 sub annotateAV_min
|
|
814 {
|
|
815 my ($inputFile, $output) = @_;
|
|
816
|
|
817 if(!-e $path_AVDB)
|
|
818 {
|
|
819 print STDERR "Error message:\n";
|
|
820 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n";
|
|
821 print STDERR "Please install the database for this genome before running Annovar\n";
|
|
822 }
|
|
823
|
|
824 # Extract the name of the databases
|
|
825 my ($protocol, $operation) = ("refGene", "g");
|
|
826
|
|
827 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`;
|
|
828 }
|
|
829
|
|
830 sub RecoverStrand
|
|
831 {
|
|
832 my ($input, $headerOriginalFile, $pathDB, $refGenome, $output, $refS_lengthAVheader) = @_;
|
|
833
|
|
834 my ($chr_value, $start_value, $ref_value, $alt_value, $func_value, $geneSymbol_value) = ("", "", "", "", "", "", "", "");
|
|
835
|
|
836 $chr_value = recoverNumCol($input, "Chr");
|
|
837 $start_value = recoverNumCol($input, "Start");
|
|
838 $ref_value = recoverNumCol($input, "Ref");
|
|
839 $alt_value = recoverNumCol($input, "Alt");
|
|
840 $func_value = recoverNumCol($input, "Func.refGene");
|
|
841 $geneSymbol_value = recoverNumCol($input, "Gene.refGene");
|
|
842
|
|
843 #################### Convert the input file into a hash table
|
|
844 my %h_inputFile = ();
|
|
845 open(F1, $input) or die "$!: $input\n";
|
|
846 my $annovar_header = <F1>;
|
|
847
|
|
848 while(<F1>)
|
|
849 {
|
|
850 $_ =~ s/[\r\n]+$//;
|
|
851 my @tab = split("\t", $_);
|
|
852
|
|
853 # In COSMIC the chromosome X and Y are annotated 23 and 24
|
|
854 my $chr = "";
|
|
855 if($tab[$chr_value] eq "chr23") { $chr = "chrX"; }
|
|
856 elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; }
|
|
857 elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; }
|
|
858 else { $chr = $tab[$chr_value]; }
|
|
859
|
|
860 # Verify if the element exists
|
|
861 if($chr eq "") { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The chromosome value is nor defined for $_\n"; }
|
|
862 if(! exists $tab[$start_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The start value is nor defined for $_\n"; }
|
|
863 if(! exists $tab[$ref_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The reference value is nor defined for $_\n"; }
|
|
864 if(! exists $tab[$alt_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The alternate value is nor defined for $_\n"; }
|
|
865 if(! exists $tab[$func_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The functional value is nor defined for $_\n"; }
|
|
866 if(! exists $tab[$geneSymbol_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; }
|
|
867
|
|
868 my $geneSymbol = "";
|
|
869 ######## For the splicing annotation we separate the gene symbol from the aa change
|
|
870 if($tab[$func_value] eq "splicing")
|
|
871 {
|
|
872 if($tab[$geneSymbol_value] =~ /(.+)\((.+)\)/) { $geneSymbol = $1; }
|
|
873 else { $geneSymbol = $tab[$geneSymbol_value]; }
|
|
874 }
|
|
875 else { $geneSymbol = $tab[$geneSymbol_value]; }
|
|
876
|
|
877 push(@{$h_inputFile{"$chr:$tab[$start_value]:$tab[$start_value]:$tab[$ref_value]:$tab[$alt_value]:$geneSymbol"}}, $_);
|
|
878 }
|
|
879 close F1;
|
|
880
|
|
881 # print "\t\tRecoverStrand: $input\n";
|
|
882
|
|
883 #################### Convert the database file into a hash table
|
|
884 my %h_database = ();
|
|
885 my ($db_geneSymbol_value, $db_strandInfo_value, $db_chr_value) = (12, 3, 2);
|
|
886
|
|
887 my $folderNameDB = $refGenome."db";
|
|
888 my $fileNameDB = $refGenome."_refGene.txt";
|
|
889
|
|
890 open(F1, "$pathDB/$fileNameDB") or die "$!: $pathDB/$fileNameDB\n";
|
|
891 while(<F1>)
|
|
892 {
|
|
893 $_ =~ s/[\r\n]+$//;
|
|
894 my @tab = split("\t", $_);
|
|
895 my $strand = "";
|
|
896 $strand = $tab[$db_strandInfo_value];
|
|
897 if($strand eq "") { print STDERR "Error message:\n"; print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; }
|
|
898 else
|
|
899 {
|
|
900 # Some genes have several strand orientation, keep the first in the database
|
|
901 if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; }
|
|
902 }
|
|
903 }
|
|
904 close F1;
|
|
905
|
|
906 #################### Parse the two hash tables for recover the strand information
|
|
907 open(OUT, ">", $output) or die "$!: $output\n";
|
|
908
|
|
909
|
|
910 ## Add the header only for the firts part of the files
|
|
911 if($input =~ /\-aa/)
|
|
912 {
|
|
913 my @tabHeaderInput = "";
|
|
914 $annovar_header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $annovar_header);
|
|
915 # Save the length of the Annovar header for the next function (RecoverGenomicSequence)
|
|
916 $$refS_lengthAVheader = $#tabHeaderInput;
|
|
917
|
|
918 # Print the Annovar header until the column before OtherInfo
|
|
919 print OUT "$tabHeaderInput[0]";
|
|
920 for(my $i=1; $i<$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; }
|
|
921 print OUT "\tStrand";
|
|
922 print OUT "\t",$headerOriginalFile;
|
|
923 }
|
|
924
|
|
925
|
|
926 # Timer for comparing the number of SNVs present in the hash table
|
|
927 my $timerUniqueSNVs = 0;
|
|
928 # Timer for comparing the number of SNVs with the strand orientation
|
|
929 my $timerSNVsStrand = 0;
|
|
930
|
|
931 foreach my $kFile (sort keys %h_inputFile)
|
|
932 {
|
|
933 my $test = 0;
|
|
934 my @tab = split(":", $kFile);
|
|
935
|
|
936 # Sometimes the line is not printed correctely !!!!! :@
|
|
937 my @tHeaderInput = split("\t", $annovar_header); my @lengthLine = split("\t", $h_inputFile{$kFile}[0]);
|
|
938 my @tHeaderOriginalFile = split("\t", $headerOriginalFile);
|
|
939 my $lengthHeader = @tHeaderInput + (scalar(@tHeaderOriginalFile)-1) ; my $lengthLine = @lengthLine;
|
|
940
|
|
941 # Save the length of the Annovar header for the next function (RecoverGenomicSequence)
|
|
942 $$refS_lengthAVheader = $#tHeaderInput;
|
|
943
|
|
944 foreach my $kDB (sort keys %h_database)
|
|
945 {
|
|
946 if("$tab[5]:$tab[0]" eq $kDB)
|
|
947 {
|
|
948 if($lengthHeader != $lengthLine)
|
|
949 {
|
|
950 print STDERR "Error message:\n";
|
|
951 print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n";
|
|
952 }
|
|
953
|
|
954 foreach my $line (@{$h_inputFile{$kFile}})
|
|
955 {
|
|
956 my @tab = split("\t", $line);
|
|
957 my $j = 0;
|
|
958
|
|
959 for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i }
|
|
960 print OUT $h_database{$kDB};
|
|
961 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
|
|
962 print OUT "\n";
|
|
963 }
|
|
964 $timerSNVsStrand++;
|
|
965 $test = 1; last;
|
|
966 }
|
|
967 }
|
|
968 # The strand orientation isn't defined
|
|
969 if($test == 0)
|
|
970 {
|
|
971 my @tHeaderInput = split("\t", $annovar_header);
|
|
972 foreach my $line (@{$h_inputFile{$kFile}})
|
|
973 {
|
|
974 my @tab = split("\t", $line);
|
|
975 my $j = 0;
|
|
976 for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i }
|
|
977 print OUT "NA";
|
|
978 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
|
|
979 print OUT "\n";
|
|
980 }
|
|
981 $timerSNVsStrand++;
|
|
982 }
|
|
983 $timerUniqueSNVs++;
|
|
984 }
|
|
985 close OUT;
|
|
986
|
|
987 # print "Strand orientation recover for $timerSNVsStrand SNVs out of $timerUniqueSNVs uniques\n";
|
|
988 }
|
|
989
|
|
990 sub RecoverGenomicSequence
|
|
991 {
|
|
992 my ($inputFile, $length_AVheader, $intervalEnd, $referenceGenome, $pathToRefSeq, $output) = @_;
|
|
993
|
|
994 ############ 1) Transform the input file in a hash table: one for recover the sequence context and one for keeping the original file
|
|
995 my %h_inputFileForSeqContext = (); my %h_inputFile = ();
|
|
996 my $header = "";
|
|
997 CreateHashTable_from_InputFile($inputFile, $length_AVheader, \$header, $intervalEnd, \%h_inputFileForSeqContext, \%h_inputFile);
|
|
998
|
|
999 sub CreateHashTable_from_InputFile
|
|
1000 {
|
|
1001 my ($input, $length_AVheader, $refS_header, $intervalEnd, $refH_inputFileForSeqContext, $refH_inputFile) = @_;
|
|
1002
|
|
1003 my ($chr_value, $start_value, $strand_value) = (0, 1, $length_AVheader);
|
|
1004
|
|
1005 my $countregion = 0;
|
|
1006 my %allchr = ();
|
|
1007
|
|
1008 open(F1, $input) or die "$!: $input\n";
|
|
1009 if($input =~ /\-aa/) { $$refS_header = <F1>; }
|
|
1010
|
|
1011 while(<F1>)
|
|
1012 {
|
|
1013 $_ =~ s/[\r\n]+$//;
|
|
1014 my @tab = split("\t", $_);
|
|
1015
|
|
1016 my $name = "$tab[$chr_value]:$tab[$start_value]";
|
|
1017 my $start = $tab[$start_value] - $intervalEnd;
|
|
1018 my $end = $tab[$start_value] + $intervalEnd;
|
|
1019
|
|
1020 $start--; #make zero-start coordinate, to be consistent with UCSC
|
|
1021 my $exonpos = "$tab[$chr_value]:$start";
|
|
1022
|
|
1023 push @{$refH_inputFileForSeqContext->{$tab[$chr_value]}}, [$name, $start, $end, $tab[$strand_value], $exonpos];
|
|
1024 push(@{$refH_inputFile->{"$tab[$chr_value]\t$start\t$end"}}, $_);
|
|
1025 $countregion++;
|
|
1026 $allchr{$tab[$chr_value]}++;
|
|
1027 }
|
|
1028 close F1;
|
|
1029 }
|
|
1030
|
|
1031 ############ 2) Extract the sequence context from the hash table
|
|
1032 my %h_allRegionSeqContext = ();
|
|
1033 my $refSeq = $pathToRefSeq;
|
|
1034 Extract_SequenceContext(\%h_inputFileForSeqContext, $referenceGenome, $refSeq, \%h_allRegionSeqContext);
|
|
1035
|
|
1036 sub Extract_SequenceContext
|
|
1037 {
|
|
1038 my ($refH_allRegion, $referenceGenome, $refSeq, $refH_allRegionSeqContext) = @_;
|
|
1039
|
|
1040 my $folderDB = $referenceGenome."db";
|
|
1041 my $folderSeq = $referenceGenome."_seq";
|
|
1042 my $seqdir = "$refSeq/$folderSeq";
|
|
1043
|
|
1044 my %seqhash = (); #database sequence for each chromosome
|
|
1045 my %name_seq = (); #sequence for each region
|
|
1046 my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon
|
|
1047 my ($count_success, @failure) = (0);
|
|
1048
|
|
1049 for my $curchr (sort keys %{$refH_allRegion})
|
|
1050 {
|
|
1051 my ($seqid, $curseq) = ('', '');
|
|
1052 my $fastafile = "";
|
|
1053 if ($curchr =~ m/^chr/)
|
|
1054 {
|
|
1055 %seqhash = (); #clear the seqhash storage
|
|
1056 $fastafile = "$seqdir/$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name
|
|
1057 }
|
|
1058 else
|
|
1059 {
|
|
1060 %seqhash = (); #clear the seqhash storage
|
|
1061 $fastafile = "$seqdir/chr$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name
|
|
1062 }
|
|
1063 if (not -e $fastafile) { #to handle cases where no "chr" prefix is given
|
|
1064 print "WARNING: the FASTA file $curchr.fa cannot be retrieved from the specified directory $seqdir. Sequences in this chromosome will not be processed\n";
|
|
1065 next;
|
|
1066 }
|
|
1067
|
|
1068 if (not %seqhash)
|
|
1069 {
|
|
1070 open (FASTA, $fastafile) or print "WARNING: cannot read from FASTA file $fastafile so sequences in $curchr will not be processed: $!\n" and next;
|
|
1071 while (<FASTA>)
|
|
1072 {
|
|
1073 if (m/^>(\S+)/)
|
|
1074 {
|
|
1075 $seqid and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it
|
|
1076 $seqid = $1;
|
|
1077 $curseq = '';
|
|
1078 }
|
|
1079 else
|
|
1080 {
|
|
1081 s/[\r\n]+$//;
|
|
1082 $curseq .= uc $_; #only use upper case characters
|
|
1083 }
|
|
1084 }
|
|
1085 close FASTA;
|
|
1086 $seqhash{$seqid} = $curseq;
|
|
1087 }
|
|
1088 if (not $seqhash{$curchr})
|
|
1089 {
|
|
1090 #this chromosome just do not have FASTA sequences (maybe users used a wrong seqdir
|
|
1091 print "WARNING: Unable to retrieve regions at $curchr due to lack of sequence information\n";
|
|
1092 next;
|
|
1093 }
|
|
1094
|
|
1095 for my $i (0 .. @{$refH_allRegion->{$curchr}}-1)
|
|
1096 {
|
|
1097 my ($name, $start, $end, $strand, $exonpos) = @{$refH_allRegion->{$curchr}[$i]};
|
|
1098 my @start = split (/,/, $start);
|
|
1099 my @end = split (/,/, $end);
|
|
1100 my $seq;
|
|
1101 for my $i (0..@start-1)
|
|
1102 {
|
|
1103 if ($start[$i] >= length ($seqhash{$curchr}))
|
|
1104 {
|
|
1105 #here there must be an annotation error in user-specified gene/region definition file
|
|
1106 print "WARNING: Ignoring the start position start=$start[$i] since it is longer than the $curchr sequence (length=" , length($seqhash{$curchr}), ")\n";
|
|
1107 undef $seq;
|
|
1108 last;
|
|
1109 }
|
|
1110 $seq .= substr ($seqhash{$curchr}, $start[$i], $end[$i]-$start[$i]);
|
|
1111 }
|
|
1112
|
|
1113 if (defined $seq)
|
|
1114 {
|
|
1115 if (defined $seqlen{$name})
|
|
1116 {
|
|
1117 $seqlen{$name} != length ($seq) and warn "WARNING: the sequence $name was found more than once with different sequence lengths\n";
|
|
1118 $seqlen{$name} != length ($seq) and $discordlen{$name}++;
|
|
1119 }
|
|
1120 else { $seqlen{$name} = length ($seq); }
|
|
1121
|
|
1122 $name_seq{$name, $exonpos} = $seq;
|
|
1123 $count_success++;
|
|
1124
|
|
1125 # Put the sequence context in a hash table for Write the result after
|
|
1126 ## Some sequence context are NNNNNN or empty
|
|
1127 if( ($seq ne "NA") && ($seq =~ /N/i) ) { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = "NA"; }
|
|
1128 else { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = $seq; }
|
|
1129 }
|
|
1130 else
|
|
1131 {
|
|
1132 print "WARNING: DNA sequence for $name cannot be inferred\n";
|
|
1133 push @failure, $name;
|
|
1134 }
|
|
1135 }
|
|
1136 } # End for $curchr
|
|
1137 }
|
|
1138
|
|
1139 ############ 3) Create a file with the sequence context
|
|
1140 WriteFile_SeqContext($inputFile, $length_AVheader, \%h_inputFile, $header, \%h_allRegionSeqContext, $output);
|
|
1141
|
|
1142 sub WriteFile_SeqContext
|
|
1143 {
|
|
1144 my ($inputFile, $length_AVheader, $refH_InputFile, $header, $refH_allRegionSeqContext, $output) = @_;
|
|
1145
|
|
1146 open(OUT, ">", $output) or die "$!: $output\n";
|
|
1147
|
|
1148 ## Add the header only for the firts part of the files
|
|
1149 if($inputFile =~ /\-aa/)
|
|
1150 {
|
|
1151 my @tabHeaderInput = "";
|
|
1152
|
|
1153 $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header);
|
|
1154 # Print the Annovar header until the column before OtherInfo
|
|
1155 print OUT "$tabHeaderInput[0]";
|
|
1156 my $j = 0;
|
|
1157 for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; }
|
|
1158 print OUT "\tcontext\ttrinucleotide_context";
|
|
1159 for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; }
|
|
1160 print OUT "\n";
|
|
1161 }
|
|
1162
|
|
1163 foreach my $k_hFile (sort keys %{$refH_InputFile})
|
|
1164 {
|
|
1165 foreach my $k_allRegonSeqContext (sort keys %{$refH_allRegionSeqContext})
|
|
1166 {
|
|
1167 if($k_hFile eq $k_allRegonSeqContext)
|
|
1168 {
|
|
1169 my $j=0;
|
|
1170
|
|
1171 for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++)
|
|
1172 {
|
|
1173 my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]);
|
|
1174
|
|
1175 # Write Annovar annotation + strand orientation
|
|
1176 for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; }
|
|
1177 # Write the sequence context with the length defined by the user (default is 10)
|
|
1178 print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext};
|
|
1179 # Write the trinucleotide context
|
|
1180 my $contextSequence = $refH_allRegionSeqContext->{$k_allRegonSeqContext}; $contextSequence =~ tr/a-z/A-Z/;
|
|
1181 my @tempContextSequence = split("", $contextSequence);
|
|
1182 my $midlle_totalNbBaseContext = (scalar(@tempContextSequence)-1)/2; # For having the middle of the sequence
|
|
1183 print OUT "\t".$tempContextSequence[$midlle_totalNbBaseContext-1]."x".$tempContextSequence[$midlle_totalNbBaseContext+1];
|
|
1184 # Write the original columns
|
|
1185 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
|
|
1186 print OUT "\n";
|
|
1187 }
|
|
1188 last;
|
|
1189 }
|
|
1190 }
|
|
1191 }
|
|
1192 close OUT;
|
|
1193 }
|
|
1194 }
|
|
1195
|
|
1196 sub CombinedTempFile
|
|
1197 {
|
|
1198 my ($folderTempFile, $output) = @_;
|
|
1199
|
|
1200 my $cmd_cat_mt_results = "cat ";
|
|
1201
|
|
1202 foreach my $file (`ls $folderTempFile/*.txt`)
|
|
1203 {
|
|
1204 chomp($file);
|
|
1205 $cmd_cat_mt_results = $cmd_cat_mt_results." $file";
|
|
1206 }
|
|
1207 $cmd_cat_mt_results = $cmd_cat_mt_results." > $output";
|
|
1208 `$cmd_cat_mt_results`;
|
|
1209 }
|
|
1210
|
|
1211
|
|
1212 sub recoverNumCol
|
|
1213 {
|
|
1214 my ($input, $name_of_column) = @_;
|
|
1215
|
|
1216 open(F1,$input) or die "recoverNumCol: $!: $input\n";
|
|
1217 # For having the name of the columns
|
|
1218 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
|
|
1219 close F1;
|
|
1220 # The number of the column
|
|
1221 my $name_of_column_NB = "toto";
|
|
1222 for(my $i=0; $i<=$#tab_search_header; $i++)
|
|
1223 {
|
|
1224 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; }
|
|
1225 }
|
|
1226 if($name_of_column_NB eq "toto")
|
|
1227 {
|
|
1228 print STDERR "Error message:\n";
|
|
1229 print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n";
|
|
1230 }
|
|
1231 else { return $name_of_column_NB; }
|
|
1232 }
|
|
1233
|
|
1234
|
|
1235
|
|
1236
|
|
1237 =head1 NAME
|
|
1238
|
|
1239 mutspec-Annot
|
|
1240
|
|
1241 =head1 SYNOPSIS
|
|
1242
|
|
1243 mutspecannot.pl [arguments] <query-file>
|
|
1244
|
|
1245 <query-file> can be a folder with multiple VCF or a single VCF
|
|
1246
|
|
1247 Arguments:
|
|
1248 -h, --help print help message
|
|
1249 -m, --man print complete documentation
|
|
1250 -v, --verbose use verbose output
|
|
1251 --refGenome the reference genome to use
|
|
1252 --interval <interger> the number of bases for the sequence context
|
|
1253 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file
|
|
1254 -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size
|
|
1255 --pathAVDBList the path to a text file containing the list of Annovar databases installed
|
|
1256 -temp --pathTemporary <string> the path for saving the temporary files
|
|
1257 --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no)
|
|
1258 --max_cpu <integer> number of CPUs to be used for the annotation
|
|
1259
|
|
1260
|
|
1261 Function: automatically run a pipeline on a list of variants and annote them using Annovar
|
|
1262
|
|
1263 Example: # Annotation only
|
|
1264 mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input
|
|
1265
|
|
1266
|
|
1267 Version: 03-2017 (March 2017)
|
|
1268
|
|
1269
|
|
1270 =head1 OPTIONS
|
|
1271
|
|
1272 =over 8
|
|
1273
|
|
1274 =item B<--help>
|
|
1275
|
|
1276 print a brief usage message and detailed explanation of options.
|
|
1277
|
|
1278 =item B<--man>
|
|
1279
|
|
1280 print the complete manual of the program.
|
|
1281
|
|
1282 =item B<--verbose>
|
|
1283
|
|
1284 use verbose output.
|
|
1285
|
|
1286 =item B<--refGenome>
|
|
1287
|
|
1288 the reference genome to use, could be hg19 or mm9.
|
|
1289
|
|
1290 =item B<--interval>
|
|
1291
|
|
1292 the number of bases surrounding the mutated bases, for the sequence context analysis.
|
|
1293
|
|
1294 =item B<--outfile>
|
|
1295
|
|
1296 the directory of output file names. If it is nor specify the same directory as the input file is used.
|
|
1297
|
|
1298 =item B<--pathAnnovarDB>
|
|
1299
|
|
1300 the path to the directory containing the Annovar databases and the files with the chromosome size.
|
|
1301
|
|
1302 =item B<--pathAVDBList>
|
|
1303
|
|
1304 the path to a text file containing the list of Annovar databases installed.
|
|
1305
|
|
1306 =item B<--pathTemporary>
|
|
1307
|
|
1308 the path for saving temporary files generated by the script.
|
|
1309 If any is specify a temporary folder is created in the same directory where the script is running.
|
|
1310 Deleted when the script is finish
|
|
1311
|
|
1312 =item B<--fullAnnotation>
|
|
1313
|
|
1314 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines)
|
|
1315
|
|
1316 =item B<--max_cpu>
|
|
1317
|
|
1318 Specify the number of CPUs to be used. This number is used for spliting the file in n part and running the annotations in each part in parallel.
|
|
1319
|
|
1320
|
|
1321 =head1 DESCRIPTION
|
|
1322
|
|
1323 MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS.
|
|
1324 Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added.
|
|
1325 A text tab delimited file is produced.
|
|
1326
|
|
1327 =cut
|