comparison mutspecStat.pl @ 0:8c682b3a7c5b draft

Uploaded
author iarc
date Tue, 19 Apr 2016 03:07:11 -0400
parents
children 46a10309dfe2
comparison
equal deleted inserted replaced
-1:000000000000 0:8c682b3a7c5b
1 #!/usr/bin/env perl
2
3 #-----------------------------------#
4 # Author: Maude #
5 # Script: mutspecStat.pl #
6 # Last update: 09/04/16 #
7 #-----------------------------------#
8
9 use strict;
10 use warnings;
11 use Getopt::Long;
12 use Pod::Usage;
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
14 use File::Path;
15 use Statistics::R;
16 use Spreadsheet::WriteExcel;
17
18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
19 our ($refGenome, $output, $folder_temp, $path_R_Scripts, $path_SeqrefGenome) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path for saving the temporary files; The path to R scripts; The path to the fasta reference sequences
20 our ($poolData, $oneReportPerSample) = (2, 2); # If a folder is pass as input file pool all the data and generate the report on the pool and for each samples; # Generate one report for each samples
21
22
23 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'outfile|o=s' => \$output, 'pathTemporary|temp=s' => \$folder_temp, 'pathRscript=s' => \$path_R_Scripts, 'pathSeqRefGenome=s' => \$path_SeqrefGenome, 'poolData' => \$poolData, 'reportSample' => \$oneReportPerSample) or pod2usage(2);
24
25 our ($input) = @ARGV;
26
27 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
28 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
29 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
30 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
31
32
33
34 ######################################################################################################################################################
35 # GLOBAL VARIABLES #
36 ######################################################################################################################################################
37 # Recover the current path
38 our $pwd = `pwd`;
39 chomp($pwd);
40
41 # Path to R scripts
42 our $pathRScriptTxnSB = "$path_R_Scripts/R/transciptionalStrandBias.r";
43 our $pathRScriptMutSpectrum = "$path_R_Scripts/R/mutationSpectra_Galaxy.r";
44
45 our $folderMutAnalysis = "";
46 our @pathInput = split("/", $input);
47
48 # Hash table with the length of each chromosomes
49 our %chromosomes;
50
51 ######################################################################################################################################################
52 # MAIN #
53 ######################################################################################################################################################
54 # Check the presence of the flags and create the output and temp directories
55 CheckFlags();
56
57 # Retrieve chromosomes length
58 checkChrDir();
59
60
61 print "-----------------------------------------------------------------\n";
62 print "-----------------Report Mutational Analysis----------------------\n";
63 print"-----------------------------------------------------------------\n";
64
65 # First check if the file is annotated or not
66 CheckAnnotationFile($input);
67
68 # Calculate the statistics and generate the report
69 my @colInfoAV = qw(Chr Start Ref Alt);
70 ReportMutDist($input, $folderMutAnalysis, $folder_temp, \@colInfoAV, $refGenome);
71
72 # Remove the temporary directory
73 rmtree($folder_temp);
74
75
76 ######################################################################################################################################################
77 # FUNCTIONS #
78 ######################################################################################################################################################
79
80 # Check the presence of the flags and create the output and temp directories
81 sub CheckFlags
82 {
83 # Check the reference genome
84 if($refGenome eq "empty") { print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; }
85
86 # If no output is specified write the result as the same place as the input file
87 if($output eq "empty")
88 {
89 my $folderRes = "";
90 for(my $i=0; $i<$#pathInput; $i++) { $folderRes .= "$pathInput[$i]/"; }
91
92 $folderMutAnalysis = "$folderRes/Mutational_Analysis";
93 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
94 }
95 else
96 {
97 if(!-e $output) { mkdir($output) or die "$!: $output\n"; }
98
99 $folderMutAnalysis = "$output/Mutational_Analysis";
100 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
101 }
102
103 # If no temp folder is specified write the result in the current path
104 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$pathInput[$#pathInput]"; }
105 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; }
106
107 # Check the path to the R scripts
108 if($path_R_Scripts eq "empty") { print STDERR "You forget to specify the path for the R scripts!!!\nPlease specify it with the flag --pathRscript\n"; exit; }
109
110
111 # The input is a folder
112 if(-d $input) { foreach my $file (`ls $input`) { CheckLengthFilename("$input/$file"); } }
113 # The input is one file
114 else { CheckLengthFilename($input); }
115 }
116 # Check the length of the file, must be < 32 characters for the Excel sheet
117 sub CheckLengthFilename
118 {
119 my ($inputFile) = @_;
120
121 ## Verify the name of file, must be <= 31 chars for the sheet name
122 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);
123
124 if(length($filename) > 31) { print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; exit; }
125 }
126
127 # Retrieve chromosomes length
128 sub checkChrDir
129 {
130 my @files = `ls $path_SeqrefGenome/$refGenome"_seq"/*.fa`;
131 foreach my $file (@files)
132 {
133 if ($file !~ /chr(\d+|x|y)\.fa/i){next;}
134 open(FILE,$file);
135 <FILE>;
136 my $seq="";
137 while (<FILE>){ chomp; $seq.=$_;}
138 $file =~ /chr(.*)\.fa/;
139 $chromosomes{"chr".$1}=length($seq);
140 }
141 }
142
143 # Check if the file is annotated or not
144 sub CheckAnnotationFile
145 {
146 my ($inputFile) = @_;
147
148 # A folder is pass in argument
149 if(-d $inputFile)
150 {
151 foreach my $file (`ls $inputFile`)
152 {
153 chomp($file);
154
155 open(F1, "$inputFile/$file") or die "$!: $inputFile/$file\n";
156 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
157 close F1;
158 # The number of the column
159 my $value_of_column_NB = "toto";
160 for(my $i=0; $i<=$#tab_search_header; $i++)
161 {
162 if($tab_search_header[$i] eq "Func.refGene") { $value_of_column_NB = $i; }
163 }
164 if($value_of_column_NB eq "toto") { print STDERR "Error the input file you specify is not annotated! $inputFile/$file !!!!\nPlease first annotate your file before trying to generate the report on mutation patterns\n"; exit; }
165 }
166 }
167 else
168 {
169 open(F1, $inputFile) or die "$!: $inputFile\n";
170 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
171 close F1;
172 # The number of the column
173 my $value_of_column_NB = "toto";
174 for(my $i=0; $i<=$#tab_search_header; $i++)
175 {
176 if($tab_search_header[$i] eq "Func.refGene") { $value_of_column_NB = $i; }
177 }
178 if($value_of_column_NB eq "toto") { print STDERR "Error the input file you specify is not annotated! $inputFile !!!!\nPlease first annotate your file before trying to generate the report on mutation patterns\n"; exit; }
179 }
180 }
181
182 # Calculate the statistics and generate the report
183 sub ReportMutDist
184 {
185 our ($input, $output, $folder_temp, $refTab_colInfo, $refGenome) = @_;
186
187 my @column = @$refTab_colInfo;
188
189 our ($chr_name, $start_name, $ref_name, $alt_name) = split(/,/, join(',', @column)); # Separe each element
190
191 our $func_name = "Func.refGene";
192 our $exonicFunc_name = "ExonicFunc.refGene";
193 our $strand_name = "Strand";
194 our $context_name = "context";
195
196 my $folderFigure = "$output/Figures";
197 if(-e $folderFigure) { rmtree($folderFigure); mkdir($folderFigure) or die "Can't create the directory $folderFigure\n"; }
198 else { mkdir($folderFigure) or die "Can't create the directory $folderFigure\n"; }
199 my $folderChi2 = "$folderFigure/Chi2";
200 if(!-e $folderChi2) { mkdir($folderChi2) or die "Can't create the directory $folderChi2\n"; }
201 my $folderWebLogo = "$folderFigure/WebLogo";
202 if(!-e $folderWebLogo) { mkdir($folderWebLogo) or die "Can't create the directory $folderWebLogo\n"; }
203 my $folderNMF = "$folderFigure/Input_NMF";
204 if(!-e $folderNMF) { mkdir($folderNMF) or die "Can't create the directory $folderNMF\n"; }
205
206 ################################################################################################
207 ### Calculates all the statistics ###
208 ################################################################################################
209 ############ Recover Annovar annotations (for having the save number of functional regions for each samples)
210 my @tab_func = recoverAnnovarAnnotation($input, $func_name);
211 if(@tab_func == 0) { print STDERR "Error the table for the functional region is empty!!!!! check $input $func_name\n"; exit; }
212
213 ############ Calculate the different statistics present in the report
214 my %h_file = ();
215 CalculateStatistics(\%h_file, \@tab_func);
216
217 ############ Calculate the chi2 for the strand bias
218 CalculateChi2(\%h_file, $folderChi2);
219
220 ############ Write the different statistics present in the report
221 WriteStatistics(\%h_file, $#tab_func, $folderFigure, $folderChi2, $folderNMF);
222
223 ############ Create logo for studying the 10 flanking bases of the mutation
224 CreateLogo(\%h_file, $folderWebLogo);
225
226
227 ################### Subroutines for generating the report for the mutational analysis
228 sub recoverAnnovarAnnotation
229 {
230 my ($input, $AV_annotation) = @_;
231
232 my %hash = ();
233
234 # The input is a folder
235 if(-d $input)
236 {
237 foreach my $file (`ls $input`)
238 {
239 $file =~ s/[\r\n]+$//;
240 my $AV_annotation_value = recoverNumCol("$input/$file", $AV_annotation);
241
242 open(F1, "$input/$file") or die "$!: $input/$file\n";
243 my $header = <F1>;
244 while(<F1>)
245 {
246 $_ =~ s/[\r\n]+$//;
247 my @tab = split("\t", $_);
248
249 # Some files can have an empty line at the end and WE DON'T WANT to consider it
250 if(! defined $tab[0]) { next; }
251 # Some func value are repeated and separated by ";"
252 my $funcSegment = "";
253 if($tab[$AV_annotation_value] =~ /;/) { my @temp = split(";", $tab[$AV_annotation_value]); $funcSegment = $temp[0]; }
254 else { $funcSegment = $tab[$AV_annotation_value]; }
255
256 $hash{$funcSegment} = "";
257 }
258 close F1;
259 }
260 my @tab_AVAnnotation = ();
261 foreach my $k (sort keys %hash) { push(@tab_AVAnnotation, $k); }
262 return @tab_AVAnnotation;
263 }
264 # The input is a file
265 else
266 {
267 my $AV_annotation_value = recoverNumCol($input, $AV_annotation);
268
269 open(F1, $input) or die "$!: $input\n";
270 my $header = <F1>;
271 while(<F1>)
272 {
273 $_ =~ s/[\r\n]+$//;
274 my @tab = split("\t", $_);
275
276 # Some func value are repeated and separated by ";"
277 my $funcSegment = "";
278 if($tab[$AV_annotation_value] =~ /;/) { my @temp = split(";", $tab[$AV_annotation_value]); $funcSegment = $temp[0]; }
279 else { $funcSegment = $tab[$AV_annotation_value]; }
280
281 $hash{$funcSegment} = "";
282 }
283 close F1;
284 my @tab_AVAnnotation = ();
285 foreach my$k (sort keys %hash) { push(@tab_AVAnnotation, $k); }
286 return @tab_AVAnnotation;
287 }
288 }
289 # Calculate the different statistics present in the report
290 sub CalculateStatistics
291 {
292 my ($refH_file, $refT_func) = @_;
293
294 my ($chr_value, $start_value, $ref_value, $alt_value, $func_value, $exonicFunc_value, $strand_value, $contextSeq_value) = ("", "", "", "", "", "", "", "", "", "");
295
296 # If the input is a folder
297 if(-d $input)
298 {
299 my $folderPool = "$folder_temp/Pool";
300 if(!-e $folderPool) { mkdir($folderPool) or die "Can't create the directory $folderPool\n"; }
301
302 # Copy each sample
303 foreach my $file (`ls $input`) { chomp($file); system("cp $input/$file $folderPool"); }
304
305 # Generate the pool of all the data
306 if($poolData == 1)
307 {
308 my @listFile = `ls $input`;
309
310 # For keeping the header only one time
311 chomp($listFile[0]);
312 system("cp $input/$listFile[0] $folderPool/Pool_Data.txt");
313
314 open(OUT, ">>", "$folderPool/Pool_Data.txt") or die "$!: $folderPool/Pool_Data.txt\n";
315
316 for(my $i=1; $i<=$#listFile; $i++)
317 {
318 chomp($listFile[$i]);
319 open(F1, "$input/$listFile[$i]") or die "$!: $input/$listFile[$i]\n";
320 my $header = <F1>;
321 while(<F1>) { print OUT $_; }
322 close F1;
323 }
324 close OUT;
325 }
326
327 foreach my $file (`ls $folderPool`)
328 {
329 chomp($file);
330 ############ Recover the number of the columns of interest
331 $chr_value = recoverNumCol("$folderPool/$file", $chr_name);
332 $start_value = recoverNumCol("$folderPool/$file", $start_name);
333 $ref_value = recoverNumCol("$folderPool/$file", $ref_name);
334 $alt_value = recoverNumCol("$folderPool/$file", $alt_name);
335 $func_value = recoverNumCol("$folderPool/$file", $func_name);
336 $exonicFunc_value = recoverNumCol("$folderPool/$file", $exonicFunc_name);
337 $strand_value = recoverNumCol("$folderPool/$file", $strand_name);
338 $contextSeq_value = recoverNumCol("$folderPool/$file", $context_name);
339 ############ Recover the number of the columns of interest
340
341 ############ Control the annotated file pass in argument
342 ## Check if the files have variants
343 my $nbLines_originalFile = `wc -l $folderPool/$file`; $nbLines_originalFile =~ /(\d+) /;
344 if($1==1) { print STDERR "\n\nNo line in the file $folderPool/$file\n\n"; exit; }
345 ## Check if there is variant with strand information. If not the rest of the script generates errors
346 my $testFile = 0;
347 CheckVariantReport("$folderPool/$file", $strand_value, \$testFile);
348 if($testFile==0) { print STDERR "\n\nNo strand information for the file $folderPool/$file\n\n"; exit; }
349 ############ Control the annotated file pass in argument
350
351 ############ Calculate the statistics
352 File2Hash("$folderPool/$file", $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func);
353 }
354 }
355 # If the input is a file
356 else
357 {
358 ############ Recover the number of the columns of interest
359 $chr_value = recoverNumCol($input, $chr_name);
360 $start_value = recoverNumCol($input, $start_name);
361 $ref_value = recoverNumCol($input, $ref_name);
362 $alt_value = recoverNumCol($input, $alt_name);
363 $func_value = recoverNumCol($input, $func_name);
364 $exonicFunc_value = recoverNumCol($input, $exonicFunc_name);
365 $strand_value = recoverNumCol($input, $strand_name);
366 $contextSeq_value = recoverNumCol($input, $context_name);
367 ############ Recover the number of the columns of interest
368
369 ############ Control the annotated file pass in argument
370 ## Check if the files have variants
371 my $nbLines_originalFile = `wc -l $input`; $nbLines_originalFile =~ /(\d+) /;
372 if($1==1) { print STDERR "\n\nNo line in the file $input\n\n"; exit; }
373 ## Check if there is variant with strand information. If not the rest of the script generates errors
374 my $testFile = 0;
375 CheckVariantReport($input, $strand_value, \$testFile);
376 if($testFile==0) { print STDERR "\n\nNo strand information for the file $input\n\n"; exit; }
377 ############ Control the annotated file pass in argument
378
379 ############ Calculate the statistics
380 File2Hash($input, $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func);
381 }
382 }
383 # Check if there is at least one variant with a strand information
384 sub CheckVariantReport
385 {
386 my ($file, $strand_value, $refS_testFile) = @_;
387
388 open(F1, $file) or die "$!: $file\n";
389 my $header = <F1>;
390 while(<F1>)
391 {
392 $_ =~ s/[\r\n]+$//;
393 my @tab = split("\t", $_);
394
395 if( ($tab[$strand_value] eq "+") || ($tab[$strand_value] eq "-") ) { $$refS_testFile++; }
396 }
397 close F1;
398 }
399 # Convert the annotated VCF into a hash table
400 sub File2Hash
401 {
402 my ($inputFile, $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func) = @_;
403 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);
404
405 # Initialisation of the hash
406 my @tab_mutation = qw(C:G>A:T C:G>G:C C:G>T:A T:A>A:T T:A>C:G T:A>G:C);
407 my @tab_aaChange = ("NonTr", "Tr", "TotalMutG");
408 my @tabExoFunc = ("frameshift insertion", "frameshift deletion", "frameshift block substitution", "frameshift substitution", "stopgain", "stoploss", "nonframeshift insertion", "nonframeshift deletion", "nonframeshift substitution", "nonframeshift block substitution", "nonsynonymous SNV", "synonymous SNV", "unknown", "NA");
409
410 # Total number of SBS on the genomic strand
411 $refH_file->{$filename}{'TotalSBSGenomic'} = 0;
412 # Total number of Indel on the genomic strand
413 $refH_file->{$filename}{'TotalIndelGenomic'} = 0;
414 # Total number of SBS on the coding strand
415 $refH_file->{$filename}{'TotalSBSCoding'} = 0;
416 # Total number of SBS and Indel on the genomic strand
417 $refH_file->{$filename}{'TotalMutGenomic'} = 0;
418
419 #####################################
420 # SBS by segment (6 mutation types) #
421 #####################################
422 foreach my $elt_tabFunc (@$refT_func)
423 {
424 foreach my $elt_tabMutation (@tab_mutation)
425 {
426 foreach my $elt_aaChange (@tab_aaChange)
427 {
428 $refH_file->{$filename}{'6mutType'}{$elt_tabFunc}{$elt_tabMutation}{$elt_aaChange} = 0;
429 }
430 }
431 }
432
433 #######################
434 # Pearson correlation #
435 #######################
436 $refH_file->{$filename}{'SBSPerChr'}{'AllMutType'} = 0;
437 # Count of SBS per chromosome foreach mutation types
438 foreach my $elt_tabMutation (@tab_mutation)
439 {
440 foreach my $chromosome (sort keys %chromosomes){ $refH_file->{$filename}{'SBSPerChr'}{$elt_tabMutation}{'CHR'}{$chromosome}{'chr'} = 0;}
441 $refH_file->{$filename}{'SBSPerChr'}{$elt_tabMutation}{'Pearson'} = 0;
442 }
443
444 foreach my $chromosome (sort keys %chromosomes){
445 $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'}=0;
446 }
447
448 ############################
449 # Impact of SBS on protein #
450 ############################
451 foreach my $elt_exoFunc (@tabExoFunc)
452 {
453 $refH_file->{$filename}{'ImpactSBS'}{$elt_exoFunc} = 0;
454 }
455
456 #####################################
457 # Sequence context (genomic strand) #
458 #####################################
459 my @tab_mutation2 = qw(C>A C>G C>T T>A T>C T>G);
460 my @tab_context = qw(A_A A_C A_G A_T C_A C_C C_G C_T G_A G_C G_G G_T T_A T_C T_G T_T);
461 foreach my $elt_context (@tab_context)
462 {
463 foreach my $elt_mutation3 (@tab_mutation2)
464 {
465 $refH_file->{$filename}{'SeqContextG'}{$elt_context}{$elt_mutation3} = 0;
466 }
467 }
468
469 ####################################
470 # Sequence context (coding strand) #
471 ####################################
472 my @tab_TrNonTr = qw(NonTr Tr);
473 foreach my $elt_context (@tab_context)
474 {
475 foreach my $elt_mutation2 (@tab_mutation2)
476 {
477 foreach my $trNonTr (@tab_TrNonTr)
478 {
479 $refH_file->{$filename}{'SeqContextC'}{$elt_context}{$elt_mutation2}{$trNonTr} = 0;
480 }
481 }
482 }
483
484 open(F1,$inputFile) or die "$!: $inputFile\n";
485 my $header = <F1>;
486 while(<F1>)
487 {
488 $_ =~ s/[\r\n]+$//;
489 my @tab = split("\t", $_);
490
491 # Random chromosome and chromosome M
492 if( ($tab[$chr_value] =~ /random/i) || ($tab[$chr_value] =~ /M/i) ) { next; }
493
494 ############################################## Extract the base just before and after the mutation ##############################################
495 my $context = "";
496 my $contextSequence = $tab[$contextSeq_value]; $contextSequence =~ tr/a-z/A-Z/;
497 my @tempContextSequence = split("", $contextSequence);
498 my $total_nbBaseContext = $#tempContextSequence;
499 my $midlle_totalNbBaseContext = $total_nbBaseContext/2; # For having the middle of the sequence
500 my $before = $midlle_totalNbBaseContext - 1; my $after = $midlle_totalNbBaseContext + 1;
501 $context = $tempContextSequence[$before]."_".$tempContextSequence[$after];
502 ############################################## Extract the base just before and after the mutation ##############################################
503
504
505 ############################################################### Impact on protein ###############################################################
506 my $exoFunc = "";
507 # Sometimes the annotation is repeated frameshift deletion;frameshift deletion
508 if($tab[$exonicFunc_value] =~ /\;/)
509 {
510 my @temp = split(";", $tab[$exonicFunc_value]);
511 if($temp[0] eq $temp[1]) { $exoFunc = $temp[0]; }
512 }
513 # The annotations have changed after MAJ Annovar 2014Jul22 (stopgain SNV => stopgain)
514 elsif($tab[$exonicFunc_value] eq "stopgain SNV") { $exoFunc = "stopgain"; }
515 elsif($tab[$exonicFunc_value] eq "stoploss SNV") { $exoFunc = "stoploss"; }
516 elsif($tab[$exonicFunc_value] eq "nonsynonymous_SNV") { $exoFunc = "nonsynonymous SNV"; }
517 elsif($tab[$exonicFunc_value] eq "stopgain_SNV") { $exoFunc = "stopgain SNV"; }
518 elsif($tab[$exonicFunc_value] eq "synonymous_SNV") { $exoFunc = "synonymous SNV"; }
519 else { $exoFunc = $tab[$exonicFunc_value]; }
520
521 if(exists $refH_file->{$filename}{'ImpactSBS'}{$exoFunc})
522 {
523 # If the sequence context if not recovered correctly don't considered the variants
524 if( ($context =~ /N/) || (length($context) != 3) ) { next; }
525
526 $refH_file->{$filename}{'ImpactSBS'}{$exoFunc}++;
527 $refH_file->{$filename}{'TotalMutGenomic'}++;
528 }
529 else { print "WARNING: Exonic function not considered: $exoFunc\n"; }
530 ############################################################### Impact on protein ###############################################################
531
532 ################################################### Only SBS are considered for the statistics ##################################################
533 if( ($tab[$ref_value] =~ /^[ACGT]$/i) && ($tab[$alt_value] =~ /^[ACGT]$/i) )
534 {
535 # If the sequence context if not recovered correctly don't considered the variants
536 if( ($context =~ /N/) || (length($context) != 3) ) { next; }
537
538 # Total number of SBS on the genomic strand
539 $refH_file->{$filename}{'TotalSBSGenomic'}++;
540
541 # Total number of SBS on the coding strand with a sequence context
542 if( ($tab[$strand_value] eq "+") || ($tab[$strand_value] eq "-") )
543 {
544 if( ($context ne "NA") && (($context =~ /N/) || (length($context) != 3)) ) { next; }
545 $refH_file->{$filename}{'TotalSBSCoding'}++;
546 }
547 }
548 else { $refH_file->{$filename}{'TotalIndelGenomic'}++; }
549 ################################################### Only SBS are considered for the statistics ##################################################
550
551 # Number of SBS per chromosome: remove the "chr"
552 my $chrNameForH=$tab[$chr_value];
553 if(exists $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chrNameForH}{'chr'}++; }
554
555
556 ################################################### Some func value are repeated and separated by ";" ##################################################
557 my $funcSegment = "";
558 if($tab[$func_value] =~ /;/) { my @temp = split(";", $tab[$func_value]); $funcSegment = $temp[0]; }
559 else { $funcSegment = $tab[$func_value]; }
560
561
562 ############################################################### MUTATION C> #############################################
563 ###################################### C:G>A:T
564 if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "A")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "T") ) )
565 {
566 my $mutation = "C:G>A:T";
567 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations
568
569 # Pearson correlation
570 if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; }
571
572 # Sequence context - 6 mutation types - genomic strand
573 my $mutationSeqContext6mutType = "C>A";
574 # We want to express the mutation in C>
575 if( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "T") )
576 {
577 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
578 my $context_reverse = $base5."_".$base3;
579 if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; }
580 }
581 elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; }
582
583 # Strand analysis C>A on NonTr strand
584 if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "A"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "T"))) )
585 {
586 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; }
587
588 # C>A With the sequence context (C>A strand = +)
589 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "A")) )
590 {
591 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>A'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>A'}{'NonTr'}++; }
592 }
593 # C>A With the sequence context (G>T strand = -)
594 else
595 {
596 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
597 my $context_reverse = $base5."_".$base3;
598 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>A'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>A'}{'NonTr'}++; }
599 }
600 }
601 # Strand analysis C>A on Tr strand
602 if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "A"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "T"))) )
603 {
604 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; }
605
606 # C>A With the sequence context (C>A strand = -)
607 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "A")) )
608 {
609 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>A'}{'Tr'}) { { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>A'}{'Tr'}++; } }
610 }
611 # C>A with the sequence context (G>T strand = +)
612 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "T")) )
613 {
614 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
615 my $context_reverse = $base5."_".$base3;
616 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>A'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>A'}{'Tr'}++; }
617 }
618 }
619 # WebLogo-3
620 if(($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "A"))
621 {
622 # For the logo all the sequences must have the same length
623 if(scalar(@tempContextSequence) == 2) { next; }
624 my ($contextTemp1, $contextTemp2) = ("", "");
625 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; }
626 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; }
627 my $context = $contextTemp1."C".$contextTemp2;
628 push(@{$refH_file->{$filename}{'WebLogo3'}{'CA'}}, $context);
629 }
630 else
631 {
632 if(scalar(@tempContextSequence) == 2) { next; }
633 my ($contextTemp1, $contextTemp2) = ("", "");
634 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); }
635 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); }
636 my $context = $contextTemp1."C".$contextTemp2; $context = reverse $context;
637 push(@{$refH_file->{$filename}{'WebLogo3'}{'CA'}}, $context);
638 }
639 }
640 ###################################### C:G>G:C
641 if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "G")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "C") ) )
642 {
643 my $mutation = "C:G>G:C";
644 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations
645
646 # Pearson correlation
647 if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; }
648
649 # Sequence context - 6 mutation types - genomic strand
650 my $mutationSeqContext6mutType = "C>G";
651 # We want to express the mutation in C>
652 if( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "C") )
653 {
654 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
655 my $context_reverse = $base5."_".$base3;
656 if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; }
657 }
658 elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; }
659
660 # Strand analysis C>G on NonTr strand
661 if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "G"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "C"))) )
662 {
663 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; }
664
665 # C>G with the sequence context (C>G strand = +)
666 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "G")) )
667 {
668 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>G'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>G'}{'NonTr'}++; }
669 }
670 # C>G with the sequence context (G>C strand = -)
671 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "C")) )
672 {
673 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
674 my $context_reverse = $base5."_".$base3;
675 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>G'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>G'}{'NonTr'}++; }
676 }
677 }
678 # Strand analysis C>G on Tr strand
679 if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "G"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "C"))) )
680 {
681 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; }
682
683 # C>G with the sequence context (C>G strand = -)
684 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "G")) )
685 {
686 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>G'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>G'}{'Tr'}++; }
687 }
688 # C>G with the sequence context (G>C strand = +)
689 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "C")) )
690 {
691 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
692 my $context_reverse = $base5."_".$base3;
693 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>G'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>G'}{'Tr'}++; }
694 }
695 }
696 # WebLogo-3
697 if(($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "G"))
698 {
699 if(scalar(@tempContextSequence) == 2) { next; }
700 my ($contextTemp1, $contextTemp2) = ("", "");
701 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; }
702 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; }
703 my $context = $contextTemp1."C".$contextTemp2;
704 push(@{$refH_file->{$filename}{'WebLogo3'}{'CG'}}, $context);
705 }
706 else
707 {
708 if(scalar(@tempContextSequence) == 2) { next; }
709 my ($contextTemp1, $contextTemp2) = ("", "");
710 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); }
711 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); }
712 my $context = $contextTemp1."C".$contextTemp2; $context = reverse $context;
713 push(@{$refH_file->{$filename}{'WebLogo3'}{'CG'}}, $context);
714 }
715 }
716 ###################################### C:G>T:A
717 if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "T")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "A") ) )
718 {
719 my $mutation = "C:G>T:A";
720 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations
721
722 # Pearson correlation
723 if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; }
724
725 # Sequence context - 6 mutation types - genomic strand
726 my $mutationSeqContext6mutType = "C>T";
727 # We want to express the mutation in C>
728 if( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "A") )
729 {
730 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
731 my $context_reverse = $base5."_".$base3;
732 if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; }
733 }
734 elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; }
735
736 # Strand analysis C>T on NonTr strand
737 if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "T"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "A"))) )
738 {
739 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; }
740
741 # C>T with the sequence context (C>T strand = +)
742 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "T")) )
743 {
744 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>T'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>T'}{'NonTr'}++; }
745 }
746 # C>T with the sequence context (G>A strand = -)
747 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "A")) )
748 {
749 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
750 my $context_reverse = $base5."_".$base3;
751 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>T'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>T'}{'NonTr'}++; }
752 }
753 }
754 # Strand analysis C>T on Tr strand
755 if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "T"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "A"))) )
756 {
757 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; }
758
759 # C>T with the sequence context (C>T strand = -)
760 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "T")) )
761 {
762 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>T'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>T'}{'Tr'}++; }
763 }
764 # C>T with the sequence context (G>A strand = +)
765 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "A")) )
766 {
767 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
768 my $context_reverse = $base5."_".$base3;
769 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>T'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>T'}{'Tr'}++; }
770 }
771 }
772 # WebLogo-3
773 if(($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "T"))
774 {
775 if(scalar(@tempContextSequence) == 2) { next; }
776 my ($contextTemp1, $contextTemp2) = ("", "");
777 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; }
778 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; }
779 my $context = $contextTemp1."C".$contextTemp2;
780 push(@{$refH_file->{$filename}{'WebLogo3'}{'CT'}}, $context);
781 }
782 else
783 {
784 if(scalar(@tempContextSequence) == 2) { next; }
785 my ($contextTemp1, $contextTemp2) = ("", "");
786 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); }
787 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); }
788 my $context = $contextTemp1."C".$contextTemp2; $context = reverse $context;
789 push(@{$refH_file->{$filename}{'WebLogo3'}{'CT'}}, $context);
790 }
791 }
792
793 ############################################################### MUTATION T> #############################################
794 ###################################### T:A>A:T
795 if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "A")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") ) )
796 {
797 my $mutation = "T:A>A:T";
798 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations
799
800 # Pearson correlation
801 if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; }
802
803 # Sequence context - 6 mutation types - genomic strand
804 my $mutationSeqContext6mutType = "T>A";
805 # We want to express the mutation in T>
806 if( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") )
807 {
808 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
809 my $context_reverse = $base5."_".$base3;
810 if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; }
811 }
812 elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; }
813
814 # Strand analysis T>A on NonTr stand
815 if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "A"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "T"))) )
816 {
817 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; }
818
819 # T>A with the sequence context (T>A strand = +)
820 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "A")) )
821 {
822 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>A'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>A'}{'NonTr'}++; }
823 }
824 # T>A with the sequence context (A>T strand = -)
825 else
826 {
827 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
828 my $context_reverse = $base5."_".$base3;
829 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>A'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>A'}{'NonTr'}++; }
830 }
831 }
832 # Strand analysis T>A on Tr strand
833 if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "A"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "T"))) )
834 {
835 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; }
836
837 # T>A <ith the sequence context (T>A strand = -)
838 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "A")) )
839 {
840 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>A'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>A'}{'Tr'}++; }
841 }
842 # T>A with the sequence context (A>T strand = +)
843 else
844 {
845 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
846 my $context_reverse = $base5."_".$base3;
847 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>A'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>A'}{'Tr'}++; }
848 }
849 }
850 # WebLogo-3
851 if(($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "A"))
852 {
853 if(scalar(@tempContextSequence) == 2) { next; }
854 my ($contextTemp1, $contextTemp2) = ("", "");
855 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; }
856 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; }
857 my $context = $contextTemp1."T".$contextTemp2;
858 push(@{$refH_file->{$filename}{'WebLogo3'}{'TA'}}, $context);
859 }
860 else
861 {
862 if(scalar(@tempContextSequence) == 2) { next; }
863 my ($contextTemp1, $contextTemp2) = ("", "");
864 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); }
865 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); }
866 my $context = $contextTemp1."T".$contextTemp2; $context = reverse $context;
867 push(@{$refH_file->{$filename}{'WebLogo3'}{'TA'}}, $context);
868 }
869 }
870 ###################################### T:A>C:G
871 if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "C")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "G")) )
872 {
873 my $mutation = "T:A>C:G";
874 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations
875
876 # Pearson correlation
877 if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; }
878
879 # Sequence context - 6 mutation types - genomic strand
880 my $mutationSeqContext6mutType = "T>C";
881 # We want to express the mutation in T>
882 if( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") )
883 {
884 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
885 my $context_reverse = $base5."_".$base3;
886 if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; }
887 }
888 elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; }
889
890 # Strand analysis T>C on NonTr strand
891 if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "C"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "G"))) )
892 {
893 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; }
894
895 # T>C (T>C strand = +)
896 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "C")) )
897 {
898 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>C'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>C'}{'NonTr'}++; }
899 }
900 # T>C (A>G strand = -)
901 else
902 {
903 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
904 my $context_reverse = $base5."_".$base3;
905 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>C'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>C'}{'NonTr'}++; }
906 }
907 }
908 # Strand analysis T>C on Tr strand
909 if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "C"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "G"))) )
910 {
911 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; }
912
913 # T>C (T>C strand = -)
914 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "C")) )
915 {
916 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>C'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>C'}{'Tr'}++; }
917 }
918 # T>C (A>G strand = +)
919 else
920 {
921 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
922 my $context_reverse = $base5."_".$base3;
923 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>C'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>C'}{'Tr'}++; }
924 }
925 }
926 # WebLogo-3
927 if(($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "C"))
928 {
929 if(scalar(@tempContextSequence) == 2) { next; }
930 my ($contextTemp1, $contextTemp2) = ("", "");
931 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; }
932 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; }
933 my $context = $contextTemp1."T".$contextTemp2; $context = reverse $context;
934 push(@{$refH_file->{$filename}{'WebLogo3'}{'TC'}}, $context);
935 }
936 else
937 {
938 if(scalar(@tempContextSequence) == 2) { next; }
939 my ($contextTemp1, $contextTemp2) = ("", "");
940 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); }
941 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); }
942 my $context = $contextTemp1."T".$contextTemp2;
943 push(@{$refH_file->{$filename}{'WebLogo3'}{'TC'}}, $context);
944 }
945 }
946 ###################################### T:A>G:C
947 if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "G")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "C")) )
948 {
949 my $mutation = "T:A>G:C";
950 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations
951
952 # Pearson correlation
953 if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; }
954
955 # Sequence context - 6 mutation types - genomic strand
956 my $mutationSeqContext6mutType = "T>G";
957 # We want to express the mutation in T>
958 if( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") )
959 {
960 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
961 my $context_reverse = $base5."_".$base3;
962 if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; }
963 }
964 elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; }
965
966 # Strand analysis T>G on NonTr strand
967 if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "G"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "C"))) )
968 {
969 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; }
970
971 # T>G (T>G strand = +)
972 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "G")) )
973 {
974 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>G'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>G'}{'NonTr'}++; }
975 }
976 # T>G (A>C strand = -)
977 else
978 {
979 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
980 my $context_reverse = $base5."_".$base3;
981 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>G'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>G'}{'NonTr'}++; }
982 }
983 }
984 # Strand analysis T>G on Tr strand
985 if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "G"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "C"))) )
986 {
987 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; }
988
989 # T>G (T>G strand = -)
990 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "G")) )
991 {
992 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>G'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>G'}{'Tr'}++; }
993 }
994 # T>G (A>C strand = +)
995 else
996 {
997 my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]);
998 my $context_reverse = $base5."_".$base3;
999 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>G'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>G'}{'Tr'}++; }
1000 }
1001 }
1002 # WebLogo-3
1003 if(($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "G"))
1004 {
1005 if(scalar(@tempContextSequence) == 2) { next; }
1006 my ($contextTemp1, $contextTemp2) = ("", "");
1007 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; }
1008 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; }
1009 my $context = $contextTemp1."T".$contextTemp2; $context = reverse $context;
1010 push(@{$refH_file->{$filename}{'WebLogo3'}{'TG'}}, $context);
1011 }
1012 else
1013 {
1014 if(scalar(@tempContextSequence) == 2) { next; }
1015 my ($contextTemp1, $contextTemp2) = ("", "");
1016 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); }
1017 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); }
1018 my $context = $contextTemp1."T".$contextTemp2;
1019 push(@{$refH_file->{$filename}{'WebLogo3'}{'TG'}}, $context);
1020 }
1021 }
1022 }
1023 close F1;
1024 }
1025 # Write the different statistics in the report
1026 sub WriteStatistics
1027 {
1028 my ($refH_file, $nb_func, $folderFigure, $folderChi2, $folderNMF) = @_;
1029
1030 # Save the different graphs in specific folder instead of in a general one.
1031 if(!-e "$folderFigure/Overall_mutation_distribution") { mkdir("$folderFigure/Overall_mutation_distribution") or die "Can't create the directory $folderFigure/Overall_mutation_distribution\n"; }
1032 if(!-e "$folderFigure/Impact_protein_sequence") { mkdir("$folderFigure/Impact_protein_sequence") or die "Can't create the directory $folderFigure/Impact_protein_sequence\n"; }
1033 if(!-e "$folderFigure/SBS_distribution") { mkdir("$folderFigure/SBS_distribution") or die "Can't create the directory $folderFigure/SBS_distribution\n"; }
1034 if(!-e "$folderFigure/Stranded_Analysis") { mkdir("$folderFigure/Stranded_Analysis") or die "Can't create the directory $folderFigure/Stranded_Analysis\n"; }
1035 if(!-e "$folderFigure/Trinucleotide_Sequence_Context") { mkdir("$folderFigure/Trinucleotide_Sequence_Context") or die "Can't create the directory $folderFigure/Trinucleotide_Sequence_Context\n"; }
1036 if(!-e "$folderFigure/Distribution_SBS_Per_Chromosomes") { mkdir("$folderFigure/Distribution_SBS_Per_Chromosomes") or die "Can't create the directory $folderFigure/Distribution_SBS_Per_Chromosomes\n"; }
1037
1038
1039 # Create a workbook with all the samples
1040 my $wb = ""; my $ws_sum = ""; my %h_chi2 = ();
1041 my ($ws_inputNMF_count, $ws_inputNMF_percent) = ("", "");
1042 ############### Define the format
1043 my ($format_A10, $format_A10Boldleft, $format_A10ItalicRed) = ("", "", "");
1044 my ($formatT_left, $formatT_right, $formatT_bottomRight, $formatT_bottomLeft, $formatT_bottom, $formatT_bottomHeader, $formatT_bottomRightHeader, $formatT_bottomHeader2, $formatT_rightHeader);
1045 my ($formatT_graphTitle);
1046 my ($table_topleft, $table_topRight, $table_bottomleft, $table_bottomRight, $table_top, $table_right, $table_bottom, $table_bottomItalicRed, $table_left, $table_bottomrightHeader, $table_left2, $table_middleHeader, $table_middleHeader2);
1047
1048 if($oneReportPerSample == 2)
1049 {
1050 $wb = Spreadsheet::WriteExcel->new("$output/Report_Mutation_Spectra.xls");
1051
1052 ############### Define the format
1053 Format_A10($wb, \$format_A10); # Text center in Arial 10
1054 Format_A10BoldLeft($wb, \$format_A10Boldleft); # Text on the left in Arial 10 bold
1055 Format_TextSection($wb, \$formatT_left, \$formatT_right, \$formatT_bottomRight, \$formatT_bottomLeft, \$formatT_bottom, \$formatT_bottomHeader, \$formatT_bottomRightHeader, \$formatT_bottomHeader2, \$formatT_rightHeader);
1056 Format_GraphTitle($wb, \$formatT_graphTitle);
1057 Format_Table($wb, \$table_topleft, \$table_topRight, \$table_bottomleft, \$table_bottomRight, \$table_top, \$table_right, \$table_bottom, \$table_bottomItalicRed, \$table_left, \$table_bottomrightHeader, \$table_left2, \$table_middleHeader, \$table_middleHeader2);
1058 Format_A10ItalicRed($wb, \$format_A10ItalicRed);
1059
1060
1061 ############### Worksheet with a summary of the samples
1062 $ws_sum = $wb->add_worksheet("Sample_List");
1063 $ws_sum->write(0, 0, "Samples", $format_A10); $ws_sum->write(0, 1, "Total number SBS", $format_A10); $ws_sum->write(0, 2, "Total number of Indel", $format_A10); $ws_sum->write(0, 3, "Total number of mutations", $format_A10);
1064 $ws_sum->set_column(0,0, 50); $ws_sum->set_column(1,1, 20); $ws_sum->set_column(2,2, 20); $ws_sum->set_column(3,3, 22);
1065
1066 ############### Save the chi2 values into a hash table
1067 if(-e "$folderChi2/Output_chi2_strandBias.txt")
1068 {
1069 open(F1, "$folderChi2/Output_chi2_strandBias.txt") or die "$!: $folderChi2/Output_chi2_strandBias.txt\n";
1070 my $header = <F1>; # Strand_Bias($tab[0]) NonTr-Tr($tab[1]) Proportion($tab[2]) P-val-Chi2($tab[3]) FDR($tab[4]) Confidence Interval($tab[5]) Mutation_Type($tab[6]) SampleName($tab[7])
1071 while(<F1>)
1072 {
1073 $_ =~ s/[\r\n]+$//;
1074 my @tab = split("\t", $_);
1075
1076 $h_chi2{$tab[7]}{$tab[6]}{'p-value'} = $tab[3]; $h_chi2{$tab[7]}{$tab[6]}{'ConfInt'} = $tab[5];
1077
1078 # For the pool data the FDR isn't calculated so replace the NA (=Missing values) by "-"
1079 if($tab[7] eq "Pool_Data") { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = "-"; }
1080 else { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = $tab[4]; }
1081 }
1082 close F1;
1083 }
1084 ############### Write the input matrix for NMF for the count and the un-normalized frequency
1085 $ws_inputNMF_count = $wb->add_worksheet("Input_NMF_Count");
1086 $ws_inputNMF_percent = $wb->add_worksheet("Input_NMF_Percent");
1087 }
1088
1089
1090 ################################################ Set the Rows and columns of the different part of the report ################################################
1091 my $row_SumSheet = 1; # First row for the summary sheet of the report
1092 my $rowStart_SBSdistrBySeg = 48; my $colStart_SBSdistrBySeg = 0; # For the table SBS distribution by segment
1093 my $colStart_matrixSeqContext = 19; # Sequence context
1094 my $col_inputNMF = 0; # Write the names of the samples with at least 33 SBS
1095
1096 # For NMF input
1097 my %h_inputNMF = ();
1098
1099 ## For each file
1100 foreach my $k_file (sort keys $refH_file)
1101 {
1102 print "File in process: $k_file\n";
1103 if($k_file ne "Pool_Data") { $col_inputNMF++; }
1104
1105 # Create one workbook for each sample
1106 if($oneReportPerSample == 1)
1107 {
1108 $wb = Spreadsheet::WriteExcel->new("$output/Report_Mutation_Spectra-$k_file.xls");
1109
1110 ############### Define the format
1111 Format_A10($wb, \$format_A10); # Text center in Arial 10
1112 Format_A10BoldLeft($wb, \$format_A10Boldleft); # Text on the left in Arial 10 bold
1113 Format_TextSection($wb, \$formatT_left, \$formatT_right, \$formatT_bottomRight, \$formatT_bottomLeft, \$formatT_bottom, \$formatT_bottomHeader, \$formatT_bottomRightHeader, \$formatT_bottomHeader2, \$formatT_rightHeader);
1114 Format_GraphTitle($wb, \$formatT_graphTitle);
1115 Format_Table($wb, \$table_topleft, \$table_topRight, \$table_bottomleft, \$table_bottomRight, \$table_top, \$table_right, \$table_bottom, \$table_bottomItalicRed, \$table_left, \$table_bottomrightHeader, \$table_left2, \$table_middleHeader, \$table_middleHeader2);
1116 Format_A10ItalicRed($wb, \$format_A10ItalicRed);
1117
1118
1119 ############### Worksheet with a summary of the samples
1120 $ws_sum = $wb->add_worksheet("Sample_List");
1121 $ws_sum->write(0, 0, "Samples", $format_A10); $ws_sum->write(0, 1, "Total number SBS", $format_A10); $ws_sum->write(0, 2, "Total number of Indel", $format_A10); $ws_sum->write(0, 3, "Total number of mutations", $format_A10);
1122 $ws_sum->set_column(0,0, 50); $ws_sum->set_column(1,1, 20); $ws_sum->set_column(2,2, 20); $ws_sum->set_column(3,3, 22);
1123 # Write in the Samples sheet the name and the total number of SBS
1124 $ws_sum->write(1, 0, "$k_file", $format_A10);
1125 $ws_sum->write(1, 1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $format_A10); $ws_sum->write(1, 2, $refH_file->{$k_file}{'TotalIndelGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 3, $refH_file->{$k_file}{'TotalMutGenomic'}, $format_A10);
1126
1127 ############### Save the chi2 values into a hash table
1128 if(-e "$folderChi2/Output_chi2_strandBias.txt")
1129 {
1130 open(F1, "$folderChi2/Output_chi2_strandBias.txt") or die "$!: $folderChi2/Output_chi2_strandBias.txt\n";
1131 my $header = <F1>; # Strand_Bias($tab[0]) NonTr-Tr($tab[1]) Proportion($tab[2]) P-val-Chi2($tab[3]) FDR($tab[4]) Confidence Interval($tab[5]) Mutation_Type($tab[6]) SampleName($tab[7])
1132 while(<F1>)
1133 {
1134 $_ =~ s/[\r\n]+$//;
1135 my @tab = split("\t", $_);
1136
1137 if($tab[7] eq $k_file)
1138 {
1139 $h_chi2{$tab[7]}{$tab[6]}{'p-value'} = $tab[3]; $h_chi2{$tab[7]}{$tab[6]}{'ConfInt'} = $tab[5];
1140
1141 # For the pool data the FDR isn't calculated so replace the NA (=Missing values) by "-"
1142 if($tab[7] eq "Pool_Data") { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = "-"; }
1143 else { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = $tab[4]; }
1144 }
1145 }
1146 close F1;
1147 }
1148
1149 ############### Write the input matrix for NMF
1150 if($k_file ne "Pool_Data")
1151 {
1152 # For NMF don't consider the pool of the samples
1153 $ws_inputNMF_count = $wb->add_worksheet("Input_NMF_Count");
1154 $ws_inputNMF_percent = $wb->add_worksheet("Input_NMF_Percent");
1155 # Write in the input NMF sheet the name of the samples
1156 $ws_inputNMF_count->write(0, 1, $k_file);
1157 $ws_inputNMF_percent->write(0, 1, $k_file);
1158 }
1159 }
1160 # One workbook with all the samples
1161 else
1162 {
1163 # Write in the Samples sheet the name and the total number of SBS
1164 $ws_sum->write($row_SumSheet, 0, $k_file, $format_A10);
1165 $ws_sum->write($row_SumSheet, 1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 2, $refH_file->{$k_file}{'TotalIndelGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 3, $refH_file->{$k_file}{'TotalMutGenomic'}, $format_A10);
1166
1167 # For NMF don't consider the pool of the samples
1168 if($k_file ne "Pool_Data")
1169 {
1170 # Write in the input NMF sheet the name of the samples
1171 $ws_inputNMF_count->write(0, $col_inputNMF, $k_file);
1172 $ws_inputNMF_percent->write(0, $col_inputNMF, $k_file);
1173 }
1174 }
1175
1176 # Count of SBS per chromosome
1177 PearsonCoefficient($refH_file, $k_file);
1178
1179 # Add a worksheet to the workbook
1180 my $ws = $wb->add_worksheet($k_file);
1181
1182 # Write the titles of the different sections of the report
1183 WriteBoderSection($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext);
1184
1185 # Write the mutation types (6 types)
1186 WriteHeaderSection($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext);
1187
1188
1189 # Save the figures of each samples in a different folder
1190 if(!-e "$folderFigure/Overall_mutation_distribution/$k_file") { mkdir("$folderFigure/Overall_mutation_distribution/$k_file") or die "Can't create the directory $folderFigure/Overall_mutation_distribution/$k_file\n"; }
1191 if(!-e "$folderFigure/Impact_protein_sequence/$k_file") { mkdir("$folderFigure/Impact_protein_sequence/$k_file") or die "Can't create the directory $folderFigure/Impact_protein_sequence/$k_file\n"; }
1192 if(!-e "$folderFigure/SBS_distribution/$k_file") { mkdir("$folderFigure/SBS_distribution/$k_file") or die "Can't create the directory $folderFigure/SBS_distribution\n"; }
1193 if(!-e "$folderFigure/Stranded_Analysis/$k_file") { mkdir("$folderFigure/Stranded_Analysis/$k_file") or die "Can't create the directory $folderFigure/Stranded_Analysis/$k_file\n"; }
1194 if(!-e "$folderFigure/Trinucleotide_Sequence_Context/$k_file") { mkdir("$folderFigure/Trinucleotide_Sequence_Context/$k_file") or die "Can't create the directory $folderFigure/Trinucleotide_Sequence_Context/$k_file\n"; }
1195
1196
1197
1198 ###########################################################################################################################################################
1199 ################################################################# Write the statistics ###################################################################
1200 ###########################################################################################################################################################
1201 my ($ca_genomique, $cg_genomique, $ct_genomique, $ta_genomique, $tc_genomique, $tg_genomique) = (0,0,0,0,0,0);
1202 my ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0,0,0,0, 0,0,0,0,0,0);
1203
1204 my $row_SBSdistrBySeg = $rowStart_SBSdistrBySeg+4;
1205 my $row_SBSDistrBySegAndFunc_CA = $rowStart_SBSdistrBySeg+$nb_func+12;
1206 my $row_SBSDistrBySegAndFunc_CG = $rowStart_SBSdistrBySeg+($nb_func*2)+16; my $rowEndCG_SBSDistrBySegAndFunc_CG = $row_SBSDistrBySegAndFunc_CG+$nb_func;
1207 my $row_SBSDistrBySegAndFunc_CT = $rowStart_SBSdistrBySeg+($nb_func*3)+20;
1208
1209 ## 6 mutation types by segment
1210 foreach my $k_func (sort keys $refH_file->{$k_file}{'6mutType'})
1211 {
1212 my $totalSBS_bySegment = 0;
1213
1214 # Write the functional region for the section SBS distribution by segment
1215 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg, $k_func, $formatT_left);
1216
1217 # Write the exonic func for the section strand bias by segment
1218 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg, $k_func, $formatT_left);
1219
1220 if($row_SBSDistrBySegAndFunc_CG == $rowEndCG_SBSDistrBySegAndFunc_CG)
1221 {
1222 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg, $k_func, $formatT_bottomLeft);
1223 }
1224 else
1225 {
1226 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg, $k_func, $formatT_left);
1227 }
1228
1229 foreach my $k_mutation (sort keys $refH_file->{$k_file}{'6mutType'}{$k_func})
1230 {
1231 if($k_mutation eq "C:G>A:T")
1232 {
1233 # Write the ratio NonTr(CA)/Tr(GT)
1234 my $ratioSB = 0;
1235 if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; }
1236 else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; }
1237 $ratioSB = sprintf("%.2f", $ratioSB);
1238 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+1, $ratioSB, $format_A10);
1239
1240 # Write the count of SBS in the NonTr and Tr strand
1241 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10);
1242 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10);
1243
1244 # Calculate the total number of SBS per mut type (genomic strand)
1245 $ca_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
1246 # Calculate the total number of SBS by NonTr / Tr strand
1247 $ca_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $ca_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
1248
1249 # Write the count by exonic region
1250 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10);
1251 }
1252 if($k_mutation eq "C:G>G:C")
1253 {
1254 # Write the ratio NonTr(CG)/Tr(GC)
1255 my $ratioSB = 0;
1256 if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; }
1257 else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; }
1258 $ratioSB = sprintf("%.2f", $ratioSB);
1259 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+5, $ratioSB, $format_A10);
1260
1261 # Write the count of SBS in the NonTr and Tr strand
1262 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+6, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10);
1263 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10);
1264
1265 # Calculate the total number of SBS per mut type (genomic strand)
1266 $cg_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
1267 # Calculate the total number of SBS by NonTr / Tr strand
1268 $cg_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $cg_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
1269
1270 # Write the count by exonic region
1271 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+5, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10);
1272 }
1273 if($k_mutation eq "C:G>T:A")
1274 {
1275 # Write the ratio NonTr(CT)/Tr(GA)
1276 my $ratioSB = 0;
1277 if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; }
1278 else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; }
1279 $ratioSB = sprintf("%.2f", $ratioSB);
1280 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+9, $ratioSB, $format_A10);
1281
1282 # Write the count of SBS in the NonTr and Tr strand
1283 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+10, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10);
1284 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+11, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_right);
1285
1286 # Calculate the total number of SBS per mut type (genomic strand)
1287 $ct_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
1288 # Calculate the total number of SBS by NonTr / Tr strand
1289 $ct_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $ct_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
1290
1291 # Write the count by exonic region
1292 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10);
1293 }
1294 if($k_mutation eq "T:A>A:T")
1295 {
1296 # Write the ratio NonTr(AT)/Tr(TA)
1297 my $ratioSB = 0;
1298 if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; }
1299 else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; }
1300 $ratioSB = sprintf("%.2f", $ratioSB);
1301
1302
1303 if($row_SBSDistrBySegAndFunc_CG == $rowEndCG_SBSDistrBySegAndFunc_CG)
1304 {
1305 # Write the ratio NonTr(AC)/Tr(TG)
1306 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+1, $ratioSB, $formatT_bottom);
1307 # Write the count of SBS in the NonTr and Tr strand
1308 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $formatT_bottom);
1309 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottom);
1310 }
1311 else
1312 {
1313 # Write the ratio NonTr(AC)/Tr(TG)
1314 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+1, $ratioSB, $format_A10);
1315 # Write the count of SBS in the NonTr and Tr strand
1316 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10);
1317 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10);
1318 }
1319
1320
1321 # Calculate the total number of SBS per mut type (genomic strand)
1322 $ta_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
1323 # Calculate the total number of SBS by NonTr / Tr strand
1324 $ta_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $ta_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
1325
1326 # Write the count by exonic region
1327 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+9, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10);
1328 }
1329 if($k_mutation eq "T:A>C:G")
1330 {
1331 # Write the ratio NonTr(AG)/Tr(TC)
1332 my $ratioSB = 0;
1333 if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; }
1334 else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; }
1335 $ratioSB = sprintf("%.2f", $ratioSB);
1336
1337 if($row_SBSDistrBySegAndFunc_CG == $rowEndCG_SBSDistrBySegAndFunc_CG)
1338 {
1339 # Write the ratio NonTr(AC)/Tr(TG)
1340 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+5, $ratioSB, $formatT_bottom);
1341 # Write the count of SBS in the NonTr and Tr strand
1342 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+6, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $formatT_bottom);
1343 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottom);
1344 }
1345 else
1346 {
1347 # Write the ratio NonTr(AC)/Tr(TG)
1348 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+5, $ratioSB, $format_A10);
1349 # Write the count of SBS in the NonTr and Tr strand
1350 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+6, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10);
1351 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10);
1352 }
1353
1354 # Calculate the total number of SBS per mut type (genomic strand)
1355 $tc_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
1356 # Calculate the total number of SBS by NonTr / Tr strand
1357 $tc_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $tc_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
1358
1359 # Write the count by exonic region
1360 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+11, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10);
1361 }
1362 if($k_mutation eq "T:A>G:C")
1363 {
1364 # Calculate the ratio for the strand bias
1365 my $ratioSB = 0;
1366 if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; }
1367 else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; }
1368 $ratioSB = sprintf("%.2f", $ratioSB);
1369
1370 if($row_SBSDistrBySegAndFunc_CG == $rowEndCG_SBSDistrBySegAndFunc_CG)
1371 {
1372 # Write the ratio NonTr(AC)/Tr(TG)
1373 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+9, $ratioSB, $formatT_bottom);
1374 # Write the count of SBS in the NonTr and Tr strand
1375 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+10, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $formatT_bottom);
1376 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+11, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottomRight);
1377 }
1378 else
1379 {
1380 # Write the ratio NonTr(AC)/Tr(TG)
1381 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+9, $ratioSB, $format_A10);
1382 # Write the count of SBS in the NonTr and Tr strand
1383 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+10, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10);
1384 $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+11, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_right);
1385 }
1386
1387 # Calculate the total number of SBS per mut type (genomic strand)
1388 $tg_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
1389 # Calculate the total number of SBS by NonTr / Tr strand
1390 $tg_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $tg_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
1391
1392 # Write the count by exonic region
1393 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $formatT_right);
1394 }
1395
1396 # Calculate the total number of SBS on the genomic strand for each mutation types by exonic region
1397 $totalSBS_bySegment += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
1398 } # End $k_mutation
1399
1400 $row_SBSDistrBySegAndFunc_CA++; $row_SBSDistrBySegAndFunc_CG++; $row_SBSDistrBySegAndFunc_CT++;
1401
1402 # Write the percent by exonic region
1403 my $percent_ca = 0;
1404 if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_ca = 0; }
1405 else { $percent_ca = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_ca = sprintf("%.2f", $percent_ca); }
1406 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+2, $percent_ca, $format_A10);
1407 my $percent_cg = 0;
1408 if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_cg = 0; }
1409 else { $percent_cg = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>G:C'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_cg = sprintf("%.2f", $percent_cg); }
1410 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+4, $percent_cg, $format_A10);
1411 my $percent_ct = 0;
1412 if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_ct = 0; }
1413 else { $percent_ct = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>T:A'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_ct = sprintf("%.2f", $percent_ct); }
1414 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+6, $percent_ct, $format_A10);
1415 my $percent_ta = 0;
1416 if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_ta = 0; }
1417 else { $percent_ta = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'T:A>A:T'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_ta = sprintf("%.2f", $percent_ta); }
1418 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+8, $percent_ta, $format_A10);
1419 my $percent_tc = 0;
1420 if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_tc = 0; }
1421 else { $percent_tc = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'T:A>C:G'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_tc = sprintf("%.2f", $percent_tc); }
1422 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+10, $percent_tc, $format_A10);
1423 my $percent_tg = 0;
1424 if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_tg = 0; }
1425 else { $percent_tg = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'T:A>G:C'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_tg = sprintf("%.2f", $percent_tg); }
1426 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+12, $percent_tg, $format_A10);
1427
1428 # Write the count of SBS by segment
1429 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+1, $totalSBS_bySegment, $format_A10);
1430
1431 $row_SBSdistrBySeg++;
1432 } # End $k_func
1433
1434 # Write the total number of SBS on the genomic strand
1435 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $formatT_bottomHeader);
1436
1437 # Write the total and the percentage of SBS for each mutation types and save it to a text file
1438 open(DISTRSBS, ">", "$folderFigure/SBS_distribution/$k_file/$k_file-SBS_distribution.txt") or die "$!: $folderFigure/SBS_distribution/$k_file/$k_file-SBS_distribution.txt\n";
1439 print DISTRSBS "Mutation_Type\tCount\tPercentage\tSample\n";
1440 my $percent_ca = 0;
1441 if($ca_genomique == 0) { $percent_ca = 0; }
1442 else { $percent_ca = ($ca_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ca = sprintf("%.2f", $percent_ca); }
1443 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+2, $percent_ca, $formatT_bottom); print DISTRSBS "C:G>A:T\t$ca_genomique\t$percent_ca\t$k_file\n";
1444 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+3, $ca_genomique, $formatT_bottomHeader);
1445 my $percent_cg = 0;
1446 if($cg_genomique == 0) { $percent_cg = 0; }
1447 else { $percent_cg = ($cg_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_cg = sprintf("%.2f", $percent_cg); }
1448 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+4, $percent_cg, $formatT_bottom); print DISTRSBS "C:G>G:C\t$cg_genomique\t$percent_cg\t$k_file\n";
1449 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+5, $cg_genomique, $formatT_bottomHeader);
1450 my $percent_ct = 0;
1451 if($ct_genomique == 0) { $percent_ct = 0; }
1452 else { $percent_ct = ($ct_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ct = sprintf("%.2f", $percent_ct); }
1453 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+6, $percent_ct, $formatT_bottom); print DISTRSBS "C:G>T:A\t$ct_genomique\t$percent_ct\t$k_file\n";
1454 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+7, $ct_genomique, $formatT_bottomHeader);
1455 my $percent_ta = 0;
1456 if($ta_genomique == 0) { $percent_ta = 0; }
1457 else { $percent_ta = ($ta_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ta = sprintf("%.2f", $percent_ta); }
1458 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+8, $percent_ta, $formatT_bottom); print DISTRSBS "T:A>A:T\t$ta_genomique\t$percent_ta\t$k_file\n";
1459 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+9, $ta_genomique, $formatT_bottomHeader);
1460 my $percent_tc = 0;
1461 if($tc_genomique == 0) { $percent_tc = 0; }
1462 else { $percent_tc = ($tc_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_tc = sprintf("%.2f", $percent_tc); }
1463 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+10, $percent_tc, $formatT_bottom); print DISTRSBS "T:A>C:G\t$tc_genomique\t$percent_tc\t$k_file\n";
1464 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+11, $tc_genomique, $formatT_bottomHeader);
1465 my $percent_tg = 0;
1466 if($tg_genomique == 0) { $percent_tg = 0; }
1467 else { $percent_tg = ($tg_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_tg = sprintf("%.2f", $percent_tg); }
1468 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+12, $percent_tg, $formatT_bottom); print DISTRSBS "T:A>G:C\t$tg_genomique\t$percent_tg\t$k_file\n";
1469 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $tg_genomique, $formatT_bottomRightHeader);
1470 close DISTRSBS;
1471
1472 ###########################################################################################################################################################
1473 ################################################################### Write Strand BIAS #####################################################################
1474 ###########################################################################################################################################################
1475 # Write the SB for each mutation type (table 3)
1476 $ws->write(28, 11, "Table 3. Significance of the strand biases", $format_A10Boldleft);
1477 $ws->set_column(11, 11, 13); $ws->set_column(16, 16, 15); $ws->set_column(17, 17, 10);
1478 $ws->write(29, 11, "Mutation Type", $table_topleft); $ws->write(29, 12, "Non-Tr/Tr", $table_top); $ws->write(29, 13, "Non-Tr", $table_top); $ws->write(29, 14, "Tr", $table_top); $ws->write(29, 15, "P-value", $table_top); $ws->write(29, 16, "FDR q value", $table_top); $ws->write(29, 17, "95% CI", $table_topRight);
1479
1480 $ws->write(39, 11, "Table 3. Significance of the strand biases", $format_A10Boldleft);
1481 $ws->write(40, 11, "Mutation Type", $table_topleft); $ws->write(40, 12, "Non-Tr/Tr", $table_top); $ws->write(40, 13, "Non-Tr", $table_top); $ws->write(40, 14, "Tr", $table_top); $ws->write(40, 15, "P-value", $table_top); $ws->write(40, 16, "FDR q value", $table_top); $ws->write(40, 17, "95% CI", $table_topRight);
1482
1483 # For ggplot2
1484 open(SB, ">", "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandBias.txt") or die "$!: $folderFigure/Stranded_Analysis/$k_file/$k_file-StrandBias.txt\n";
1485 print SB "Alteration\tStrand\tCount\n";
1486
1487
1488 #-----------------------------------------------------------------------------------------------------#
1489 my ($ratio_ca, $ratio_gt, $percent_ca_NonTr, $percent_ca_Tr) = (0, 0, 0, 0, 0);
1490 if( ($ca_NonTr==0) || ($ca_Tr==0) ) { $ratio_ca = 0; $ratio_gt = 0; $percent_ca_NonTr = 0; $percent_ca_Tr = 0; }
1491 else
1492 {
1493 $ratio_ca = $ca_NonTr/$ca_Tr; $ratio_ca = sprintf("%.2f", $ratio_ca);
1494 $ratio_gt = $ca_Tr/$ca_NonTr; $ratio_gt = sprintf("%.2f", $ratio_gt);
1495 $percent_ca_NonTr = ($ca_NonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ca_Tr = ($ca_Tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
1496 }
1497 print SB "C>A\tNonTranscribed\t$ca_NonTr\n", "C>A\tTranscribed\t$ca_Tr\n";
1498 # C>A
1499 $ws->write(30, 11, "C>A", $table_left); $ws->write(30, 12, $ratio_ca, $table_middleHeader); $ws->write(30, 13, $ca_NonTr, $format_A10); $ws->write(30, 14, $ca_Tr, $format_A10);
1500 # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10
1501 if(($ca_NonTr+$ca_Tr)< 10)
1502 {
1503 if($h_chi2{$k_file}{'C>A'}{'p-value'} eq "NA") { $ws->write_string(30, 15, $h_chi2{$k_file}{'C>A'}{'p-value'}, $format_A10); }
1504 else { $ws->write_string(30, 15, $h_chi2{$k_file}{'C>A'}{'p-value'}, $format_A10ItalicRed); }
1505 }
1506 else { $ws->write_string(30, 15, $h_chi2{$k_file}{'C>A'}{'p-value'}, $format_A10); }
1507 $ws->write(30, 16, $h_chi2{$k_file}{'C>A'}{'FDR'}, $format_A10); $ws->write(30, 17, $h_chi2{$k_file}{'C>A'}{'ConfInt'}, $table_right);
1508 # G>T
1509 $ws->write(41, 11, "G>T", $table_left); $ws->write(41, 12, $ratio_gt, $table_middleHeader); $ws->write(41, 13, $ca_Tr, $format_A10); $ws->write(41, 14, $ca_NonTr, $format_A10);
1510 if(($ca_NonTr+$ca_Tr)< 10)
1511 {
1512 if($h_chi2{$k_file}{'C>A'}{'p-value'} eq "NA") { $ws->write_string(41, 15, $h_chi2{$k_file}{'C>A'}{'p-value'}, $format_A10); }
1513 else { $ws->write_string(41, 15, $h_chi2{$k_file}{'C>A'}{'p-value'}, $format_A10ItalicRed); }
1514 }
1515 else { $ws->write_string(41, 15, $h_chi2{$k_file}{'C>A'}{'p-value'}, $format_A10); }
1516 $ws->write(41, 16, $h_chi2{$k_file}{'C>A'}{'FDR'}, $format_A10); $ws->write(41, 17, $h_chi2{$k_file}{'C>A'}{'ConfInt'}, $table_right);
1517
1518 #-----------------------------------------------------------------------------------------------------#
1519 my ($ratio_cg, $ratio_gc, $percent_cg_NonTr, $percent_cg_Tr) = (0, 0, 0, 0, 0);
1520 if( ($cg_NonTr==0) || ($cg_Tr==0) ) { $ratio_cg = 0; $ratio_gc = 0; $percent_cg_NonTr = 0; $percent_cg_Tr = 0; }
1521 else
1522 {
1523 $ratio_cg = $cg_NonTr/$cg_Tr; $ratio_cg = sprintf("%.2f", $ratio_cg);
1524 $ratio_gc = $cg_Tr/$cg_NonTr; $ratio_gc = sprintf("%.2f", $ratio_gc);
1525 $percent_cg_NonTr = ($cg_NonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_cg_Tr = ($cg_Tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
1526 }
1527 print SB "C>G\tNonTranscribed\t$cg_NonTr\n", "C>G\tTranscribed\t$cg_Tr\n";
1528 # C>G
1529 $ws->write(31, 11, "C>G", $table_left); $ws->write(31, 12, $ratio_cg, $table_middleHeader); $ws->write(31, 13, $cg_NonTr, $format_A10); $ws->write(31, 14, $cg_Tr, $format_A10);
1530 # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10
1531 if(($cg_NonTr+$cg_Tr)< 10)
1532 {
1533 if($h_chi2{$k_file}{'C>G'}{'p-value'} eq "NA") { $ws->write_string(31, 15, $h_chi2{$k_file}{'C>G'}{'p-value'}, $format_A10); }
1534 else { $ws->write_string(31, 15, $h_chi2{$k_file}{'C>G'}{'p-value'}, $format_A10ItalicRed); }
1535 }
1536 else { $ws->write_string(31, 15, $h_chi2{$k_file}{'C>G'}{'p-value'}, $format_A10); }
1537 $ws->write(31, 16, $h_chi2{$k_file}{'C>G'}{'FDR'}, $format_A10); $ws->write(31, 17, $h_chi2{$k_file}{'C>G'}{'ConfInt'}, $table_right);
1538 # G>C
1539 $ws->write(42, 11, "G>C", $table_left); $ws->write(42, 12, $ratio_gc, $table_middleHeader); $ws->write(42, 13, $cg_Tr, $format_A10); $ws->write(42, 14, $cg_NonTr, $format_A10);
1540 if(($cg_NonTr+$cg_Tr)< 10)
1541 {
1542 if($h_chi2{$k_file}{'C>G'}{'p-value'} eq "NA") { $ws->write_string(42, 15, $h_chi2{$k_file}{'C>G'}{'p-value'}, $format_A10); }
1543 else { $ws->write_string(42, 15, $h_chi2{$k_file}{'C>G'}{'p-value'}, $format_A10ItalicRed); }
1544 }
1545 else { $ws->write_string(42, 15, $h_chi2{$k_file}{'C>G'}{'p-value'}, $format_A10); }
1546 $ws->write(42, 16, $h_chi2{$k_file}{'C>G'}{'FDR'}, $format_A10); $ws->write(42, 17, $h_chi2{$k_file}{'C>G'}{'ConfInt'}, $table_right);
1547
1548 #-----------------------------------------------------------------------------------------------------#
1549 my ($ratio_ct, $ratio_ga, $percent_ct_NonTr, $percent_ct_Tr) = (0, 0, 0, 0, 0);
1550 if( ($ct_NonTr==0) || ($ct_Tr==0) ) { $ratio_ct = 0; $ratio_ga = 0; $percent_ct_NonTr = 0; $percent_ct_Tr = 0; }
1551 else
1552 {
1553 $ratio_ct = $ct_NonTr/$ct_Tr; $ratio_ct = sprintf("%.2f", $ratio_ct);
1554 $ratio_ga = $ct_Tr/$ct_NonTr; $ratio_ga = sprintf("%.2f", $ratio_ga);
1555 $percent_ct_NonTr = ($ct_NonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ct_Tr = ($ct_Tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
1556 }
1557 print SB "C>T\tNonTranscribed\t$ct_NonTr\n", "C>T\tTranscribed\t$ct_Tr\n";
1558 # C>T
1559 $ws->write(32, 11, "C>T", $table_left); $ws->write(32, 12, $ratio_ct, $table_middleHeader); $ws->write(32, 13, $ct_NonTr, $format_A10); $ws->write(32, 14, $ct_Tr, $format_A10);
1560 # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10
1561 if(($ct_NonTr+$ct_Tr)< 10)
1562 {
1563 if($h_chi2{$k_file}{'C>T'}{'p-value'} eq "NA") { $ws->write_string(32, 15, $h_chi2{$k_file}{'C>T'}{'p-value'}, $format_A10); }
1564 else { $ws->write_string(32, 15, $h_chi2{$k_file}{'C>T'}{'p-value'}, $format_A10ItalicRed); }
1565 }
1566 else { $ws->write_string(32, 15, $h_chi2{$k_file}{'C>T'}{'p-value'}, $format_A10); }
1567 $ws->write(32, 16, $h_chi2{$k_file}{'C>T'}{'FDR'}, $format_A10); $ws->write(32, 17, $h_chi2{$k_file}{'C>T'}{'ConfInt'}, $table_right);
1568 # G>A
1569 $ws->write(43, 11, "G>A", $table_left); $ws->write(43, 12, $ratio_ga, $table_middleHeader); $ws->write(43, 13, $ct_Tr, $format_A10); $ws->write(43, 14, $ct_NonTr, $format_A10);
1570 if(($ct_NonTr+$ct_Tr)< 10)
1571 {
1572 if($h_chi2{$k_file}{'C>T'}{'p-value'} eq "NA") { $ws->write_string(43, 15, $h_chi2{$k_file}{'C>T'}{'p-value'}, $format_A10); }
1573 else { $ws->write_string(43, 15, $h_chi2{$k_file}{'C>T'}{'p-value'}, $format_A10ItalicRed); }
1574 }
1575 else { $ws->write_string(43, 15, $h_chi2{$k_file}{'C>T'}{'p-value'}, $format_A10); }
1576 $ws->write(43, 16, $h_chi2{$k_file}{'C>T'}{'FDR'}, $format_A10); $ws->write(43, 17, $h_chi2{$k_file}{'C>T'}{'ConfInt'}, $table_right);
1577
1578 #-----------------------------------------------------------------------------------------------------#
1579 my ($ratio_ta, $ratio_at, $percent_ta_NonTr, $percent_ta_Tr) = (0, 0, 0, 0, 0);
1580 if( ($ta_NonTr==0) || ($ta_Tr==0) ) { $ratio_ta = 0; $ratio_at = 0; $percent_ta_NonTr = 0; $percent_ta_Tr = 0; }
1581 else
1582 {
1583 $ratio_ta = $ta_NonTr/$ta_Tr; $ratio_ta = sprintf("%.2f", $ratio_ta);
1584 $ratio_at = $ta_Tr/$ta_NonTr; $ratio_at = sprintf("%.2f", $ratio_at);
1585 $percent_ta_NonTr = ($ta_NonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ta_Tr = ($ta_Tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
1586 }
1587 print SB "T>A\tNonTranscribed\t$ta_NonTr\n", "T>A\tTranscribed\t$ta_Tr\n";
1588 # T>A
1589 $ws->write(33, 11, "T>A", $table_left); $ws->write(33, 12, $ratio_ta, $table_middleHeader); $ws->write(33, 13, $ta_NonTr, $format_A10); $ws->write(33, 14, $ta_Tr, $format_A10);
1590 # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10
1591 if(($ta_NonTr+$ta_Tr)< 10)
1592 {
1593 if($h_chi2{$k_file}{'T>A'}{'p-value'} eq "NA") { $ws->write_string(33, 15, $h_chi2{$k_file}{'T>A'}{'p-value'}, $format_A10); }
1594 else { $ws->write_string(33, 15, $h_chi2{$k_file}{'T>A'}{'p-value'}, $format_A10ItalicRed); }
1595 }
1596 else { $ws->write_string(33, 15, $h_chi2{$k_file}{'T>A'}{'p-value'}, $format_A10); }
1597 $ws->write(33, 16, $h_chi2{$k_file}{'T>A'}{'FDR'}, $format_A10); $ws->write(33, 17, $h_chi2{$k_file}{'T>A'}{'ConfInt'}, $table_right);
1598 # A>T
1599 $ws->write(44, 11, "A>T", $table_left); $ws->write(44, 12, $ratio_at, $table_middleHeader); $ws->write(44, 13, $ta_Tr, $format_A10); $ws->write(44, 14, $ta_NonTr, $format_A10);
1600 if(($ta_NonTr+$ta_Tr)< 10)
1601 {
1602 if($h_chi2{$k_file}{'T>A'}{'p-value'} eq "NA") { $ws->write_string(44, 15, $h_chi2{$k_file}{'T>A'}{'p-value'}, $format_A10); }
1603 else { $ws->write_string(44, 15, $h_chi2{$k_file}{'T>A'}{'p-value'}, $format_A10ItalicRed); }
1604 }
1605 else { $ws->write_string(44, 15, $h_chi2{$k_file}{'T>A'}{'p-value'}, $format_A10); }
1606 $ws->write(44, 16, $h_chi2{$k_file}{'T>A'}{'FDR'}, $format_A10); $ws->write(44, 17, $h_chi2{$k_file}{'T>A'}{'ConfInt'}, $table_right);
1607
1608 #-----------------------------------------------------------------------------------------------------#
1609 my ($ratio_tc, $ratio_ag, $percent_tc_NonTr, $percent_tc_Tr) = (0, 0, 0, 0, 0);
1610 if( ($tc_NonTr==0) || ($tc_Tr==0) ) { $ratio_tc = 0; $ratio_ag = 0; $percent_tc_NonTr = 0; $percent_tc_Tr = 0; }
1611 else
1612 {
1613 $ratio_tc = $tc_NonTr/$tc_Tr; $ratio_tc = sprintf("%.2f", $ratio_tc);
1614 $ratio_ag = $tc_Tr/$tc_NonTr; $ratio_ag = sprintf("%.2f", $ratio_ag);
1615 $percent_tc_NonTr = ($tc_NonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_tc_Tr = ($tc_Tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
1616 }
1617 print SB "T>C\tNonTranscribed\t$tc_NonTr\n", "T>C\tTranscribed\t$tc_Tr\n";
1618 # T>C
1619 $ws->write(34, 11, "T>C", $table_left); $ws->write(34, 12, $ratio_tc, $table_middleHeader); $ws->write(34, 13, $tc_NonTr, $format_A10); $ws->write(34, 14, $tc_Tr, $format_A10);
1620 # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10
1621 if(($tc_NonTr+$tc_Tr)< 10)
1622 {
1623 if($h_chi2{$k_file}{'T>C'}{'p-value'} eq "NA") { $ws->write_string(34, 15, $h_chi2{$k_file}{'T>C'}{'p-value'}, $format_A10); }
1624 else { $ws->write_string(34, 15, $h_chi2{$k_file}{'T>C'}{'p-value'}, $format_A10ItalicRed); }
1625 }
1626 else { $ws->write_string(34, 15, $h_chi2{$k_file}{'T>C'}{'p-value'}, $format_A10); }
1627 $ws->write(34, 16, $h_chi2{$k_file}{'T>C'}{'FDR'}, $format_A10); $ws->write(34, 17, $h_chi2{$k_file}{'T>C'}{'ConfInt'}, $table_right);
1628 # A>G
1629 $ws->write(45, 11, "A>G", $table_left); $ws->write(45, 12, $ratio_ag, $table_middleHeader); $ws->write(45, 13, $tc_Tr, $format_A10); $ws->write(45, 14, $tc_NonTr, $format_A10);
1630 if(($tc_NonTr+$tc_Tr)< 10)
1631 {
1632 if($h_chi2{$k_file}{'T>C'}{'p-value'} eq "NA") { $ws->write_string(45, 15, $h_chi2{$k_file}{'T>C'}{'p-value'}, $format_A10); }
1633 else { $ws->write_string(45, 15, $h_chi2{$k_file}{'T>C'}{'p-value'}, $format_A10ItalicRed); }
1634 }
1635 else { $ws->write_string(45, 15, $h_chi2{$k_file}{'T>C'}{'p-value'}, $format_A10); }
1636 $ws->write(45, 16, $h_chi2{$k_file}{'T>C'}{'FDR'}, $format_A10); $ws->write(45, 17, $h_chi2{$k_file}{'T>C'}{'ConfInt'}, $table_right);
1637
1638 #-----------------------------------------------------------------------------------------------------#
1639 my ($ratio_tg, $ratio_ac, $percent_tg_NonTr, $percent_tg_Tr) = (0, 0, 0, 0, 0);
1640 if( ($tg_NonTr==0) || ($tg_Tr==0) ) { $ratio_tg = 0; $ratio_ac = 0; $percent_tg_NonTr = 0; $percent_tg_Tr = 0; }
1641 else
1642 {
1643 $ratio_tg = $tg_NonTr/$tg_Tr; $ratio_tg = sprintf("%.2f", $ratio_tg);
1644 $ratio_ac = $tg_Tr/$tg_NonTr; $ratio_ac = sprintf("%.2f", $ratio_ac);
1645 $percent_tg_NonTr = ($tg_NonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_tg_Tr = ($tg_Tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
1646 }
1647 print SB "T>G\tNonTranscribed\t$tg_NonTr\n", "T>G\tTranscribed\t$tg_Tr\n";
1648 # T>G
1649 $ws->write(35, 11, "T>G", $table_bottomleft); $ws->write(35, 12, $ratio_tg, $table_middleHeader2); $ws->write(35, 13, $tg_NonTr, $table_bottom); $ws->write(35, 14, $tg_Tr, $table_bottom);
1650 # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10
1651 if(($tg_NonTr+$tg_Tr)< 10)
1652 {
1653 if($h_chi2{$k_file}{'T>G'}{'p-value'} eq "NA") { $ws->write_string(35, 15, $h_chi2{$k_file}{'T>G'}{'p-value'}, $table_bottom); }
1654 else { $ws->write_string(35, 15, $h_chi2{$k_file}{'T>G'}{'p-value'}, $table_bottomItalicRed); }
1655 }
1656 else { $ws->write_string(35, 15, $h_chi2{$k_file}{'T>G'}{'p-value'}, $table_bottom); }
1657 $ws->write(35, 16, $h_chi2{$k_file}{'T>G'}{'FDR'}, $table_bottom); $ws->write(35, 17, $h_chi2{$k_file}{'T>G'}{'ConfInt'}, $table_bottomRight);
1658 # A>C
1659 $ws->write(46, 11, "A>C", $table_bottomleft); $ws->write(46, 12, $ratio_ac, $table_middleHeader2); $ws->write(46, 13, $tg_Tr, $table_bottom); $ws->write(46, 14, $tg_NonTr, $table_bottom);
1660 if(($tg_NonTr+$tg_Tr)< 10)
1661 {
1662 if($h_chi2{$k_file}{'T>G'}{'p-value'} eq "NA") { $ws->write_string(46, 15, $h_chi2{$k_file}{'T>G'}{'p-value'}, $table_bottom); }
1663 else { $ws->write_string(46, 15, $h_chi2{$k_file}{'T>G'}{'p-value'}, $table_bottomItalicRed); }
1664 }
1665 else { $ws->write_string(46, 15, $h_chi2{$k_file}{'T>G'}{'p-value'}, $table_bottom); }
1666 $ws->write(46, 16, $h_chi2{$k_file}{'T>G'}{'FDR'}, $table_bottom); $ws->write(46, 17, $h_chi2{$k_file}{'T>G'}{'ConfInt'}, $table_bottomRight);
1667
1668 ### Write a warning message when NonTr+Tr < 10
1669 my $format_italic_red = $wb->add_format(font=>'Arial', size=>10, italic=>1, color => 'red');
1670
1671 if( (($ca_NonTr+$ca_Tr)< 10) || (($cg_NonTr+$cg_Tr)< 10) || (($ct_NonTr+$ct_Tr)< 10) || (($ta_NonTr+$ta_Tr)< 10) || (($tc_NonTr+$tc_Tr)< 10) || (($tg_NonTr+$tg_Tr)< 10) )
1672 {
1673 $ws->write(36, 11, "Warning message: chi-squared approximation may be incorrect because the number of SBS", $format_italic_red);
1674 $ws->write(37, 11, "on Non-transcribed and transcribed strand is lower than 10", $format_italic_red);
1675 }
1676
1677 close SB;
1678
1679
1680 ###########################################################################################################################################################
1681 ################################################################### Write SBS Per Chr #####################################################################
1682 ###########################################################################################################################################################
1683 # For the HTML report
1684 open(SBSPerChr, ">", "$folderFigure/Distribution_SBS_Per_Chromosomes/$k_file-DistributionSNVS_per_chromosome.txt") or die "$!: $folderFigure/Distribution_SBS_Per_Chromosomes/$k_file-DistributionSNVS_per_chromosome.txt\n";
1685 print SBSPerChr "\tPearson\t$refH_file->{$k_file}{'SBSPerChr'}{'AllMutType'}\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>A:T"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>G:C"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>T:A"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>A:T"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>C:G"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>G:C"}{'Pearson'},"\n";
1686 print SBSPerChr "Chr\tSize\tAll SBS\tC:G>A:T\tC:G>G:C\tC:G>T:A\tT:A>A:T\tT:A>C:G\tT:A>G:C\n";
1687
1688 my $row_SBSPerChr = $row_SBSDistrBySegAndFunc_CG + 8; # Line 158
1689
1690 # Write the Pearson coefficient
1691 $ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'SBSPerChr'}{"C:G>A:T"}{'Pearson'}, $format_A10);
1692 $ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+4, $refH_file->{$k_file}{'SBSPerChr'}{"C:G>G:C"}{'Pearson'}, $format_A10);
1693 $ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+5, $refH_file->{$k_file}{'SBSPerChr'}{"C:G>T:A"}{'Pearson'}, $format_A10);
1694 $ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+6, $refH_file->{$k_file}{'SBSPerChr'}{"T:A>A:T"}{'Pearson'}, $format_A10);
1695 $ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'SBSPerChr'}{"T:A>C:G"}{'Pearson'}, $format_A10);
1696 $ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+8, $refH_file->{$k_file}{'SBSPerChr'}{"T:A>G:C"}{'Pearson'}, $formatT_right);
1697
1698 # Write the chromosome number and their sizes / Write the total of SBS per chromosome
1699 my $line=0;
1700
1701 foreach my $chromosome (sort keys %chromosomes)
1702 {
1703 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg, $chromosome, $formatT_left);
1704 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg+1, $chromosomes{$chromosome}, $format_A10);
1705 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'}, $format_A10);
1706
1707 # Write the count per mutation
1708 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'SBSPerChr'}{"C:G>A:T"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
1709 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg+4, $refH_file->{$k_file}{'SBSPerChr'}{"C:G>G:C"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
1710 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg+5, $refH_file->{$k_file}{'SBSPerChr'}{"C:G>T:A"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
1711 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg+6, $refH_file->{$k_file}{'SBSPerChr'}{"T:A>A:T"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
1712 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'SBSPerChr'}{"T:A>C:G"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
1713 $ws->write($row_SBSPerChr+($line), $colStart_SBSdistrBySeg+8, $refH_file->{$k_file}{'SBSPerChr'}{"T:A>G:C"}{'CHR'}{$chromosome}{'chr'}, $formatT_right);
1714
1715
1716 # For the HTML report
1717 print SBSPerChr "$chromosome\t", $chromosomes{$chromosome},"\t", $refH_file->{$k_file}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>A:T"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>G:C"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>T:A"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>A:T"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>C:G"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>G:C"}{'CHR'}{$chromosome}{'chr'},"\n";
1718 $line++;
1719 }
1720
1721 # Write the Pearson coefficient for the total number of SBS
1722 $ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'SBSPerChr'}{'AllMutType'}, $format_A10);
1723 $ws->write($row_SBSPerChr+(keys %chromosomes), $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'TotalSBSGenomic'}, $formatT_bottomHeader);
1724
1725 print SBSPerChr "\t\t$refH_file->{$k_file}{'TotalSBSGenomic'}\n";
1726 close SBSPerChr;
1727
1728
1729
1730 ###########################################################################################################################################################
1731 ####################################################################### Impact on protein #################################################################
1732 ###########################################################################################################################################################
1733 $ws->write(29, 6, "Table 2. Frequency and counts of functional impact", $format_A10Boldleft);
1734 $ws->set_column(6, 6, 13); $ws->set_column(10, 10, 15);
1735 $ws->write(30, 6, "RefSeq gene", $table_topleft); $ws->write(30, 7, "", $table_top); $ws->write(30, 8, "Percent", $table_top); $ws->write(30, 9, "Count", $table_topRight);
1736 my $lImpactSBS = 31;
1737 open(IMPACTSBS, ">", "$folderFigure/Impact_protein_sequence/$k_file/$k_file-DistributionExoFunc.txt") or die "$!: $folderFigure/Impact_protein_sequence/$k_file/$k_file-DistributionExoFunc.txt\n";
1738 print IMPACTSBS "AA_Change\tCount\tPercent\n";
1739
1740 # Pie chart with the distribution of SBS vs Indel
1741 open(SBSINDEL, ">", "$folderFigure/Overall_mutation_distribution/$k_file/$k_file-OverallMutationDistribution.txt") or die "$!: $folderFigure/Overall_mutation_distribution/$k_file/$k_file-OverallMutationDistribution.txt\n";
1742 print SBSINDEL "Variant_type\tCount\tPercent\n";
1743 my ($deletion, $insertion) = (0, 0);
1744
1745
1746 foreach my $k_exoFunc(sort keys $refH_file->{$k_file}{'ImpactSBS'})
1747 {
1748 my $percent = ($refH_file->{$k_file}{'ImpactSBS'}{$k_exoFunc} / $refH_file->{$k_file}{'TotalMutGenomic'})*100;
1749 $percent = sprintf("%.2f", $percent);
1750
1751 if($k_exoFunc eq "NA") { print IMPACTSBS "Not_Applicable\t$percent\t$refH_file->{$k_file}{'ImpactSBS'}{$k_exoFunc}\n"; }
1752 else { my $temp = $k_exoFunc; $temp =~ s/ /_/g; print IMPACTSBS "$temp\t$percent\t$refH_file->{$k_file}{'ImpactSBS'}{$k_exoFunc}\n"; }
1753
1754 $ws->write($lImpactSBS, 6, $k_exoFunc, $table_left2); $ws->write($lImpactSBS, 8, $percent, $format_A10); $ws->write($lImpactSBS, 9, $refH_file->{$k_file}{'ImpactSBS'}{$k_exoFunc}, $table_right);
1755 $lImpactSBS++;
1756
1757 # Pie chart with the distribution of SBS vs Indel
1758 if($k_exoFunc =~ /deletion/i) { $deletion += $refH_file->{$k_file}{'ImpactSBS'}{$k_exoFunc}; }
1759 elsif($k_exoFunc =~ /insertion/i) { $insertion += $refH_file->{$k_file}{'ImpactSBS'}{$k_exoFunc}; }
1760 }
1761 close IMPACTSBS;
1762 $ws->write($lImpactSBS, 9, $refH_file->{$k_file}{'TotalMutGenomic'}, $table_bottomrightHeader);
1763 $ws->write($lImpactSBS, 6, "", $table_bottomleft); $ws->write($lImpactSBS, 7, "", $table_bottom); $ws->write($lImpactSBS, 8, "", $table_bottom);
1764
1765 # Pie chart with the distribution of SBS vs Indel
1766 my $percentSBSIndel = ($deletion/$refH_file->{$k_file}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel);
1767 print SBSINDEL "Deletion\t$deletion\t$percentSBSIndel\n";
1768 $percentSBSIndel = ($insertion/$refH_file->{$k_file}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel);
1769 print SBSINDEL "Insertion\t$insertion\t$percentSBSIndel\n";
1770 $percentSBSIndel = ($refH_file->{$k_file}{TotalSBSGenomic}/$refH_file->{$k_file}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel);
1771 print SBSINDEL "SBS\t$refH_file->{$k_file}{TotalSBSGenomic}\t$percentSBSIndel\n";
1772 close SBSINDEL;
1773
1774 ###########################################################################################################################################################
1775 ######################################################## SEQUENCE CONTEXT ON GENOMIC STRAND ###############################################################
1776 ###########################################################################################################################################################
1777 my $row_SeqContext6 = 4;
1778 # Count the total of mutations for 6 mutation types on genomic strand
1779 my ($c_ca6_g, $c_cg6_g, $c_ct6_g, $c_ta6_g, $c_tc6_g, $c_tg6_g) = (0,0,0, 0,0,0);
1780 my ($p_ca6_g, $p_cg6_g, $p_ct6_g, $p_ta6_g, $p_tc6_g, $p_tg6_g) = (0,0,0, 0,0,0);
1781 my $maxValue = 0; # For the heatmap
1782
1783 # For checking if the total number of SBS is correct
1784 my $total_SBS_genomic = 0;
1785
1786
1787 open(HEATMAPCGENOMIC, ">", "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapCount-Genomic.txt") or die "$!: $folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapCount-Genomic.txt\n";
1788 print HEATMAPCGENOMIC "\tC>A\tC>G\tC>T\tT>A\tT>C\tT>G\n";
1789 open(HEATMAPPGENOMIC, ">", "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapPercent-Genomic.txt") or die "$!: $folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapPercent-Genomic.txt\n";
1790 print HEATMAPPGENOMIC "\tC>A\tC>G\tC>T\tT>A\tT>C\tT>G\n";
1791
1792 ## Bar plot NMF like
1793 open(BARPLOTNMFLIKE, ">", "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-MutationSpectraPercent-Genomic.txt") or die "$!: $folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-MutationSpectraPercent-Genomic.txt\n";
1794 print BARPLOTNMFLIKE "alteration\tcontext\tvalue\n";
1795
1796 foreach my $k_context (sort keys $refH_file->{$k_file}{'SeqContextG'})
1797 {
1798 if( ($k_context =~ /N/) || (length($k_context) != 3) ) { next; }
1799
1800 # Write the context: 6 mut type on genomic strand
1801 $ws->write($row_SeqContext6 , $colStart_matrixSeqContext+3, $k_context, $format_A10); $ws->write($row_SeqContext6 , $colStart_matrixSeqContext+13, $k_context, $format_A10);
1802
1803 foreach my $k_mutation (sort keys $refH_file->{$k_file}{'SeqContextG'}{$k_context})
1804 {
1805 # For checking the total number of SBS
1806 $total_SBS_genomic += $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1807
1808 # Calculate the percentages
1809 my $percent = 0;
1810 if($refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation} == 0) { $percent = 0; }
1811 else
1812 {
1813 $percent = ($refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation} / $refH_file->{$k_file}{'TotalSBSGenomic'}) * 100;
1814 $percent = sprintf("%.2f", $percent);
1815 }
1816
1817 # For representing the sequence context with a bar plot (NMF like style)
1818 print BARPLOTNMFLIKE $k_mutation,"\t", $k_context,"\t", $percent,"\n";
1819
1820 if($k_mutation eq "C>A")
1821 {
1822 ### COUNT
1823 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+4, $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}, $format_A10);
1824 # Write the count for the heatmap
1825 print HEATMAPCGENOMIC "$k_context\t$refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}\t";
1826
1827 ### PERCENTAGE
1828 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+14, $percent, $format_A10);
1829 print HEATMAPPGENOMIC "$k_context\t$percent\t";
1830
1831 # For NMF input
1832 my $count = $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1833 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Count'}{$k_context}{'C>A'}}, $count); }
1834 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Percent'}{$k_context}{'C>A'}}, $percent); }
1835
1836 # For the heatmap
1837 if($percent >= $maxValue) { $maxValue = $percent; }
1838
1839 # For the total amount per mutation types
1840 $c_ca6_g += $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1841 $p_ca6_g += $percent;
1842 }
1843 if($k_mutation eq "C>G")
1844 {
1845 ### COUNT
1846 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+5, $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}, $format_A10);
1847 # Write the count for the heatmap
1848 print HEATMAPCGENOMIC "$refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}\t";
1849
1850 ### PERCENTAGE
1851 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+15, $percent, $format_A10);
1852 print HEATMAPPGENOMIC "$percent\t";
1853
1854 # For NMF input
1855 my $count = $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1856 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Count'}{$k_context}{'C>G'}}, $count); }
1857 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Percent'}{$k_context}{'C>G'}}, $percent); }
1858
1859 # For the heatmap
1860 if($percent >= $maxValue) { $maxValue = $percent; }
1861
1862 # For the total amount per mutation types
1863 $c_cg6_g += $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1864 $p_cg6_g += $percent;
1865 }
1866 if($k_mutation eq "C>T")
1867 {
1868 ### COUNT
1869 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+6, $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}, $format_A10);
1870 # Write the count for the heatmap
1871 print HEATMAPCGENOMIC "$refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}\t";
1872
1873 ### PERCENTAGE
1874 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+16, $percent, $format_A10);
1875 print HEATMAPPGENOMIC "$percent\t";
1876
1877 # For NMF input
1878 my $count = $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1879 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Count'}{$k_context}{'C>T'}}, $count); }
1880 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Percent'}{$k_context}{'C>T'}}, $percent); }
1881
1882 # For the heatmap
1883 if($percent >= $maxValue) { $maxValue = $percent; }
1884
1885 # For the total amount per mutation types
1886 $c_ct6_g += $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1887 $p_ct6_g += $percent;
1888 }
1889 if($k_mutation eq "T>A")
1890 {
1891 ### COUNT
1892 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+7, $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}, $format_A10);
1893 # Write the count for the heatmap
1894 print HEATMAPCGENOMIC "$refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}\t";
1895
1896 ### PERCENTAGE
1897 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+17, $percent, $format_A10);
1898 print HEATMAPPGENOMIC "$percent\t";
1899
1900 # For NMF input
1901 my $count = $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1902 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Count'}{$k_context}{'T>A'}}, $count); }
1903 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Percent'}{$k_context}{'T>A'}}, $percent); }
1904
1905 # For the heatmap
1906 if($percent >= $maxValue) { $maxValue = $percent; }
1907
1908 # For the total amount per mutation types
1909 $c_ta6_g += $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1910 $p_ta6_g += $percent;
1911 }
1912 if($k_mutation eq "T>C")
1913 {
1914 ### COUNT
1915 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+8, $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}, $format_A10);
1916 # Write the count for the heatmap
1917 print HEATMAPCGENOMIC "$refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}\t";
1918
1919 ### PERCENTAGE
1920 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+18, $percent, $format_A10);
1921 print HEATMAPPGENOMIC "$percent\t";
1922
1923 # For NMF input
1924 my $count = $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1925 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Count'}{$k_context}{'T>C'}}, $count); }
1926 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Percent'}{$k_context}{'T>C'}}, $percent); }
1927
1928 # For the heatmap
1929 if($percent >= $maxValue) { $maxValue = $percent; }
1930
1931 # For the total amount per mutation types
1932 $c_tc6_g += $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1933 $p_tc6_g += $percent;
1934 }
1935 if($k_mutation eq "T>G")
1936 {
1937 ### COUNT
1938 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+9, $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}, $format_A10);
1939 # Write the count for the heatmap
1940 print HEATMAPCGENOMIC "$refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation}\n";
1941
1942 ### PERCENTAGE
1943 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+19, $percent, $format_A10);
1944 print HEATMAPPGENOMIC "$percent\n";
1945
1946 # For NMF input
1947 my $count = $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1948 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Count'}{$k_context}{'T>G'}}, $count); }
1949 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Percent'}{$k_context}{'T>G'}}, $percent); }
1950
1951 # For the heatmap
1952 if($percent >= $maxValue) { $maxValue = $percent; }
1953
1954 # For the total amount per mutation types
1955 $c_tg6_g += $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};
1956 $p_tg6_g += $percent;
1957 }
1958 }
1959 $row_SeqContext6++;
1960 }
1961 close HEATMAPCGENOMIC; close HEATMAPPGENOMIC;
1962 close BARPLOTNMFLIKE;
1963
1964
1965 # Write the total number of SBS per mutation type: COUNT
1966 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+4, $c_ca6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $colStart_matrixSeqContext+5, $c_cg6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $colStart_matrixSeqContext+6, $c_ct6_g, $formatT_bottomHeader2);
1967 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+7, $c_ta6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $colStart_matrixSeqContext+8, $c_tc6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $colStart_matrixSeqContext+9, $c_tg6_g, $formatT_bottomHeader2);
1968 if($total_SBS_genomic != $refH_file->{$k_file}{'TotalSBSGenomic'}) { print STDERR "Error in the calculation of the total number of SBS on the genomic strand!!!!\nFrom hash table $refH_file->{$k_file}{'TotalSBSGenomic'}\tVS\t$total_SBS_genomic\n"; exit; }
1969
1970 # Write the total number of SBS per mutation type: PERCENT
1971 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+14, $p_ca6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $colStart_matrixSeqContext+15, $p_cg6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $colStart_matrixSeqContext+16, $p_ct6_g, $formatT_bottomHeader2);
1972 $ws->write($row_SeqContext6, $colStart_matrixSeqContext+17, $p_ta6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $colStart_matrixSeqContext+18, $p_tc6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $colStart_matrixSeqContext+19, $p_tg6_g, $formatT_bottomHeader2);
1973 my $totalPercent_genomic = $p_ca6_g + $p_cg6_g + $p_ct6_g + $p_ta6_g + $p_tc6_g + $p_tg6_g; $totalPercent_genomic = sprintf("%.0f", $totalPercent_genomic);
1974 if($totalPercent_genomic != 100) { print STDERR "Error in the calculation of the total percentages on the genomic strand!!!\nThe total is equal to=\t$totalPercent_genomic\n"; exit; }
1975
1976
1977 #----------------------------------------------------------------------------------------------------------------------------------------------------------------#
1978 # For the input matrix for NMF
1979 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Sample'}}, $k_file); }
1980
1981
1982 ###########################################################################################################################################################
1983 ######################################################## SEQUENCE CONTEXT ON CODING STRAND ###############################################################
1984 ###########################################################################################################################################################
1985 my $row_SeqContext12 = $rowStart_SBSdistrBySeg+6; my $row_SeqContext12Percent = $rowStart_SBSdistrBySeg+27;
1986 # Reset the total count and percent calculated for the strand bias
1987 ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0, 0,0,0, 0,0,0, 0,0,0);
1988 ($percent_ca_NonTr, $percent_ca_Tr, $percent_cg_NonTr, $percent_cg_Tr, $percent_ct_NonTr, $percent_ct_Tr, $percent_ta_NonTr, $percent_ta_Tr, $percent_tc_NonTr, $percent_tc_Tr, $percent_tg_NonTr, $percent_tg_Tr) = (0,0,0, 0,0,0, 0,0,0, 0,0,0);
1989
1990 # For checking if the total number of SBS is correct
1991 my $total_SBS_coding = 0;
1992
1993 open(COUNT, ">", "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignatureCount.txt") or die "$!: $folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignatureCount.txt\n";
1994 print COUNT "MutationTypeContext\tStrand\tValue\tSample\n";
1995 open(PERCENT, ">", "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignaturePercent.txt") or die "$!: $folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignaturePercent.txt\n";
1996 print PERCENT "MutationTypeContext\tStrand\tValue\tSample\n";
1997
1998 foreach my $k_context (sort keys $refH_file->{$k_file}{'SeqContextC'})
1999 {
2000 if( ($k_context =~ /N/) || (length($k_context) != 3) ) { next; }
2001
2002 # Write the context: 12 mut type on coding strand
2003 $ws->write($row_SeqContext12 , $colStart_matrixSeqContext, $k_context, $formatT_left); $ws->write($row_SeqContext12Percent , $colStart_matrixSeqContext, $k_context, $formatT_left);
2004
2005 foreach my $k_mutation (sort keys $refH_file->{$k_file}{'SeqContextC'}{$k_context})
2006 {
2007 # Percent: 12 mut type on coding strand
2008 my ($percent_NonTr, $percent_Tr) = (0, 0);
2009 if($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} == 0) { $percent_NonTr = 0; }
2010 else { $percent_NonTr = ( $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'TotalSBSCoding'} ) * 100 }
2011 if($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'} == 0) { $percent_Tr = 0; }
2012 else { $percent_Tr = ( $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'} / $refH_file->{$k_file}{'TotalSBSCoding'} ) * 100 }
2013
2014
2015 # Calculate the total number for each mutation types
2016 if($k_mutation eq "C>A")
2017 {
2018 $ca_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'};
2019 $ca_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'};
2020
2021 # COUNT : 12 mutation type (stranded bar graph)
2022 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+1, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10);
2023 print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n";
2024 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+2, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10);
2025 print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n";
2026
2027 ## PERCENT : 12 mutation type (stranded bar graph)
2028 my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2029 $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_ca_NonTr += $percent_NonTr;
2030 my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2031 $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_ca_Tr += $percent_Tr;
2032 print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n";
2033 print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n";
2034
2035 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+1, $percent_NonTr, $format_A10);
2036 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+2, $percent_Tr, $format_A10);
2037 }
2038 if($k_mutation eq "C>G")
2039 {
2040 $cg_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'};
2041 $cg_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'};
2042
2043 # COUNT : 12 mutation type (stranded bar graph)
2044 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+3, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10);
2045 print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n";
2046 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+4, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10);
2047 print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n";
2048
2049 ## PERCENT : 12 mutation type (stranded bar graph)
2050 my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2051 $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_cg_NonTr += $percent_NonTr;
2052 my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2053 $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_cg_Tr += $percent_Tr;
2054 print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n";
2055 print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n";
2056
2057 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+3, $percent_NonTr, $format_A10);
2058 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+4, $percent_Tr, $format_A10);
2059 }
2060 if($k_mutation eq "C>T")
2061 {
2062 $ct_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'};
2063 $ct_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'};
2064
2065 # COUNT : 12 mutation type (stranded bar graph)
2066 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+5, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10);
2067 print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n";
2068 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+6, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10);
2069 print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n";
2070
2071 ## PERCENT : 12 mutation type (stranded bar graph)
2072 my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2073 $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_ct_NonTr += $percent_NonTr;
2074 my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2075 $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_ct_Tr += $percent_Tr;
2076 print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n";
2077 print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n";
2078
2079 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+5, $percent_NonTr, $format_A10);
2080 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+6, $percent_Tr, $format_A10);
2081 }
2082 if($k_mutation eq "T>A")
2083 {
2084 $ta_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'};
2085 $ta_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'};
2086
2087 # COUNT : 12 mutation type (stranded bar graph)
2088 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+7, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10);
2089 print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n";
2090 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+8, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10);
2091 print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n";
2092
2093 ## PERCENT : 12 mutation type (stranded bar graph)
2094 my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2095 $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_ta_NonTr += $percent_NonTr;
2096 my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2097 $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_ta_Tr += $percent_Tr;
2098 print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n";
2099 print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n";
2100
2101 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+7, $percent_NonTr, $format_A10);
2102 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+8, $percent_Tr, $format_A10);
2103 }
2104 if($k_mutation eq "T>C")
2105 {
2106 $tc_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'};
2107 $tc_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'};
2108
2109 # COUNT : 12 mutation type (stranded bar graph)
2110 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+9, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10);
2111 print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n";
2112 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+10, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10);
2113 print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n";
2114
2115 ## PERCENT : 12 mutation type (stranded bar graph)
2116 my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2117 $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_tc_NonTr += $percent_NonTr;
2118 my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2119 $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_tc_Tr += $percent_Tr;
2120 print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n";
2121 print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n";
2122
2123 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+9, $percent_NonTr, $format_A10);
2124 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+10, $percent_Tr, $format_A10);
2125 }
2126 if($k_mutation eq "T>G")
2127 {
2128 $tg_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'};
2129 $tg_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'};
2130
2131 # COUNT : 12 mutation type (stranded bar graph)
2132 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+11, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10);
2133 print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n";
2134 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+12, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10);
2135 print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n";
2136
2137 ## PERCENT : 12 mutation type (stranded bar graph)
2138 my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2139 $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_tg_NonTr += $percent_NonTr;
2140 my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100;
2141 $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_tg_Tr += $percent_Tr;
2142 print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n";
2143 print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n";
2144
2145 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+11, $percent_NonTr, $format_A10);
2146 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+12, $percent_Tr, $format_A10);
2147 }
2148
2149 # For checking if the total number of SBS is correct
2150 $total_SBS_coding += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} + $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'};
2151 }
2152 $row_SeqContext12++; $row_SeqContext12Percent++;
2153 }
2154 close COUNT; close PERCENT;
2155
2156 ## Write the total of each mutation types: 12 mut type on coding strand
2157 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+1, $ca_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+2, $ca_Tr, $formatT_bottomHeader2);
2158 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+3, $cg_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+4, $cg_Tr, $formatT_bottomHeader2);
2159 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+5, $ct_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+6, $ct_Tr, $formatT_bottomHeader2);
2160 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+7, $ta_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+8, $ta_Tr, $formatT_bottomHeader2);
2161 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+9, $tc_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+10, $tc_Tr, $formatT_bottomHeader2);
2162 $ws->write($row_SeqContext12, $colStart_matrixSeqContext+11, $tg_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+12, $tg_Tr, $formatT_bottomHeader2);
2163 # Write the total percentages of each mutation types: 12 mut type on coding strand
2164 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+1, $percent_ca_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+2, $percent_ca_Tr, $formatT_bottomHeader);
2165 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+3, $percent_cg_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+4, $percent_cg_Tr, $formatT_bottomHeader);
2166 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+5, $percent_ct_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+6, $percent_ct_Tr, $formatT_bottomHeader);
2167 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+7, $percent_ta_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+8, $percent_ta_Tr, $formatT_bottomHeader);
2168 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+9, $percent_tc_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+10, $percent_tc_Tr, $formatT_bottomHeader);
2169 $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+11, $percent_tg_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+12, $percent_tg_Tr, $formatT_bottomHeader);
2170
2171 if($total_SBS_coding == $refH_file->{$k_file}{'TotalSBSCoding'}) { $ws->write($row_SeqContext12, $colStart_matrixSeqContext+13, $refH_file->{$k_file}{'TotalSBSCoding'}, $formatT_bottomHeader2) }
2172 else { print STDERR "Error in the calculation of the total number of SBS on the coding strand!!!!\nFrom hash table $refH_file->{$k_file}{'TotalSBSCoding'}\tVS\t$total_SBS_coding\n"; exit; }
2173
2174
2175 my $totalP_SBS_coding = $percent_ca_NonTr + $percent_ca_Tr + $percent_cg_NonTr + $percent_cg_Tr + $percent_ct_NonTr + $percent_ct_Tr + $percent_ta_NonTr + $percent_ta_Tr + $percent_tc_NonTr + $percent_tc_Tr + $percent_tg_NonTr + $percent_tg_Tr; $totalP_SBS_coding = sprintf("%.0f", $totalP_SBS_coding);
2176 if($totalP_SBS_coding != 100) { print STDERR "The percentages for the trinucleotide sequence context on the coding strand for 12 mutation types is not equal to 100!!!\n$totalP_SBS_coding\n"; exit; }
2177
2178
2179 ###########################################################################################################################################################
2180 ################################################################### GRAPHS & TABLES #######################################################################
2181 ###########################################################################################################################################################
2182 Create_Graph($folderFigure, $k_file, $maxValue);
2183
2184 ## Distribution of SBS into the Excel report (Figure 1 + Table 1)
2185 $ws->write(0, 0, "Graph 1. SBS distribution", $formatT_graphTitle); $ws->set_row(0, 18);
2186 $ws->insert_image(1, 0, "$folder_temp/$k_file-SBS_distribution-Report.png", 0, 0, .2, .2);
2187 $ws->write(29, 0, "Table 1. Frequency and counts of all SBS", $format_A10Boldleft);
2188 $ws->write(30, 0, "Mutation type", $table_topleft); $ws->write(30, 1, "Percentage", $table_top); $ws->write(30, 2, "Count", $table_topRight);
2189 $ws->write(31, 0, "C:G>A:T", $table_left); $ws->write(31, 1, $percent_ca, $format_A10); $ws->write(31, 2, $ca_genomique, $table_right);
2190 $ws->write(32, 0, "C:G>G:C", $table_left); $ws->write(32, 1, $percent_cg, $format_A10); $ws->write(32, 2, $cg_genomique, $table_right);
2191 $ws->write(33, 0, "C:G>T:A", $table_left); $ws->write(33, 1, $percent_ct, $format_A10); $ws->write(33, 2, $ct_genomique, $table_right);
2192 $ws->write(34, 0, "T:A>A:T", $table_left); $ws->write(34, 1, $percent_ta, $format_A10); $ws->write(34, 2, $ta_genomique, $table_right);
2193 $ws->write(35, 0, "T:A>C:G", $table_left); $ws->write(35, 1, $percent_tc, $format_A10); $ws->write(35, 2, $tc_genomique, $table_right);
2194 $ws->write(36, 0, "T:A>G:C", $table_left); $ws->write(36, 1, $percent_tg, $format_A10); $ws->write(36, 2, $tg_genomique, $table_right);
2195 $ws->write(37, 0, "", $table_bottomleft); $ws->write(37, 1, "", $table_bottom); $ws->write(37, 2, $refH_file->{$k_file}{'TotalSBSGenomic'}, $table_bottomrightHeader);
2196
2197 ## Impact of the SBS on the protein
2198 $ws->write(0, 6, "Graph 2. Impact on protein sequence", $formatT_graphTitle);
2199 $ws->insert_image(1, 6, "$folder_temp/$k_file-DistributionExoFunc-Report.png", 0, 0, .2, .2);
2200
2201 ## Strand Bias
2202 $ws->write(0, 11, "Graph 3. Stranded distribution of SBS", $formatT_graphTitle);
2203 $ws->insert_image(1, 11, "$folder_temp/$k_file-StrandBias-Report.png", 0, 0, .2, .2);
2204
2205 ## Stranded signature (Scale the inserted image: width x 0.7, height x 0.8)
2206 $ws->insert_image($rowStart_SBSdistrBySeg+3, $colStart_matrixSeqContext+15, "$folder_temp/$k_file-StrandedSignatureCount-Report.png", 0, 0, .16, .16);
2207 $ws->insert_image($rowStart_SBSdistrBySeg+24, $colStart_matrixSeqContext+15, "$folder_temp/$k_file-StrandedSignaturePercent-Report.png", 0, 0, .16, .16);
2208
2209
2210 # Heatamp for the sequence context on the genomic strand (6 mutation types)
2211 $ws->insert_image(4, $colStart_matrixSeqContext, "$folder_temp/$k_file-HeatmapCount-Genomic-Report.png");
2212 $ws->insert_image(4, $colStart_matrixSeqContext+10, "$folder_temp/$k_file-HeatmapPercent-Genomic-Report.png");
2213
2214
2215 ## Bar plot for representing the sequence context (NMF like style)
2216 `Rscript $pathRScriptMutSpectrum $folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-MutationSpectraPercent-Genomic.txt $k_file $folderFigure/Trinucleotide_Sequence_Context/$k_file $folder_temp $c_ca6_g $c_cg6_g $c_ct6_g $c_ta6_g $c_tc6_g $c_tg6_g 2>&1`;
2217
2218 # Bar plot for the sequence context on the genomic strand (6 mutation types)
2219 $ws->insert_image(27, $colStart_matrixSeqContext+3, "$folder_temp/$k_file-MutationSpectraPercent-Genomic-Report.png");
2220
2221 # Next sample
2222 $row_SumSheet++;
2223 } # End $k_file
2224
2225 #----------------------------------------------------------------------------------------------------------------------------------------------------------------#
2226 # Write the input matrix for NMF
2227 open(OUTINPUTNMFC, ">", "$folderNMF/Input_NMF_Count.txt") or die "$!: $folderNMF/Input_NMF_Count.txt\n"; # with the count
2228 open(OUTINPUTNMFP, ">", "$folderNMF/Input_NMF_Frequency.txt") or die "$!: $folderNMF/Input_NMF_Frequency.txt\n"; # With the frequency un-normalized
2229
2230 foreach my $k_sample (@{$h_inputNMF{'Sample'}}) { print OUTINPUTNMFC "\t$k_sample"; print OUTINPUTNMFP "\t$k_sample"; }
2231 print OUTINPUTNMFC "\n"; print OUTINPUTNMFP "\n";
2232
2233 my $row_inputNMF = 1;
2234 foreach my $k_context (sort keys $h_inputNMF{'Count'})
2235 {
2236 $k_context =~ /(\w)_(\w)/; my ($base5, $base3) = ($1, $2);
2237 foreach my $k_mutation (sort keys $h_inputNMF{'Count'}{$k_context})
2238 {
2239 my ($col_inputNMF_Count, $col_inputNMF_Percent) = (1, 1);
2240 my $contextNMF = $base5."[$k_mutation]".$base3;
2241 $ws_inputNMF_count->write($row_inputNMF, 0, $contextNMF); $ws_inputNMF_percent->write($row_inputNMF, 0, $contextNMF);
2242 print OUTINPUTNMFC $contextNMF,"\t"; print OUTINPUTNMFP $contextNMF,"\t";
2243
2244 foreach (@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}}) { print OUTINPUTNMFC "$_\t"; } print OUTINPUTNMFC "\n";
2245 foreach (@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}) { print OUTINPUTNMFP "$_\t"; } print OUTINPUTNMFP "\n";
2246
2247 foreach (@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}})
2248 {
2249 # print "\t$k_context\t$k_mutation\t";
2250 # print "\t$row_inputNMF\t$col_inputNMF_Count\t$_\n";
2251 $ws_inputNMF_count->write($row_inputNMF, $col_inputNMF_Count, $_); $col_inputNMF_Count++;
2252 }
2253 foreach (@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}) { $ws_inputNMF_percent->write($row_inputNMF, $col_inputNMF_Percent, $_); $col_inputNMF_Percent++; }
2254 $row_inputNMF++;
2255 }
2256 }
2257 close OUTINPUTNMFP; close OUTINPUTNMFC;
2258
2259
2260 # Close the workbook
2261 $wb->close();
2262 }
2263 # Calculate the chi2 for the strand bias
2264 sub CalculateChi2
2265 {
2266 my ($refH_file, $folderChi2) = @_;
2267
2268 # No value for the chi2
2269 if(scalar (keys $refH_file) == 0) { print STDERR "No value for calculating the chi2 for the strand bias\n"; exit; }
2270
2271 # Strand bias for one mutation type for all the samples
2272 my %h_tempchi2 = ();
2273 my ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0,0,0,0, 0,0,0,0,0,0);
2274
2275 my $nb_file = 0;
2276
2277 foreach my $k_file (sort keys $refH_file)
2278 {
2279 $nb_file++;
2280 foreach my $k_func (sort keys $refH_file->{$k_file}{'6mutType'})
2281 {
2282 foreach my $k_mutation (sort keys $refH_file->{$k_file}{'6mutType'}{$k_func})
2283 {
2284 if($k_mutation eq "C:G>A:T")
2285 {
2286 $h_tempchi2{'C>A'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
2287 $h_tempchi2{'C>A'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
2288 }
2289 if($k_mutation eq "C:G>G:C")
2290 {
2291 $h_tempchi2{'C>G'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
2292 $h_tempchi2{'C>G'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
2293 }
2294 if($k_mutation eq "C:G>T:A")
2295 {
2296 $h_tempchi2{'C>T'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
2297 $h_tempchi2{'C>T'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
2298 }
2299 if($k_mutation eq "T:A>A:T")
2300 {
2301 $h_tempchi2{'T>A'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
2302 $h_tempchi2{'T>A'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
2303 }
2304 if($k_mutation eq "T:A>C:G")
2305 {
2306 $h_tempchi2{'T>C'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
2307 $h_tempchi2{'T>C'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
2308 }
2309 if($k_mutation eq "T:A>G:C")
2310 {
2311 $h_tempchi2{'T>G'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
2312 $h_tempchi2{'T>G'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
2313 }
2314 }
2315 }
2316 }
2317
2318 # Create the input file for NMF
2319 open(CHI2, ">", "$folderChi2/Input_chi2_strandBias.txt") or die "$!: $folderChi2/Input_chi2_strandBias.txt\n";
2320 print CHI2 "SampleName\tNonTr\tTr\tAlteration\n";
2321
2322 foreach my $k_mutation (sort keys %h_tempchi2)
2323 {
2324 foreach my $k_file (sort keys $h_tempchi2{$k_mutation})
2325 {
2326 print CHI2 "$k_file\t$h_tempchi2{$k_mutation}{$k_file}{'NonTr'}\t$h_tempchi2{$k_mutation}{$k_file}{'Tr'}\t$k_mutation\n";
2327 }
2328 }
2329 close CHI2;
2330
2331
2332 # Open the connection with R
2333 my $R = Statistics::R->new() or die "Impossible to create a communication bridge with R\n";
2334
2335 $R->send(qq`## Load the data. There is one column with the mutation type and the sample name but it's just for knowing what is corresponding to each line. The two columns with the number of variant per strand would be sufficient.
2336 strBias<-read.delim("$folderChi2/Input_chi2_strandBias.txt", dec=".");`);
2337 $R->send(q`# Chi2
2338 pValChi2 <- c() # First I create an empty vector and then I apply a for on the data load
2339 pValChi2_round <- c() # Empty vector with the rounded p-values
2340 confInt <- c() # Empty vector for the confident interval
2341 proportion <- c() # Empty vector for the proportion of NonTr compared to the (NonTr+Tr)
2342 sampleSize <- c() # Empty vector for the count of samples in NonTr and Tr
2343 # For Pool_Data save the p-values in a different vector for not having them for the FDR
2344 pValChi2_PoolData <- c()
2345 pValChi2_PoolData_Round <- c()
2346
2347 j = 1 # Timer for pValChi2_PoolData vector
2348 k = 1 # Timer for pValChi2
2349
2350 for(i in 1:nrow(strBias))
2351 {
2352 if(! sum(strBias[i,2:3]) == 0)
2353 {
2354 # For Pool_Data
2355 if(strBias[i,1] == "Pool_Data")
2356 {
2357 pValChi2_PoolData[j] <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$p.value
2358 j <- j+1
2359 }
2360 # For the other sample(s)
2361 else
2362 {
2363 # Calculate the p-value
2364 pValChi2[k] <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$p.value
2365 k <- k+1
2366 }
2367
2368 # Calculate the confidence interval
2369 temp <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$conf.int
2370 confInt[i] <- paste0("[", round(temp[1],2), "-", round(temp[2],2), "]") # Same as paste(sep="")
2371
2372 # Save the proportion
2373 proportion[i] <- strBias[i,2] / sum(strBias[i,2:3])
2374
2375 # Save the sample size (count on NonTr and Tr)
2376 sampleSize[i] <- paste(strBias[i,2], strBias[i,3], sep="-")
2377 } else
2378 {
2379 if(strBias[i,1] == "Pool_Data")
2380 {
2381 pValChi2_PoolData[j] <- NA
2382 pValChi2_PoolData_Round[j] <- NA
2383 j <- j+1
2384 }
2385 else
2386 {
2387 # Not enough effective for the test
2388 pValChi2[k] <- NA
2389 confInt[k] <- NA
2390 proportion[k] <- NA
2391 sampleSize[k] <- NA
2392 pValChi2_round[k] <- NA
2393 k <- k+1
2394 }
2395 }
2396 }
2397 # Adjust with FDR
2398 FDR<-p.adjust(pValChi2, method="BH")
2399
2400 # Rount the p-value
2401 for(i in 1:nrow(strBias))
2402 {
2403 if( (! is.na(pValChi2[i])) && (pValChi2[i] < 0.0001) )
2404 {
2405 pValChi2_round[i] <- format(pValChi2[i], scientific=T, digits=3)
2406 } else if(! is.na(pValChi2[i]))
2407 {
2408 pValChi2_round[i] <- as.character(round(pValChi2[i], 3))
2409 }
2410 }
2411
2412 # The option for the pool is specified
2413 if(!is.null(pValChi2_PoolData))
2414 {
2415 # Round the p-value for Pool_Data
2416 for(i in 1:6)
2417 {
2418 if( (! is.na(pValChi2_PoolData[i])) && (pValChi2_PoolData[i] < 0.0001) )
2419 {
2420 pValChi2_PoolData_Round[i] <- format(pValChi2_PoolData[i], scientific=T, digits=3)
2421 } else if(! is.na(pValChi2_PoolData[i]))
2422 {
2423 pValChi2_PoolData_Round[i] <- as.character(round(pValChi2_PoolData[i], 3))
2424 }
2425 }
2426 }
2427
2428
2429 # I create a dataframe for add what I want
2430 outputChi2 <- data.frame(round(strBias[,2]/strBias[,3], digits=2), sampleSize, round(proportion, 3), confInt)
2431 outputChi2$Mut.type <- strBias$Alteration
2432 outputChi2$SampleName <- strBias$SampleName
2433 colnames(outputChi2)[1:6]<-c("Strand_Bias", "NonTr-Tr", "Proportion", "Confidence Interval", "Mutation_Type", "SampleName")
2434
2435 # Transform the data frame into a matrix for adding the p-value for the samples and Pool_Data
2436 matrix <- as.matrix(outputChi2)
2437 tempColPValFDR <- matrix(, nrow=length(sampleSize), ncol = 2) # Create an empty matrix with 2 columns for adding the p-value and the FDR
2438 matrix <- cbind(matrix, tempColPValFDR)
2439 j = 1 # Timer for all the sample
2440 k = 1 # Timer for Pool_Data
2441 for(i in 1:nrow(matrix))
2442 {
2443 if(matrix[i,6] == "Pool_Data")
2444 {
2445 matrix[i,7] <- pValChi2_PoolData_Round[k]
2446 matrix[i,8] <- "NA" # No FDR for Pool_Data
2447 k = k+1
2448 }
2449 else
2450 {
2451 matrix[i,7] <- pValChi2_round[j]
2452 matrix[i,8] <- round(FDR[j], 3)
2453 j = j+1
2454 }
2455 }
2456 # Reorder the columns
2457 matrix <- cbind(matrix[,1:3], matrix[,7], matrix[,8], matrix[,4:6])
2458 colnames(matrix)[4] <- "P-val-Chi2"
2459 colnames(matrix)[5] <- "FDR"`);
2460
2461 $R->send(qq`# Export the file
2462 # dec=".": Set the separator for the decimal by "."
2463 write.table(matrix,file="$folderChi2/Output_chi2_strandBias.txt",quote = FALSE,sep="\t",row.names = FALSE,dec=".");`);
2464
2465 # Stop the connection with R
2466 $R->stop();
2467 }
2468 # Pearson correlation
2469 sub PearsonCoefficient
2470 {
2471 our ($refH_file, $filename) = @_;
2472
2473 #### Calculate the Pearson coefficient
2474 my @total_SBS = (); # Pearson for all mutation types
2475
2476 # Create a 2D array
2477 foreach my $k_mutation (sort keys $refH_file->{$filename}{'SBSPerChr'})
2478 {
2479 my $x = [];
2480 my $correlation = 0;
2481
2482 if($k_mutation eq "AllMutType") { next; }
2483 elsif($k_mutation eq "TotalPerChr") { next; }
2484 elsif($k_mutation eq "ChrSize") { next; }
2485 else
2486 {
2487 my $testZero = 0; # The correlation function doesn't works if all the variables are equal to zero
2488 # generate an anonymous 2D array where $x->[1] is the row
2489 # $x->[1][1] is the value in row 1 column 1 and $x->[1][2] is the value of row 1 column 2
2490 # once you build the entire array, pass it to the correlation subroutine
2491 my $i=1;
2492 while ( my ($chromosome, $lenght) = each (%chromosomes))
2493 {
2494 $x->[$i][1] = $lenght; # First column contains the chromosome size
2495 $x->[$i][2] = $refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}{$chromosome}{'chr'}; # Second column contains the count of SBS
2496 if($refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}{$chromosome}{'chr'}==0) { $testZero++; }
2497 $i++;
2498 }
2499 if( $testZero == keys $refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'} ) { $correlation = 0; }
2500 # Pass the 2D array to the correlation subroutine
2501 else { $correlation = correlation($x); }
2502
2503 $refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'Pearson'} = $correlation; # Pearson per mutation type
2504 }
2505 }
2506
2507 #generate an anonymous 2D array for all mutation type
2508 my $testZero = 0;
2509 my $x = [];
2510 my $correlation = 0;
2511 my $i=1;
2512 while ( my ($chromosome, $lenght) = each (%chromosomes))
2513 {
2514 $x->[$i][1] = $lenght;
2515 $x->[$i][2] = $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'};
2516 $i++;
2517 }
2518 if($testZero == keys $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}) { $correlation = 0; }
2519 else { $correlation = correlation($x); }
2520 # Pass the 2D array to the correlation subroutine
2521 $refH_file->{$filename}{'SBSPerChr'}{'AllMutType'} = $correlation;
2522
2523 sub correlation
2524 {
2525 my ($x) = @_;
2526 my ($mean_x,$mean_y) = mean($x);
2527 my $ssxx=ss($x,$mean_x,$mean_y,1,1);
2528 my $ssyy=ss($x,$mean_x,$mean_y,2,2);
2529 my $ssxy=ss($x,$mean_x,$mean_y,1,2);
2530 my $correl=correl($ssxx,$ssyy,$ssxy);;
2531 my $xcorrel=sprintf("%.2f",$correl);
2532 return($xcorrel);
2533
2534 sub mean
2535 {
2536 my ($x)=@_;
2537 my $num = scalar(@{$x}) - 2;
2538 my $sum_x = '0';
2539 my $sum_y = '0';
2540 for (my $i = 2; $i < scalar(@{$x}); ++$i)
2541 {
2542 $sum_x += $x->[$i][1];
2543 $sum_y += $x->[$i][2];
2544 }
2545 my $mu_x = $sum_x / $num;
2546 my $mu_y = $sum_y / $num;
2547 return($mu_x,$mu_y);
2548 }
2549
2550 ### ss = sum of squared (deviations to the mean)
2551 sub ss
2552 {
2553 my ($x,$mean_x,$mean_y,$one,$two)=@_;
2554 my $sum = '0';
2555 for (my $i=2;$i<scalar(@{$x});++$i)
2556 {
2557 $sum += ($x->[$i][$one]-$mean_x)*($x->[$i][$two]-$mean_y);
2558 }
2559 return $sum;
2560 }
2561
2562 sub correl
2563 {
2564 my($ssxx,$ssyy,$ssxy)=@_;
2565
2566 my ($sign, $correl) = (0,0);
2567 if(abs($ssxy) == 0) { $sign = 0 }
2568 else { $sign=$ssxy/abs($ssxy); }
2569
2570 if( ($ssxx==0) || ($ssyy==0) ) { $correl = 0 }
2571 else { $correl=$sign*sqrt($ssxy*$ssxy/($ssxx*$ssyy)); }
2572
2573 return $correl;
2574 }
2575 }
2576 }
2577 # Complement bases (for the sequence context)
2578 sub complement
2579 {
2580 if($_[0] eq "A") { return "T"; }
2581 if($_[0] eq "C") { return "G"; }
2582 if($_[0] eq "G") { return "C"; }
2583 if($_[0] eq "T") { return "A"; }
2584 }
2585 # Create and write some graphics
2586 sub Create_Graph
2587 {
2588 our ($folderFigure, $filename, $maxValue) = @_;
2589
2590 # Open the connection with R
2591 my $R = Statistics::R->new() or die "Impossible to create a communication bridge with R\n";
2592 $R->startR() ;
2593 # Load the Library
2594 $R->send(q`library(ggplot2)`);
2595 $R->send(q`library(gplots)`);
2596 $R->send(q`library(gtable)`);
2597
2598
2599 $R->send(qq`##########################################
2600 ## OVERALL MUTATION DISTRIBUTION ##
2601 ##########################################
2602 distrMut <- read.table("$folderFigure/Overall_mutation_distribution/$filename/$filename-OverallMutationDistribution.txt", header=T)`);
2603 $R->send(q`# Add the count of each category in the legend
2604 distrMut$Legend[[1]] <- paste0(distrMut$Variant_type[[1]], " (", distrMut$Count[[1]], ")")
2605 distrMut$Legend[[2]] <- paste0(distrMut$Variant_type[[2]], " (", distrMut$Count[[2]], ")")
2606 distrMut$Legend[[3]] <- paste0(distrMut$Variant_type[[3]], " (", distrMut$Count[[3]], ")")`);
2607
2608 $R->send(qq`# Base plot
2609 pie <- ggplot(distrMut, aes(x=factor(""), fill=Legend, weight=Count)) + geom_bar(width=1) + coord_polar(theta="y") + scale_x_discrete("", breaks=NULL) + scale_y_continuous("", breaks=NULL) + labs(fill="")
2610 # Background of the plot entire white
2611 pie <- pie + theme(panel.grid.major = element_line(colour="white"), panel.grid.minor = element_line(colour="white"), panel.background = element_rect(fill="white"))
2612 # Legend on right in 3 rows
2613 pie <- pie + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=3))
2614 # Change the color and the title of the legend
2615 pie <- pie + scale_fill_brewer("Variant type", palette="Set1")
2616 # Remove all the margins
2617 pie <- pie + theme(plot.margin=unit(c(-1, 0, -1.5, 0), "cm"))
2618 # Save the pie chart for the HTML page (higher resolution)
2619 options(bitmapType='cairo') # Use cairo device as isn't possible to install X11 on the server...
2620 png("$folderFigure/Overall_mutation_distribution/$filename/$filename-OverallMutationDistribution.png", width=700, height=1100, res=300)
2621 print(pie)
2622 dev.off()
2623
2624
2625 ##########################################
2626 ## SBS MUTATION DISTRIBUTION ##
2627 ##########################################
2628 distrSBS <- read.delim("$folderFigure/SBS_distribution/$filename/$filename-SBS_distribution.txt")
2629 distrSBS <- data.frame(distrSBS)
2630 bar <- ggplot(distrSBS, aes(x=Mutation_Type, y=Percentage, fill=Mutation_Type))
2631 bar <- bar + geom_bar(stat="identity", width=0.5)
2632 # Theme classic
2633 bar <- bar + theme_classic()
2634 # Remove the axis legend
2635 bar <- bar + xlab("")
2636 # Set the color of the bars and Changing the labels in the legend
2637 bar <- bar + scale_fill_manual(values=c("blue", "black", "red", "gray", "#00CC33", "pink"),
2638 labels=c("C:G>A:T", "C:G>G:C", "C:G>T:A", "T:A>A:T", "T:A>C:G", "T:A>G:C")
2639 )
2640 # Remove the label in x axis
2641 bar <- bar + theme(axis.text.x = element_blank())
2642 # Change the name of the y label
2643 bar <- bar + ylab("Percent")
2644 # Save the plot for the HTML page (higher resolution)
2645 options(bitmapType='cairo')
2646 png("$folderFigure/SBS_distribution/$filename/$filename-SBS_distribution.png", width=1800, height=1500, res=300)
2647 print(bar);
2648 dev.off()
2649 # Save the plot for the report
2650 bar
2651 ggsave("$folder_temp/$filename-SBS_distribution-Report.png")
2652
2653
2654 ##########################################
2655 ## IMPACT ON PROTEIN ##
2656 ##########################################
2657 impactProt <- read.table("$folderFigure/Impact_protein_sequence/$filename/$filename-DistributionExoFunc.txt", header=T)
2658 # Custom palette: black, orange, dark green, yellow, light blue, dark blue, darkslateblue, red, purple, pink, light green, turquoise, gray
2659 cb_palette <- c("#000000", "#E69F00", "#006600", "#660000", "#F0E442", "#56B4E9", "#3300FF", "#483D8B", "#FF0000", "#9900CC", "#FF66CC", "#00CC00", "#66FFFF", "#C0C0C0")
2660 pie <- ggplot(impactProt, aes(x=factor(""), fill=AA_Change, weight=Count)) + geom_bar(width=1) + coord_polar(theta="y") + scale_x_discrete("", breaks=NULL)+ scale_y_continuous("", breaks=NULL) + scale_fill_manual(values=cb_palette)
2661 # Background of the plot entire white
2662 pie <- pie + theme(panel.grid.major = element_line(colour="white"), panel.grid.minor = element_line(colour="white"), panel.background = element_rect(fill="white"))
2663 # Legend in two column
2664 pie <- pie + guides(fill=guide_legend(ncol=2)) + theme(legend.position="bottom")
2665 # Remove the legend title
2666 pie <- pie + labs(fill="")
2667 # Save the plot for the HTML page (higher resolution)
2668 options(bitmapType='cairo')
2669 png("$folderFigure/Impact_protein_sequence/$filename/$filename-DistributionExoFunc.png", width=1600, height=1800, res=300)
2670 print(pie)
2671 dev.off()
2672 # Save the plot for the report
2673 pie
2674 ggsave("$folder_temp/$filename-DistributionExoFunc-Report.png")
2675
2676
2677 ##########################################
2678 ## STRAND BIAS ##
2679 ##########################################
2680 cb_palette_SB <- c("#0072B2", "#CC0000")
2681 file_sb <- read.table("$folderFigure/Stranded_Analysis/$filename/$filename-StrandBias.txt", header=T);
2682 p_sb <- ggplot(file_sb, aes(x=Alteration, y=Count, fill=Strand)) + theme_classic() + geom_bar(stat="identity", position="dodge") + scale_fill_manual(values=cb_palette_SB) + theme(axis.text.x = element_text(angle=60, hjust=1)) + xlab("") + theme(legend.position="bottom")
2683 # Save the plot for the HTML page (higher resolution)
2684 options(bitmapType='cairo')
2685 png("$folderFigure/Stranded_Analysis/$filename/$filename-StrandBias.png", width=1000, height=1200, res=300)
2686 print(p_sb)
2687 dev.off()
2688 # Save the plot for the report
2689 p_sb
2690 ggsave("$folder_temp/$filename-StrandBias-Report.png")
2691
2692
2693 ##########################################
2694 ## HEATMAP SEQUENCE CONTEXT ##
2695 ## GENOMIC STRAND ##
2696 ##########################################
2697 ## COUNT
2698 heatmap_G <- read.table("$folderFigure/Trinucleotide_Sequence_Context/$filename/$filename-HeatmapCount-Genomic.txt", header=T)
2699 # Save the plot for the report
2700 options(bitmapType='cairo')
2701 png(filename="$folder_temp/$filename-HeatmapCount-Genomic-Report.png", bg="transparent", width=240, height=360)
2702 # Heatmap with an absolute scale
2703 heatmap.2(as.matrix(heatmap_G),Rowv=F,Colv=F,col=colorpanel(384,low="yellow",high="red"),dendrogram="none",scale="none",trace="none",key=F,labRow=rownames(as.matrix(heatmap_G)),labCol=colnames(as.matrix(heatmap_G)),lmat=rbind(c(5,1,4),c(3,1,2)), lhei=c(0.75,0.75),lwid=c(0.5,1.5,0.5))
2704 dev.off()
2705 # Save the plot for the HTML page (higher resolution)
2706 options(bitmapType='cairo')
2707 png(filename="$folderFigure/Trinucleotide_Sequence_Context/$filename/$filename-HeatmapCount-Genomic.png", width=1100, height=1600, res=300)
2708 heatmap.2(as.matrix(heatmap_G),Rowv=F,Colv=F,col=colorpanel(384,low="yellow",high="red"),dendrogram="none",scale="none",trace="none",key=F,labRow=rownames(as.matrix(heatmap_G)),labCol=colnames(as.matrix(heatmap_G)),lmat=rbind(c(5,1,4),c(3,1,2)), lhei=c(0.75,0.75),lwid=c(0.5,1.5,0.5))
2709 dev.off()
2710
2711 ## PERCENT
2712 heatmap_G <- read.table("$folderFigure/Trinucleotide_Sequence_Context/$filename/$filename-HeatmapPercent-Genomic.txt", header=T)
2713 # Save the plot for the report
2714 options(bitmapType='cairo')
2715 png(filename="$folder_temp/$filename-HeatmapPercent-Genomic-Report.png",bg="transparent", width=240, height=360)
2716 # Heatmap with an absolute scale
2717 heatmap.2(as.matrix(heatmap_G),Rowv=F,Colv=F,col=colorpanel(384,low="yellow",high="red"),dendrogram="none",scale="none",trace="none",key=F,labRow=rownames(as.matrix(heatmap_G)),labCol=colnames(as.matrix(heatmap_G)),lmat=rbind(c(5,1,4),c(3,1,2)), lhei=c(0.75,0.75),lwid=c(0.5,1.5,0.5))
2718 dev.off()
2719 # Save the plot for the HTML page (higher resolution)
2720 options(bitmapType='cairo')
2721 png(filename="$folderFigure/Trinucleotide_Sequence_Context/$filename/$filename-HeatmapPercent-Genomic.png", width=1100, height=1600, res=300)
2722 heatmap.2(as.matrix(heatmap_G),Rowv=F,Colv=F,col=colorpanel(384,low="yellow",high="red"),dendrogram="none",scale="none",trace="none",key=F,labRow=rownames(as.matrix(heatmap_G)),labCol=colnames(as.matrix(heatmap_G)),lmat=rbind(c(5,1,4),c(3,1,2)), lhei=c(0.75,0.75),lwid=c(0.5,1.5,0.5))
2723 dev.off()`);
2724 $R->stopR() ;
2725
2726 ## Plot the transcriptional strand bias in mutation signature
2727 `Rscript $pathRScriptTxnSB $folderFigure/Stranded_Analysis/$filename/$filename-StrandedSignatureCount.txt $folderFigure/Stranded_Analysis/$filename/$filename-StrandedSignatureCount $folder_temp/$filename-StrandedSignatureCount Count 2>&1`;
2728 `Rscript $pathRScriptTxnSB $folderFigure/Stranded_Analysis/$filename/$filename-StrandedSignaturePercent.txt $folderFigure/Stranded_Analysis/$filename/$filename-StrandedSignaturePercent $folder_temp/$filename-StrandedSignaturePercent Percent 2>&1`;
2729 }
2730 # Write the titles of the different sections of the report
2731 sub WriteBoderSection
2732 {
2733 our ($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext) = @_;
2734
2735 our ($format_topLeft, $format_topRight, $format_bottomLeft, $format_bottomRight, $format_top, $format_right, $format_bottom, $format_left);
2736 Format_section($wb, \$format_topLeft, \$format_topRight, \$format_bottomLeft, \$format_bottomRight, \$format_top, \$format_right, \$format_bottom, \$format_left);
2737
2738 TableSBSDistrBySeg();
2739 TableStrandBiasBySegment();
2740 CountSBSPerChr();
2741 ShortTriNtContext(); # 6 mut type
2742 LongTriNtContext(); # 12 mut type
2743
2744 sub TableSBSDistrBySeg
2745 {
2746 # Top-Left
2747 $ws->write($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, "Table 4. SBS distribution by functional region", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg, 18); # Set the height of the row to 0.25"
2748 # Top
2749 for(my $i=1; $i<=13; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+$i, $format_top); }
2750 # Top-Right
2751 $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $format_topRight);
2752 # Right
2753 $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg+13, $format_right);
2754 # Bottom-left
2755 $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+5, $colStart_SBSdistrBySeg, $format_bottomLeft);
2756 # Left
2757 $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg, $format_left);
2758 }
2759
2760 sub TableStrandBiasBySegment
2761 {
2762 # Top-Left
2763 $ws->write($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg, "Table 5. Strand bias by functional region", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg+$nb_func+8, 18); # Set the height of the row to 0.25"
2764 # Top
2765 for(my $i=1; $i<=10; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg+$i, $format_top); }
2766 # Top-Right
2767 $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg+11, $format_topRight);
2768 # Right
2769 $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+9, $colStart_SBSdistrBySeg+11, $format_right); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+13, $colStart_SBSdistrBySeg+11, $format_right);
2770 # Left
2771 $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+9, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+10, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+13, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+14, $colStart_SBSdistrBySeg, $format_left);
2772 # Bottom
2773 $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*3)+16, $colStart_SBSdistrBySeg+4, $format_bottom); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*3)+16, $colStart_SBSdistrBySeg+8, $format_bottom);
2774 }
2775
2776 sub CountSBSPerChr
2777 {
2778 #### Top-Left
2779 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg, "Table 6. SBS distribution per chromosome", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, 18); # Set the height of the row to 0.25"
2780 #### Top
2781 for(my $i=1; $i<8; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg+$i, $format_top); }
2782 #### Top-Right
2783 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg+8, $format_topRight);
2784 #### Right
2785 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+5, $colStart_SBSdistrBySeg+8, $format_right); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+6, $colStart_SBSdistrBySeg+8, $format_right);
2786
2787 #### Bottom-Right
2788 # Human genome = 24 chromosomes
2789 if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+8, $format_bottomRight); }
2790 # Mouse genome = 21 chromosomes
2791 if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+8, $format_bottomRight); }
2792 # Rat genome = 22 chromosomes
2793 if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+8, $format_bottomRight); }
2794
2795 #### Bottom
2796 if($refGenome =~ /hg/)
2797 {
2798 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+1, $format_bottom);
2799 for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+$i, $format_bottom); }
2800 }
2801 if($refGenome =~ /mm/)
2802 {
2803 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+1, $format_bottom);
2804 for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+$i, $format_bottom); }
2805 }
2806 if($refGenome =~ /rn/)
2807 {
2808 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+1, $format_bottom);
2809 for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+$i, $format_bottom); }
2810 }
2811
2812 #### Left
2813 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+5, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+6, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+7, $colStart_SBSdistrBySeg, $format_left);
2814
2815 #### Bottom-left
2816 if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg, $format_bottomLeft); }
2817 if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg, $format_bottomLeft); }
2818 if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg, $format_bottomLeft); }
2819 }
2820
2821 sub ShortTriNtContext
2822 {
2823 my $format_headerSection = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
2824 $format_headerSection->set_left(2); $format_headerSection->set_left_color('blue');
2825
2826 # Top-left
2827 $ws->write(0, $colStart_matrixSeqContext, "Panel 1. Trinucleotide sequence context of SBS on the genomic sequence", $format_topLeft);
2828 # Top
2829 for(my $i=1; $i<=19; $i++) { $ws->write_blank(0, $colStart_matrixSeqContext+$i, $format_top); }
2830 # Top-right
2831 $ws->write_blank(0, $colStart_matrixSeqContext+20, $format_topRight);
2832 # Right
2833 for(my $i=1; $i<=37; $i++) { $ws->write_blank($i, $colStart_matrixSeqContext+20, $format_right); }
2834 # Bottom-right
2835 $ws->write_blank(37, $colStart_matrixSeqContext+20, $format_bottomRight);
2836 # Bottom
2837 for(my $i=1; $i<=19; $i++) { $ws->write_blank(38, $colStart_matrixSeqContext+$i, $format_top); }
2838 # Bottom-left
2839 $ws->write_blank(37, $colStart_matrixSeqContext, $format_bottomLeft);
2840 # Left
2841 $ws->write(1, $colStart_matrixSeqContext, "", $format_left);
2842 for(my $i=3; $i<=36; $i++) { $ws->write_blank($i, $colStart_matrixSeqContext, $format_left); }
2843 }
2844
2845 sub LongTriNtContext
2846 {
2847 # Top-left
2848 $ws->write($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext, "Panel 2. Stranded analysis of trinucleotide sequence context of SBS", $format_topLeft);
2849 # Top
2850 for(my $i=1; $i<=28; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext+$i, $format_top); }
2851 # Top-right
2852 $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext+29, $format_topRight);
2853 # Right
2854 for(my $i=1; $i<=42; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+$i, $colStart_matrixSeqContext+29, $format_right); }
2855 # Bottom-right
2856 $ws->write_blank(91, $colStart_matrixSeqContext+29, $format_bottomRight);
2857 # Bottom
2858 for(my $i=13; $i<=28; $i++) { $ws->write_blank(92, $colStart_matrixSeqContext+$i, $format_top); }
2859 # Bottom-left
2860 $ws->write_blank(91, $colStart_matrixSeqContext, $format_bottomLeft);
2861 # Left
2862 $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext, $format_left);
2863 $ws->write_blank($rowStart_SBSdistrBySeg+22, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+23, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext, $format_left);
2864 }
2865 }
2866 # Write the header for the six mutation types
2867 sub WriteHeaderSection
2868 {
2869 our ($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext) = @_;
2870
2871 our ($format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG, $format_TG2, $format_LeftHeader, $format_RightHeader, $format_LeftHeader2);
2872 Format_Header($wb, \$format_CA, \$format_CG, \$format_CT, \$format_TA, \$format_TC, \$format_TG, \$format_TG2, \$format_LeftHeader, \$format_RightHeader, \$format_LeftHeader2);
2873
2874 our ($format_LeftCA, $format_LeftCG, $format_LeftCT, $format_LeftTA, $format_LeftTC, $format_LeftTG, $format_RightCA, $format_RightCG, $format_RightCT, $format_RightTA, $format_RightTC, $format_RightTG);
2875 Format_HeaderSBSDistrBySegAndFunc($wb, \$format_LeftCA, \$format_LeftCG, \$format_LeftCT, \$format_LeftTA, \$format_LeftTC, \$format_LeftTG, \$format_RightCA, \$format_RightCG, \$format_RightCT, \$format_RightTA, \$format_RightTC, \$format_RightTG);
2876
2877 our $format_A11Bold = ""; Format_A11Bold($wb, \$format_A11Bold); # Arial 11 bold and center
2878 our $format_A11BoldLeft = ""; Format_A11BoldLeft($wb, \$format_A11BoldLeft); # Arial 11 bold and left
2879
2880 our ($format_header12CA, $format_header12CG, $format_header12CT, $format_header12TA, $format_header12TC, $format_header12TG);
2881 Format_Header12MutType($wb, \$format_header12CA, \$format_header12CG, \$format_header12CT, \$format_header12TA, \$format_header12TC, \$format_header12TG);
2882
2883 ## Header for SBS distribution by segment
2884 HeaderMutTypeSBSDistrBySeg();
2885
2886 ## Header for strand bias by function
2887 $ws->set_column($colStart_SBSdistrBySeg+5, $colStart_SBSdistrBySeg+5, 11);
2888
2889 my $row = $rowStart_SBSdistrBySeg+$nb_func+10; my $col = $colStart_SBSdistrBySeg;
2890 $ws->write($row, $col+1, ' ', $format_CA); $ws->write($row, $col+2, "C>A", $format_CA); $ws->write($row, $col+3, ' ', $format_CA);
2891 $ws->write($row, $col+5, ' ', $format_CG); $ws->write($row, $col+6, "C>G", $format_CG); $ws->write($row, $col+7, ' ', $format_CG);
2892 $ws->write($row, $col+9, ' ', $format_CT); $ws->write($row, $col+10, "C>T", $format_CT); $ws->write($row, $col+11, ' ', $format_RightCT);
2893
2894 $row = $rowStart_SBSdistrBySeg+($nb_func*2)+14;
2895 $ws->write($row, $col+1, ' ', $format_TA); $ws->write($row, $col+2, "T>A", $format_TA); $ws->write($row, $col+3, ' ', $format_TA);
2896 $ws->write($row, $col+5, ' ', $format_TC); $ws->write($row, $col+6, "T>C", $format_TC); $ws->write($row, $col+7, ' ', $format_TC);
2897 $ws->write($row, $col+9, ' ', $format_TG2); $ws->write($row, $col+10, "T>G", $format_TG2); $ws->write($row, $col+11, ' ', $format_RightTG);
2898
2899 $ws->set_row($rowStart_SBSdistrBySeg+$nb_func+11, 18); $ws->set_row($rowStart_SBSdistrBySeg+($nb_func*2)+15, 18);
2900 $ws->set_column($colStart_SBSdistrBySeg+5, $colStart_SBSdistrBySeg+5, 13); $ws->set_column($colStart_SBSdistrBySeg+9, $colStart_SBSdistrBySeg+9, 13);
2901
2902 for(my $i=$rowStart_SBSdistrBySeg+$nb_func+10; $i<=$rowStart_SBSdistrBySeg+($nb_func*2)+14; $i+=$nb_func+4)
2903 {
2904 $ws->write($i+1, $colStart_SBSdistrBySeg, 'Segment', $format_LeftHeader); $ws -> write($i+1, $colStart_SBSdistrBySeg+1, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+2, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+3, 'Tr', $format_A11Bold);
2905 $ws -> write($i+1, $colStart_SBSdistrBySeg+5, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+6, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+7, 'Tr', $format_A11Bold);
2906 $ws -> write($i+1, $colStart_SBSdistrBySeg+9, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+10, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+11, 'Tr', $format_RightHeader);
2907 }
2908
2909
2910 ## Header for Counts of SBS per chromosome and mutation type
2911 HeaderCountSBSPerChr();
2912
2913 ## Header for the short sequence context
2914 HeaderShortTriNtContext();
2915
2916 ## Header for the 12 mutation types with the sequence context (coding strand)
2917 HeaderLongTriNtContext();
2918
2919 sub HeaderMutTypeSBSDistrBySeg
2920 {
2921 $ws->set_row($rowStart_SBSdistrBySeg+2, 18);
2922 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+2, "C:G>A:T", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+3, $format_CA);
2923 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+4, "C:G>G:C", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+5, $format_CG);
2924 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+6, "C:G>T:A", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+7, $format_CT);
2925 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+8, "T:A>A:T", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+9, $format_TA);
2926 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+10, "T:A>C:G", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+11, $format_TC);
2927 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+12, "T:A>G:C", $format_TG); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+13, $format_TG);
2928
2929 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg, "Segment", $format_LeftHeader); $ws->set_column($colStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, 13); $ws->set_row($rowStart_SBSdistrBySeg+3, 18);
2930 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+1, "Total SBS", $format_A11Bold); $ws->set_column($colStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg+1, 11);
2931 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+2, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+3, "#", $format_A11Bold);
2932 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+4, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+5, "#", $format_A11Bold);
2933 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+6, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+7, "#", $format_A11Bold);
2934 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+8, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+9, "#", $format_A11Bold);
2935 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+10, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+11, "#", $format_A11Bold);
2936 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+12, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, 13, "#", $format_RightHeader);
2937 }
2938
2939 sub HeaderCountSBSPerChr
2940 {
2941 $ws->set_column(3,3, 10); $ws->set_column(4,4, 10);
2942 $ws->set_row($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, 18);
2943 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+7, $colStart_SBSdistrBySeg+1, "Pearson", $format_A11Bold);
2944 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg, "Chr", $format_LeftHeader);
2945 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+1, "Size", $format_A11Bold);
2946 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+2, "All SBS", $format_A11Bold);
2947
2948 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+3, "C:G>A:T", $format_CA);
2949 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+4, "C:G>G:C", $format_CG);
2950 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+5, "C:G>T:A", $format_CT);
2951 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+6, "T:A>A:T", $format_TA);
2952 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+7, "T:A>C:G", $format_TC);
2953 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+8, "T:A>G:C", $format_TG);
2954 }
2955
2956 sub HeaderShortTriNtContext
2957 {
2958 ### GENOMIC STRAND
2959 $ws->write(2, $colStart_matrixSeqContext, 'Count matrix', $format_LeftHeader2);
2960 $ws->write(3, $colStart_matrixSeqContext+4, 'C>A', $format_CA); $ws->write(3, $colStart_matrixSeqContext+5, 'C>G', $format_CG); $ws->write(3, $colStart_matrixSeqContext+6, 'C>T', $format_CT); $ws->write(3, $colStart_matrixSeqContext+7, 'T>A', $format_TA); $ws->write(3, $colStart_matrixSeqContext+8, 'T>C', $format_TC); $ws->write(3, $colStart_matrixSeqContext+9, 'T>G', $format_TG2);
2961
2962 $ws->write(2, $colStart_matrixSeqContext+11, 'Frequency matrix', $format_A11BoldLeft);
2963 $ws->write(3, $colStart_matrixSeqContext+14, 'C>A', $format_CA); $ws->write(3, $colStart_matrixSeqContext+15, 'C>G', $format_CG); $ws->write(3, $colStart_matrixSeqContext+16, 'C>T', $format_CT); $ws->write(3, $colStart_matrixSeqContext+17, 'T>A', $format_TA); $ws->write(3, $colStart_matrixSeqContext+18, 'T>C', $format_TC); $ws->write(3, $colStart_matrixSeqContext+19, 'T>G', $format_TG2);
2964
2965 ### sequence context with a bar graph
2966 $ws->write(25, $colStart_matrixSeqContext+10, "Mutation spectra frequency", $format_A11Bold);
2967 }
2968
2969 sub HeaderLongTriNtContext
2970 {
2971 $ws->set_row($rowStart_SBSdistrBySeg+3, 15); $ws->set_row($rowStart_SBSdistrBySeg+4, 15); $ws->set_row($rowStart_SBSdistrBySeg+5, 15);
2972 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_matrixSeqContext, "Count matrix", $format_LeftHeader2);
2973 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+1, "C>A", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+2, $format_CA); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+1, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+2, "Tr", $format_A11Bold);
2974 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+3, "C>G", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+4, $format_CG); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+3, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+4, "Tr", $format_A11Bold);
2975 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+5, "C>T", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+6, $format_CT); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+5, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+6, "Tr", $format_A11Bold);
2976 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+7, "T>A", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+8, $format_TA); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+7, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+8, "Tr", $format_A11Bold);
2977 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+9, "T>C", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+10, $format_TC); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+9, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+10, "Tr", $format_A11Bold);
2978 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+11, "T>G", $format_TG2); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+12, $format_TG2); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+11, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+12, "Tr", $format_A11Bold);
2979
2980
2981 $ws->set_row($rowStart_SBSdistrBySeg+24, 15); $ws->set_row($rowStart_SBSdistrBySeg+25, 15); $ws->set_row($rowStart_SBSdistrBySeg+26, 15);
2982 $ws->write($rowStart_SBSdistrBySeg+24, $colStart_matrixSeqContext, "Frequency matrix", $format_LeftHeader2);
2983 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+1, "C>A", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+2, $format_CA); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+1, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+2, "Tr", $format_A11Bold);
2984 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+3, "C>G", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+4, $format_CG); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+3, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+4, "Tr", $format_A11Bold);
2985 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+5, "C>T", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+6, $format_CT); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+5, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+6, "Tr", $format_A11Bold);
2986 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+7, "T>A", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+8, $format_TA); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+7, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+8, "Tr", $format_A11Bold);
2987 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+9, "T>C", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+10, $format_TC); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+9, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+10, "Tr", $format_A11Bold);
2988 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+11, "T>G", $format_TG2); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+12, $format_TG2); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+11, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+12, "Tr", $format_A11Bold);
2989 }
2990 }
2991 # Create logo for representing the sequence context with n bases
2992 sub CreateLogo
2993 {
2994 my ($refH_file, $folderWebLogo) = @_;
2995
2996 my $folderSample = "";
2997
2998 foreach my $k_file (sort keys $refH_file)
2999 {
3000 $folderSample = "$folderWebLogo/$k_file";
3001 if(!-e $folderSample) { mkdir($folderSample) or die "Can't create the directory $folderSample\n"; }
3002
3003 my $test_lengthSeqContext = 0;
3004
3005 foreach my $k_mutation (sort keys $refH_file->{$k_file}{'WebLogo3'})
3006 {
3007 open(WEBLOGO, ">", "$folderSample/$k_file-$k_mutation.fa") or die "$!: $folderSample/$k_file-$k_mutation.fa\n";
3008 foreach (@{$refH_file->{$k_file}{'WebLogo3'}{$k_mutation}})
3009 {
3010 print WEBLOGO ">$k_file\n$_\n";
3011
3012 if(length($_) < 10) { $test_lengthSeqContext = 0; }
3013 else { $test_lengthSeqContext = 1; }
3014 }
3015 close WEBLOGO;
3016 }
3017
3018 ## Generate the logo
3019 foreach my $fastaFile (`ls $folderSample/*.fa`)
3020 {
3021 chomp($fastaFile);
3022 my ($filename, $directories, $suffix) = fileparse("$folderSample/$fastaFile", qr/\.[^.]*/);
3023
3024 $filename =~ /(.+)\-/;
3025 my $title = $1;
3026
3027 ## Test if there is fasta sequence for the mutation type
3028 my $nbLigne_temp = `wc -l $fastaFile`;
3029 my @nbLigne = split(" ", $nbLigne_temp);
3030
3031 if($nbLigne[0] == 0) { print "WARNING: No sequence for $filename\n"; next; }
3032
3033 # When length sequence context is lower than 10 the image is to small for adding a title
3034 if($test_lengthSeqContext == 1) { system("weblogo -c classic -F png -U probability --title $title < $fastaFile > $folderSample/$filename-Probability.png"); }
3035 else { system("weblogo -c classic -F png -U probability < $fastaFile > $folderSample/$filename-Probability.png"); }
3036 }
3037 }
3038 }
3039
3040
3041 # Define the format of the worksheet: Arial font size=10
3042 sub Format_A10
3043 {
3044 my ($wb, $format) = @_;
3045 $$format = $wb->add_format(font=>'Arial', size=>10); $$format->set_align('center');
3046 }
3047 # Define the format of the worksheet: Arial font size=11 bold and center
3048 sub Format_A11Bold
3049 {
3050 my ($wb, $format) = @_;
3051 $$format = $wb->add_format(font=>'Arial', size=>11, bold=>1); $$format->set_align('center');
3052 }
3053 # Define the format of the worksheet: Arial font size=10 italic red and center
3054 sub Format_A10ItalicRed
3055 {
3056 my ($wb, $format) = @_;
3057 $$format = $wb->add_format(font=>'Arial', size=>10, italic=>1, color => 'red'); $$format->set_align('center');
3058 }
3059 # Defile the format of the worksheet: Arialt font size=11 bold and left
3060 sub Format_A11BoldLeft
3061 {
3062 my ($wb, $format) = @_;
3063 $$format = $wb->add_format(valign =>'left', font=>'Arial', size=>11, bold=>1);
3064 }
3065 # Defile the format of the worksheet: Arialt font size=10 bold and left
3066 sub Format_A10BoldLeft
3067 {
3068 my ($wb, $format) = @_;
3069 $$format = $wb->add_format(valign =>'left', font=>'Arial', size=>10, bold=>1);
3070 }
3071 # Define the format of the border of the section (for delimiting the different section of the report)
3072 sub Format_section
3073 {
3074 my ($wb, $format_topLeft, $format_topRight, $format_bottomLeft, $format_bottomRight, $format_top, $format_right, $format_bottom, $format_left) = @_;
3075
3076 $$format_topLeft = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
3077 $$format_topLeft->set_top(2); $$format_topLeft->set_top_color('blue');
3078 $$format_topLeft->set_left(2); $$format_topLeft->set_left_color('blue');
3079
3080 $$format_topRight = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
3081 $$format_topRight->set_top(2); $$format_topRight->set_top_color('blue');
3082 $$format_topRight->set_right(2); $$format_topRight->set_right_color('blue');
3083
3084 $$format_bottomLeft = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
3085 $$format_bottomLeft->set_bottom(2); $$format_bottomLeft->set_bottom_color('blue');
3086 $$format_bottomLeft->set_left(2); $$format_bottomLeft->set_left_color('blue');
3087
3088 $$format_bottomRight = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
3089 $$format_bottomRight->set_bottom(2); $$format_bottomRight->set_bottom_color('blue');
3090 $$format_bottomRight->set_right(2); $$format_bottomRight->set_right_color('blue');
3091
3092 $$format_top = $wb->add_format(); $$format_top->set_top(2); $$format_top->set_top_color('blue');
3093 $$format_right = $wb->add_format(); $$format_right->set_right(2); $$format_right->set_right_color('blue');
3094 $$format_bottom = $wb->add_format(); $$format_bottom->set_bottom(2); $$format_bottom->set_bottom_color('blue');
3095 $$format_left = $wb->add_format(); $$format_left->set_left(2); $$format_left->set_left_color('blue');
3096 }
3097 # Define the header
3098 sub Format_Header
3099 {
3100 my ($wb, $format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG, $format_TG2, $format_LeftHeader, $format_RightHeader, $format_LeftHeader2) = @_;
3101
3102 my ($blue, $black, $red, $gray, $green, $pink);
3103 Color($wb, \$blue, \$black, \$red, \$gray, \$green, \$pink);
3104
3105 my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink);
3106 BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink);
3107
3108
3109 $$format_CA = $wb->add_format(bg_color => $blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CA->set_align('center'); $$format_CA->set_center_across();
3110 $$format_CG = $wb->add_format(bg_color => $black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CG->set_align('center'); $$format_CG->set_center_across();
3111 $$format_CT = $wb->add_format(bg_color => $red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CT->set_align('center'); $$format_CT->set_center_across();
3112 $$format_TA = $wb->add_format(bg_color => $gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TA->set_align('center'); $$format_TA->set_center_across();
3113 $$format_TC = $wb->add_format(bg_color => $green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TC->set_align('center'); $$format_TC->set_center_across();
3114 $$format_TG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG->set_align('center'); $$format_TG->set_center_across();
3115 $$format_TG->set_right(2); $$format_TG->set_right_color('blue');
3116
3117 $$format_TG2 = $wb->add_format(bg_color => $pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG2->set_align('center'); $$format_TG2->set_center_across();
3118
3119 $$format_LeftHeader = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_LeftHeader->set_align('center'); $$format_LeftHeader->set_left(2); $$format_LeftHeader->set_left_color('blue');
3120 $$format_LeftHeader2 = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_LeftHeader2->set_left(2); $$format_LeftHeader2->set_left_color('blue');
3121 $$format_RightHeader = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_RightHeader->set_align('center'); $$format_RightHeader->set_right(2); $$format_RightHeader->set_right_color('blue');
3122 }
3123 # Define the mutation type header for the Strand bias by segment
3124 sub Format_HeaderSBSDistrBySegAndFunc
3125 {
3126 my ($wb, $format_LeftCA, $format_LeftCG, $format_LeftCT, $format_LeftTA, $format_LeftTC, $format_LeftTG, $format_RightCA, $format_RightCG, $format_RightCT, $format_RightTA, $format_RightTC, $format_RightTG) = @_;
3127
3128 my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink);
3129 BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink);
3130
3131 $$format_LeftCA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCA->set_align('center'); $$format_LeftCA->set_left(2); $$format_LeftCA->set_left_color('blue');
3132 $$format_LeftCG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCG->set_align('center'); $$format_LeftCG->set_left(2); $$format_LeftCG->set_left_color('blue');
3133 $$format_LeftCT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCT->set_align('center'); $$format_LeftCT->set_left(2); $$format_LeftCT->set_left_color('blue');
3134 $$format_LeftTA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTA->set_align('center'); $$format_LeftTA->set_left(2); $$format_LeftTA->set_left_color('blue');
3135 $$format_LeftTC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTC->set_align('center'); $$format_LeftTC->set_left(2); $$format_LeftTC->set_left_color('blue');
3136 $$format_LeftTG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTG->set_align('center'); $$format_LeftTG->set_left(2); $$format_LeftTG->set_left_color('blue');
3137
3138
3139 $$format_RightCA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCA->set_align('center'); $$format_RightCA->set_right(2); $$format_RightCA->set_right_color('blue');
3140 $$format_RightCG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCG->set_align('center'); $$format_RightCG->set_right(2); $$format_RightCG->set_right_color('blue');
3141 $$format_RightCT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCT->set_align('center'); $$format_RightCT->set_right(2); $$format_RightCT->set_right_color('blue');
3142 $$format_RightTA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTA->set_align('center'); $$format_RightTA->set_right(2); $$format_RightTA->set_right_color('blue');
3143 $$format_RightTC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTC->set_align('center'); $$format_RightTC->set_right(2); $$format_RightTC->set_right_color('blue');
3144 $$format_RightTG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTG->set_align('center'); $$format_RightTG->set_right(2); $$format_RightTG->set_right_color('blue');
3145 }
3146 # Define the mutation type header for the trinucleotide sequence context on the coding strand
3147 sub Format_Header12MutType
3148 {
3149 my ($wb, $format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG) = @_;
3150
3151 my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink);
3152 BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink);
3153
3154 $$format_CA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CA->set_align('center');
3155 $$format_CG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CG->set_align('center');
3156 $$format_CT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CT->set_align('center');
3157 $$format_TA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TA->set_align('center');
3158 $$format_TC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TC->set_align('center');
3159 $$format_TG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG->set_align('center');
3160 }
3161 # Define the format for the text that needs a section border
3162 sub Format_TextSection
3163 {
3164 my ($wb, $formatT_left, $formatT_right, $formatT_bottomRight, $formatT_bottomLeft, $formatT_bottom, $formatT_bottomHeader, $formatT_bottomRightHeader, $formatT_bottomHeader2, $formatT_rightHeader) = @_;
3165
3166 $$formatT_left = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
3167 $$formatT_left->set_left(2); $$formatT_left->set_left_color('blue');
3168
3169 $$formatT_right = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
3170 $$formatT_right->set_right(2); $$formatT_right->set_right_color('blue');
3171
3172 $$formatT_bottomRight = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
3173 $$formatT_bottomRight->set_bottom(2); $$formatT_bottomRight->set_bottom_color('blue');
3174 $$formatT_bottomRight->set_right(2); $$formatT_bottomRight->set_right_color('blue');
3175
3176 $$formatT_bottomLeft = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
3177 $$formatT_bottomLeft->set_bottom(2); $$formatT_bottomLeft->set_bottom_color('blue');
3178 $$formatT_bottomLeft->set_left(2); $$formatT_bottomLeft->set_left_color('blue');
3179
3180 $$formatT_bottom = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
3181 $$formatT_bottom->set_bottom(2); $$formatT_bottom->set_bottom_color('blue');
3182
3183 my $bgColor_totallighGray = $wb->set_custom_color(54, 230, 230, 230);
3184 $$formatT_bottomHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomHeader->set_align('center');
3185 $$formatT_bottomHeader->set_bottom(2); $$formatT_bottomHeader->set_bottom_color('blue');
3186
3187 $$formatT_bottomRightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomRightHeader->set_align('center');
3188 $$formatT_bottomRightHeader->set_bottom(2); $$formatT_bottomRightHeader->set_bottom_color('blue');
3189 $$formatT_bottomRightHeader->set_right(2); $$formatT_bottomRightHeader->set_right_color('blue');
3190
3191 $$formatT_bottomHeader2 = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomHeader2->set_align('center');
3192
3193 $$formatT_rightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_rightHeader->set_align('center');
3194 $$formatT_rightHeader->set_right(2); $$formatT_rightHeader->set_right_color('blue');
3195 }
3196 # Define the format for the graphs titles
3197 sub Format_GraphTitle
3198 {
3199 my ($wb, $formatT_graphTitle) = @_;
3200
3201 $$formatT_graphTitle = $wb->add_format(font=>'Arial', size=>12, bold=>1);
3202 }
3203 # Define the format of the border of the tables
3204 sub Format_Table
3205 {
3206 my ($wb, $table_topleft, $table_topRight, $table_bottomleft, $table_bottomRight, $table_top, $table_right, $table_bottom, $table_bottomItalicRed, $table_left, $table_bottomrightHeader, $table_left2, $table_middleHeader, $table_middleHeader2) = @_;
3207
3208 $$table_topleft = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_topleft->set_top(1); $$table_topleft->set_left(1);
3209 $$table_topRight = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_topRight->set_top(1); $$table_topRight->set_right(1);
3210 $$table_bottomleft = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_bottomleft->set_bottom(1); $$table_bottomleft->set_left(1);
3211 $$table_bottomRight = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_bottomRight->set_bottom(1); $$table_bottomRight->set_right(1);
3212
3213 $$table_top = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_top->set_top(1);
3214 $$table_right = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_right->set_right(1);
3215 $$table_bottom = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_bottom->set_bottom(1);
3216 $$table_bottomItalicRed = $wb->add_format(valign=>'center', font=>'Arial', size=>10, italic=>1, color => 'red'); $$table_bottomItalicRed->set_bottom(1);
3217 $$table_left = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_left->set_left(1);
3218
3219 my $bgColor_totallighGray = $wb->set_custom_color(54, 230, 230, 230);
3220 $$table_bottomrightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_bottomrightHeader->set_bottom(1); $$table_bottomrightHeader->set_right(1);
3221
3222 $$table_left2 = $wb->add_format(valign=>'left', font=>'Arial', size=>10); $$table_left2->set_left(1);
3223
3224 $$table_middleHeader = $wb->add_format(valign=>'center', bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10);
3225 $$table_middleHeader2 = $wb->add_format(valign=>'center', bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_middleHeader2->set_bottom(1);
3226 }
3227
3228 # Define the color
3229 sub Color
3230 {
3231 my ($wb, $blue, $black, $red, $gray, $green, $pink) = @_;
3232
3233 $$blue = $wb->set_custom_color(40, 0, 0, 204);# C:G>A:T in blue
3234 $$black = $wb->set_custom_color(41, 0, 0, 0);# C:G>G:C in black
3235 $$red = $wb->set_custom_color(42, 255, 0, 0);# C:G>T:A in red
3236 $$gray = $wb->set_custom_color(43, 205, 205, 205); # T:A>A:T in light gray
3237 $$green = $wb->set_custom_color(44, 0, 204, 51);# T:A>C:G in green
3238 $$pink = $wb->set_custom_color(45, 255, 192, 203);# T:A>G:C in pink
3239 }
3240 sub BackgroundColor
3241 {
3242 my ($wb, $bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink) = @_;
3243
3244 $$bgColor_blue = $wb->set_custom_color(48, 0, 0, 204);
3245 $$bgColor_black = $wb->set_custom_color(49, 0, 0, 0);
3246 $$bgColor_red = $wb->set_custom_color(50, 255, 0, 0);
3247 $$bgColor_gray = $wb->set_custom_color(51, 205, 205, 205);
3248 $$bgColor_green = $wb->set_custom_color(52, 0, 204, 51);
3249 $$bgColor_pink = $wb->set_custom_color(53, 255, 192, 203);
3250 }
3251 }
3252
3253
3254 sub recoverNumCol
3255 {
3256 my ($input, $name_of_column) = @_;
3257
3258 open(F1,$input) or die "recoverNumCol: $!: $input\n";
3259 # For having the name of the columns
3260 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
3261 close F1;
3262 # The number of the column
3263 my $name_of_column_NB = "toto";
3264 for(my $i=0; $i<=$#tab_search_header; $i++)
3265 {
3266 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; }
3267 }
3268 if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
3269 else { return $name_of_column_NB; }
3270 }
3271
3272
3273
3274
3275 =head1 NAME
3276
3277 mutSpec-Stat
3278
3279 =head1 SYNOPSIS
3280
3281 mutSpecstat.pl [arguments] <query-file>
3282
3283 <query-file> can be a folder with multiple VCF or a single VCF
3284
3285 Arguments:
3286 -h, --help print help message
3287 -m, --man print complete documentation
3288 -v, --verbose use verbose output
3289 --refGenome the reference genome to use (human, mouse or rat genomes)
3290 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file
3291 -temp --pathTemporary <string> the path for saving the temporary files
3292 --pathSeqRefGenome the path to the fasta reference sequences
3293 --poolData generate the pool of all the samples (optional)
3294 --reportSample generate a report for each sample (optional)
3295
3296
3297 Function: automatically run a pipeline and calculate various statistics on mutations
3298
3299 Example: mutSpecstat.pl --refGenome hg19 --outfile output_directory --temp path_to_temporary_directory --pathRscript path_to_R_scripts --pathSeqRefGenome path_fasta_ref_seq --poolData --reportSample input
3300
3301 Version: 04-2016 (April 2016)
3302
3303
3304 =head1 OPTIONS
3305
3306 =over 8
3307
3308 =item B<--help>
3309
3310 print a brief usage message and detailed explanation of options.
3311
3312 =item B<--man>
3313
3314 print the complete manual of the program.
3315
3316 =item B<--verbose>
3317
3318 use verbose output.
3319
3320 =item B<--refGenome>
3321
3322 the reference genome to use, could be human, mouse or rat genomes.
3323
3324 =item B<--outfile>
3325
3326 the directory of output file names. If it is nor specify the same directory as the input file is used.
3327
3328 =item B<--pathTemporary>
3329
3330 the path for saving temporary files generated by the script.
3331 If any is specify a temporary folder is created in the same directory where the script is running.
3332 Deleted when the script is finish
3333
3334 =item B<--pathSeqRefGenome>
3335
3336 The path to the fasta reference sequences
3337
3338 =item B<--poolData only for the report>
3339
3340 calculate the statistics on the pool of all the data pass in input
3341
3342 =item B<--reportSample only for the report>
3343
3344 generate a report for each samples
3345
3346 =head1 DESCRIPTION
3347
3348 mutSpecstat is a perl script for calculated various statistics on mutations
3349 An Excel report containing the mutation type distribution per functional region, the strand bias and the sequence context on genomic and coding sequence is created.
3350 The different statistics are illustrated using ggplot2.
3351
3352 =cut