7
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 #-----------------------------------#
|
|
4 # Author: Maude #
|
|
5 # Script: mutspecStat.pl #
|
|
6 # Last update: 09/02/17 #
|
|
7 #-----------------------------------#
|
|
8
|
|
9 use strict;
|
|
10 use warnings;
|
|
11 use Getopt::Long;
|
|
12 use Pod::Usage;
|
|
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
|
|
14 use File::Path;
|
|
15 use Spreadsheet::WriteExcel;
|
|
16
|
|
17 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
|
|
18 our ($refGenome, $output, $folder_temp, $path_R_Scripts, $path_SeqrefGenome) = ("empty", "empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path for saving the temporary files; The path to R scripts; The path to the fasta reference sequences
|
|
19 our ($poolData, $oneReportPerSample) = (2, 2); # If a folder is pass as input file pool all the data and generate the report on the pool and for each samples; # Generate one report for each samples
|
|
20
|
|
21
|
|
22 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'outfile|o=s' => \$output, 'temp=s' => \$folder_temp, 'pathRscript=s' => \$path_R_Scripts, 'pathSeqRefGenome=s' => \$path_SeqrefGenome, 'poolData' => \$poolData, 'reportSample' => \$oneReportPerSample) or pod2usage(2);
|
|
23
|
|
24 our ($input) = @ARGV;
|
|
25
|
|
26 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
|
|
27 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
|
|
28 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
|
|
29 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
|
|
30
|
|
31
|
|
32
|
|
33
|
|
34 ####### The input must be a folder with one or several annotated files
|
|
35 if(!-d $input)
|
|
36 {
|
|
37 print STDERR "Error: The input must be a Dataset List\n";
|
|
38 print STDERR "Even for 1 file, please create a Dataset List\n";
|
|
39 exit;
|
|
40 }
|
|
41
|
|
42
|
|
43 ######################################################################################################################################################
|
|
44 # GLOBAL VARIABLES #
|
|
45 ######################################################################################################################################################
|
|
46 # Recover the current path
|
|
47 our $pwd = `pwd`;
|
|
48 chomp($pwd);
|
|
49
|
|
50
|
|
51 # Path to R scripts
|
|
52 our $pathRscriptChi2test = "$path_R_Scripts/R/chi2test_MutSpecStat_Galaxy.r";
|
|
53 our $pathRScriptFigs = "$path_R_Scripts/R/figs_MutSpecStat_Galaxy.r";
|
|
54 our $pathRScriptTxnSB = "$path_R_Scripts/R/transciptionalStrandBias.r";
|
|
55 our $pathRScriptMutSpectrum = "$path_R_Scripts/R/mutationSpectra_Galaxy.r";
|
|
56
|
|
57
|
|
58 # The path for saving the files with enough mutations for calculating the statistics;
|
|
59 our $folderCheckedForStat = "$pwd/folder_checked";
|
|
60 if(!-e $folderCheckedForStat) { mkdir($folderCheckedForStat) or die "$!: $folderCheckedForStat\n"; }
|
|
61
|
|
62 # Output dir with all the results
|
|
63 our $folderMutAnalysis = "";
|
|
64
|
|
65 # Hash table with the length of each chromosomes
|
|
66 our %chromosomes;
|
|
67 # Define the name of the column containing the chromosome, start, ref and alt alleles (based on Annovar output)
|
|
68 our ($chr_name, $start_name, $ref_name, $alt_name) = qw(Chr Start Ref Alt);
|
|
69 # Annovar annotation used
|
|
70 our $func_name = "Func.refGene";
|
|
71 our $exonicFunc_name = "ExonicFunc.refGene";
|
|
72 our $strand_name = "Strand";
|
|
73 our $context_name = "context";
|
|
74 # Font formats
|
|
75 our ($format_A10, $format_A10Boldleft, $format_A10ItalicRed) = ("", "", "");
|
|
76 our ($formatT_left, $formatT_right, $formatT_bottomRight, $formatT_bottomLeft, $formatT_bottom, $formatT_bottomHeader, $formatT_bottomRightHeader, $formatT_bottomHeader2, $formatT_rightHeader);
|
|
77 our ($formatT_graphTitle);
|
|
78 our ($table_topleft, $table_topRight, $table_bottomleft, $table_bottomRight, $table_top, $table_right, $table_bottom, $table_bottomItalicRed, $table_left, $table_bottomrightHeader, $table_left2, $table_middleHeader, $table_middleHeader2);
|
|
79 # Hash table with the result of chi2 test for the strand bias
|
|
80 our %h_chi2 = ();
|
|
81 # For NMF input
|
|
82 our %h_inputNMF = ();
|
|
83
|
|
84
|
|
85 ######################################################################################################################################################
|
|
86 # MAIN #
|
|
87 ######################################################################################################################################################
|
|
88 # Check the presence of the flags and create the output and temp directories
|
|
89 CheckFlags();
|
|
90
|
|
91 # First check if the files are annotated or not.
|
|
92 # If the files are annotated check there is enough mutations for generating the statistics, otherwise remove the samples from the analysis
|
|
93 checkVariants();
|
|
94
|
|
95 # Retrieve chromosomes length
|
|
96 checkChrDir();
|
|
97
|
|
98 # Calculate the statistics and generate the report
|
|
99 ReportMutDist();
|
|
100
|
|
101 # Remove the temporary directory
|
|
102 rmtree($folder_temp);
|
|
103 rmtree($folderCheckedForStat);
|
|
104
|
|
105
|
|
106 ######################################################################################################################################################
|
|
107 # FUNCTIONS #
|
|
108 ######################################################################################################################################################
|
|
109
|
|
110 # Check the presence of the flags and create the output and temp directories
|
|
111 sub CheckFlags
|
|
112 {
|
|
113 # Check the reference genome
|
|
114 if($refGenome eq "empty")
|
|
115 {
|
|
116 print STDERR "Missing flag !\n";
|
|
117 print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n";
|
|
118 exit;
|
|
119 }
|
|
120
|
|
121 # If no output is specified write the result as the same place as the input file
|
|
122 if($output eq "empty")
|
|
123 {
|
|
124 # The input is a folder with one or more annotated files
|
|
125 my $directory = dirname( $input );
|
|
126
|
|
127 $folderMutAnalysis = "$directory/Mutational_Analysis";
|
|
128 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
|
|
129 }
|
|
130 else
|
|
131 {
|
|
132 if(!-e $output) { mkdir($output) or die "$!: $output\n"; }
|
|
133
|
|
134 $folderMutAnalysis = "$output/Mutational_Analysis";
|
|
135 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
|
|
136 }
|
|
137
|
|
138 # If no temp folder is specified write the result in the current path
|
|
139 my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/);
|
|
140 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; }
|
|
141 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; }
|
|
142
|
|
143 # Check the path to the R scripts
|
|
144 if($path_R_Scripts eq "empty")
|
|
145 {
|
|
146 print STDERR "Missing flag !\n";
|
|
147 print STDERR "You forget to specify the path for the R scripts!!!\nPlease specify it with the flag --pathRscript\n";
|
|
148 exit;
|
|
149 }
|
|
150
|
|
151
|
|
152 foreach my $file (`ls $input/*`)
|
|
153 {
|
|
154 chomp($file);
|
|
155
|
|
156 ## Verify the name of file, must be <= 31 chars for the sheet name
|
|
157 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
|
|
158
|
|
159 if(length($filename) > 31)
|
|
160 {
|
|
161 print STDERR "Error: The filename of: $file\nMust be <= 31 chars\nPlease modify it before running the script\n";
|
|
162 exit;
|
|
163 }
|
|
164 }
|
|
165 }
|
|
166
|
|
167 # Check input file(s)
|
|
168 sub checkVariants
|
|
169 {
|
|
170 # Count the number of file(s) with enough mutations (at least 1 with a strand orientation)
|
|
171 my $timerFile = 0;
|
|
172 my @listRemovedFile = ();
|
|
173
|
|
174 foreach my $file (`ls $input/*`)
|
|
175 {
|
|
176 chomp($file);
|
|
177
|
|
178 ### Check if the file is annotated
|
|
179 my $testAnnotation = "toto";
|
|
180 $testAnnotation = `grep 'Func.refGene' $file`;
|
|
181
|
|
182 if($testAnnotation eq "toto")
|
|
183 {
|
|
184 print STDERR "Error: The input file you specify is not annotated!\nThe file concerned is: $file !!!!\nPlease first annotate your file before trying to generate the report on mutation spectra\n";
|
|
185 exit;
|
|
186 }
|
|
187 else
|
|
188 {
|
|
189 ### check if there is at least 1 mutation with a strand info
|
|
190 my $strand_value = recoverNumCol($file, "Strand");
|
|
191 my $nbSBScoding = 0;
|
|
192
|
|
193 open(F1, $file) or die "$!: $file\n";
|
|
194 my $header = <F1>;
|
|
195 while(<F1>)
|
|
196 {
|
|
197 $_ =~ s/[\r\n]+$//;
|
|
198 my @tab = split("\t", $_);
|
|
199
|
|
200 if($tab[$strand_value] ne "NA")
|
|
201 {
|
|
202 $nbSBScoding++;
|
|
203 }
|
|
204 }
|
|
205 close F1;
|
|
206
|
|
207 if($nbSBScoding != 0)
|
|
208 {
|
|
209 $timerFile++;
|
|
210 `cp $file $folderCheckedForStat/`;
|
|
211 }
|
|
212 else
|
|
213 {
|
|
214 print STDOUT "\n\nWarning: There is no variant to compute statistics for $file\n\n";
|
|
215 push(@listRemovedFile, $file);
|
|
216 }
|
|
217 }
|
|
218 }
|
|
219
|
|
220 if($timerFile == 0)
|
|
221 {
|
|
222 print STDERR "\n\nError: No variants to compute statistics for:\n";
|
|
223
|
|
224 foreach (@listRemovedFile)
|
|
225 {
|
|
226 print STDERR $_."\n";
|
|
227 }
|
|
228 exit;
|
|
229 }
|
|
230 }
|
|
231
|
|
232 # Retrieve chromosomes length
|
|
233 sub checkChrDir
|
|
234 {
|
|
235 my @files = `ls $path_SeqrefGenome/$refGenome"_seq"/*.fa`;
|
|
236 foreach my $file (@files)
|
|
237 {
|
|
238 if ($file !~ /chr(\d+|x|y)\.fa/i){next;}
|
|
239 open(FILE,$file);
|
|
240 <FILE>;
|
|
241 my $seq="";
|
|
242 while (<FILE>){ chomp; $seq.=$_;}
|
|
243 $file =~ /chr(.*)\.fa/;
|
|
244 $chromosomes{"chr".$1}=length($seq);
|
|
245 }
|
|
246 }
|
|
247
|
|
248 # Calculate the statistics and generate the report
|
|
249 sub ReportMutDist
|
|
250 {
|
|
251 print STDOUT "-----------------------------------------------------------------\n";
|
|
252 print STDOUT "-----------------Report Mutational Analysis----------------------\n";
|
|
253 print STDOUT "-----------------------------------------------------------------\n";
|
|
254
|
|
255 my $folderFigure = "$folderMutAnalysis/Figures";
|
|
256 if(-e $folderFigure) { rmtree($folderFigure); mkdir($folderFigure) or die "Can't create the directory $folderFigure\n"; }
|
|
257 else { mkdir($folderFigure) or die "Can't create the directory $folderFigure\n"; }
|
|
258 my $folderChi2 = "$folderFigure/Chi2";
|
|
259 if(!-e $folderChi2) { mkdir($folderChi2) or die "Can't create the directory $folderChi2\n"; }
|
|
260 my $folderWebLogo = "$folderFigure/WebLogo";
|
|
261 if(!-e $folderWebLogo) { mkdir($folderWebLogo) or die "Can't create the directory $folderWebLogo\n"; }
|
|
262 my $folderNMF = "$folderFigure/Input_NMF";
|
|
263 if(!-e $folderNMF) { mkdir($folderNMF) or die "Can't create the directory $folderNMF\n"; }
|
|
264
|
|
265
|
|
266 ################################################################################################
|
|
267 ### Calculates all the statistics ###
|
|
268 ################################################################################################
|
|
269
|
|
270 ########### Recover the functional region for all the samples. Allows to thave the same annotations for the pie chart "Impact on protein sequence"
|
|
271 my @tab_func = recoverAnnovarAnnotation($func_name);
|
|
272 if(@tab_func == 0)
|
|
273 {
|
|
274 print STDERR "Error: the table for the functional region is empty!!!!! check $folderCheckedForStat\n$func_name\n";
|
|
275 exit;
|
|
276 }
|
|
277
|
|
278 ############ Calculate the different statistics present in the report
|
|
279 my %h_file = ();
|
|
280 CalculateStatistics(\%h_file, \@tab_func);
|
|
281
|
|
282 ############ Calculate the chi2 for the strand bias
|
|
283 CalculateChi2(\%h_file, $folderChi2);
|
|
284
|
|
285 ############ Write the different statistics present in the report
|
|
286 WriteStatistics(\%h_file, $#tab_func, $folderFigure, $folderChi2, $folderNMF);
|
|
287
|
|
288 ############ Create logo for studying the 10 flanking bases of the mutation
|
|
289 CreateLogo(\%h_file, $folderWebLogo);
|
|
290 }
|
|
291
|
|
292
|
|
293 # Calculate the different statistics present in the report
|
|
294 sub CalculateStatistics
|
|
295 {
|
|
296 my ($refH_file, $refT_func) = @_;
|
|
297
|
|
298 our ($chr_value, $start_value, $ref_value, $alt_value, $func_value, $exonicFunc_value, $strand_value, $contextSeq_value) = ("", "", "", "", "", "", "", "", "", "");
|
|
299
|
|
300 # Generate the pool of all the data
|
|
301 if($poolData == 1)
|
|
302 {
|
|
303 my @listFile = `ls $folderCheckedForStat`;
|
|
304
|
|
305 # For keeping the header only one time
|
|
306 chomp($listFile[0]);
|
|
307 system("cp $folderCheckedForStat/$listFile[0] $folderCheckedForStat/Pool_Data.txt");
|
|
308
|
|
309 open(OUT, ">>", "$folderCheckedForStat/Pool_Data.txt") or die "$!: $folderCheckedForStat/Pool_Data.txt\n";
|
|
310
|
|
311 for(my $i=1; $i<=$#listFile; $i++)
|
|
312 {
|
|
313 chomp($listFile[$i]);
|
|
314 open(F1, "$folderCheckedForStat/$listFile[$i]") or die "$!: $folderCheckedForStat/$listFile[$i]\n";
|
|
315 my $header = <F1>;
|
|
316 while(<F1>) { print OUT $_; }
|
|
317 close F1;
|
|
318 }
|
|
319 close OUT;
|
|
320 }
|
|
321
|
|
322 foreach my $file (`ls $folderCheckedForStat/*`)
|
|
323 {
|
|
324 chomp($file);
|
|
325 ############ Recover the number of the columns of interest
|
|
326 $chr_value = recoverNumCol($file, $chr_name);
|
|
327 $start_value = recoverNumCol($file, $start_name);
|
|
328 $ref_value = recoverNumCol($file, $ref_name);
|
|
329 $alt_value = recoverNumCol($file, $alt_name);
|
|
330 $func_value = recoverNumCol($file, $func_name);
|
|
331 $exonicFunc_value = recoverNumCol($file, $exonicFunc_name);
|
|
332 $strand_value = recoverNumCol($file, $strand_name);
|
|
333 $contextSeq_value = recoverNumCol($file, $context_name);
|
|
334 ############ Recover the number of the columns of interest
|
|
335
|
|
336 ############ Calculate the statistics for each file
|
|
337 File2Hash($file, $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func);
|
|
338 }
|
|
339 }
|
|
340
|
|
341 # Calculate the chi2 for the strand bias
|
|
342 sub CalculateChi2
|
|
343 {
|
|
344 my ($refH_file, $folderChi2) = @_;
|
|
345
|
|
346 # No value for the chi2
|
|
347 if(scalar (keys %{$refH_file}) == 0)
|
|
348 {
|
|
349 print STDERR "Error: No value for calculating the chi2 for the strand bias\n";
|
|
350 exit;
|
|
351 }
|
|
352
|
|
353 # Strand bias for one mutation type for all the samples
|
|
354 my %h_tempchi2 = ();
|
|
355 my ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0,0,0,0, 0,0,0,0,0,0);
|
|
356
|
|
357 my $nb_file = 0;
|
|
358
|
|
359 foreach my $k_file (sort keys %{$refH_file})
|
|
360 {
|
|
361 $nb_file++;
|
|
362 foreach my $k_func (sort keys %{$refH_file->{$k_file}{'6mutType'}})
|
|
363 {
|
|
364 foreach my $k_mutation (sort keys %{$refH_file->{$k_file}{'6mutType'}{$k_func}})
|
|
365 {
|
|
366 if($k_mutation eq "C:G>A:T")
|
|
367 {
|
|
368 $h_tempchi2{'C>A'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
|
|
369 $h_tempchi2{'C>A'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
|
|
370 }
|
|
371 if($k_mutation eq "C:G>G:C")
|
|
372 {
|
|
373 $h_tempchi2{'C>G'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
|
|
374 $h_tempchi2{'C>G'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
|
|
375 }
|
|
376 if($k_mutation eq "C:G>T:A")
|
|
377 {
|
|
378 $h_tempchi2{'C>T'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
|
|
379 $h_tempchi2{'C>T'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
|
|
380 }
|
|
381 if($k_mutation eq "T:A>A:T")
|
|
382 {
|
|
383 $h_tempchi2{'T>A'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
|
|
384 $h_tempchi2{'T>A'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
|
|
385 }
|
|
386 if($k_mutation eq "T:A>C:G")
|
|
387 {
|
|
388 $h_tempchi2{'T>C'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
|
|
389 $h_tempchi2{'T>C'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
|
|
390 }
|
|
391 if($k_mutation eq "T:A>G:C")
|
|
392 {
|
|
393 $h_tempchi2{'T>G'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
|
|
394 $h_tempchi2{'T>G'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
|
|
395 }
|
|
396 }
|
|
397 }
|
|
398 }
|
|
399
|
|
400 # Create the input file for NMF
|
|
401 open(CHI2, ">", "$folderChi2/Input_chi2_strandBias.txt") or die "$!: $folderChi2/Input_chi2_strandBias.txt\n";
|
|
402 print CHI2 "SampleName\tNonTr\tTr\tAlteration\n";
|
|
403
|
|
404 foreach my $k_mutation (sort keys %h_tempchi2)
|
|
405 {
|
|
406 foreach my $k_file (sort keys %{$h_tempchi2{$k_mutation}})
|
|
407 {
|
|
408 print CHI2 "$k_file\t$h_tempchi2{$k_mutation}{$k_file}{'NonTr'}\t$h_tempchi2{$k_mutation}{$k_file}{'Tr'}\t$k_mutation\n";
|
|
409 }
|
|
410 }
|
|
411 close CHI2;
|
|
412
|
|
413
|
|
414 `Rscript $pathRscriptChi2test --folderChi2 $folderChi2 2>&1`;
|
|
415 # `Rscript $pathRscriptChi2test $folderChi2 2>&1`;
|
|
416
|
|
417
|
|
418 if(!-e "$folderChi2/Output_chi2_strandBias.txt")
|
|
419 {
|
|
420 print STDERR "Error: Chi2 test didn't work !!!\n";
|
|
421 exit;
|
|
422 }
|
|
423 }
|
|
424
|
|
425 # Write the different statistics in the report
|
|
426 sub WriteStatistics
|
|
427 {
|
|
428 my ($refH_file, $nb_func, $folderFigure, $folderChi2, $folderNMF) = @_;
|
|
429
|
|
430 # Save the different graphs in specific folde
|
|
431 if(!-e "$folderFigure/Overall_mutation_distribution") { mkdir("$folderFigure/Overall_mutation_distribution") or die "Can't create the directory $folderFigure/Overall_mutation_distribution\n"; }
|
|
432 if(!-e "$folderFigure/Impact_protein_sequence") { mkdir("$folderFigure/Impact_protein_sequence") or die "Can't create the directory $folderFigure/Impact_protein_sequence\n"; }
|
|
433 if(!-e "$folderFigure/SBS_distribution") { mkdir("$folderFigure/SBS_distribution") or die "Can't create the directory $folderFigure/SBS_distribution\n"; }
|
|
434 if(!-e "$folderFigure/Stranded_Analysis") { mkdir("$folderFigure/Stranded_Analysis") or die "Can't create the directory $folderFigure/Stranded_Analysis\n"; }
|
|
435 if(!-e "$folderFigure/Trinucleotide_Sequence_Context") { mkdir("$folderFigure/Trinucleotide_Sequence_Context") or die "Can't create the directory $folderFigure/Trinucleotide_Sequence_Context\n"; }
|
|
436 if(!-e "$folderFigure/Distribution_SBS_Per_Chromosomes") { mkdir("$folderFigure/Distribution_SBS_Per_Chromosomes") or die "Can't create the directory $folderFigure/Distribution_SBS_Per_Chromosomes\n"; }
|
|
437
|
|
438
|
|
439 # Create a workbook with all the samples
|
|
440 my $wb = ""; my $ws_sum = "";
|
|
441 my ($ws_inputNMF_count, $ws_inputNMF_percent) = ("", "");
|
|
442
|
|
443 # Create one Excel file with all the samples
|
|
444 if($oneReportPerSample == 2)
|
|
445 {
|
|
446 $wb = Spreadsheet::WriteExcel->new("$folderMutAnalysis/Report_Mutation_Spectra.xls");
|
|
447
|
|
448 ############## Set the variables for font formats in the Excel report
|
|
449 Format_A10($wb, \$format_A10); # Text center in Arial 10
|
|
450 Format_A10BoldLeft($wb, \$format_A10Boldleft); # Text on the left in Arial 10 bold
|
|
451 Format_TextSection($wb, \$formatT_left, \$formatT_right, \$formatT_bottomRight, \$formatT_bottomLeft, \$formatT_bottom, \$formatT_bottomHeader, \$formatT_bottomRightHeader, \$formatT_bottomHeader2, \$formatT_rightHeader);
|
|
452 Format_GraphTitle($wb, \$formatT_graphTitle);
|
|
453 Format_Table($wb, \$table_topleft, \$table_topRight, \$table_bottomleft, \$table_bottomRight, \$table_top, \$table_right, \$table_bottom, \$table_bottomItalicRed, \$table_left, \$table_bottomrightHeader, \$table_left2, \$table_middleHeader, \$table_middleHeader2);
|
|
454 Format_A10ItalicRed($wb, \$format_A10ItalicRed);
|
|
455
|
|
456 ############### Worksheet with a summary of the samples
|
|
457 $ws_sum = $wb->add_worksheet("Sample_List");
|
|
458 $ws_sum->write(0, 0, "Samples", $format_A10); $ws_sum->write(0, 1, "Total number SBS", $format_A10); $ws_sum->write(0, 2, "Total number of Indel", $format_A10); $ws_sum->write(0, 3, "Total number of mutations", $format_A10);
|
|
459 $ws_sum->set_column(0,0, 50); $ws_sum->set_column(1,1, 20); $ws_sum->set_column(2,2, 20); $ws_sum->set_column(3,3, 22);
|
|
460
|
|
461 ############### Write the input matrix for NMF for the count and the un-normalized frequency
|
|
462 $ws_inputNMF_count = $wb->add_worksheet("Input_NMF_Count");
|
|
463 $ws_inputNMF_percent = $wb->add_worksheet("Input_NMF_Percent");
|
|
464 }
|
|
465
|
|
466
|
|
467 ################################################ Set the Rows and columns of the different part of the report
|
|
468 my $row_SumSheet = 1; # First row for the summary sheet of the report
|
|
469 my $rowStart_SBSdistrBySeg = 48; my $colStart_SBSdistrBySeg = 0; # For the table SBS distribution by segment
|
|
470 my $colStart_matrixSeqContext = 19; # Sequence context
|
|
471 my $col_inputNMF = 0; # Write the names of the samples with at least 33 SBS
|
|
472
|
|
473
|
|
474 ## For each file
|
|
475 foreach my $k_file (sort keys %{$refH_file})
|
|
476 {
|
|
477 print "File in process: $k_file\n";
|
|
478
|
|
479 # Count the total of mutations for 6 mutation types on genomic strand
|
|
480 my ($c_ca6_g, $c_cg6_g, $c_ct6_g, $c_ta6_g, $c_tc6_g, $c_tg6_g) = (0,0,0, 0,0,0);
|
|
481
|
|
482 if($k_file ne "Pool_Data") { $col_inputNMF++; }
|
|
483
|
|
484 ############### Save the chi2 values into a hash table
|
|
485 if(-e "$folderChi2/Output_chi2_strandBias.txt")
|
|
486 {
|
|
487 chi2hash("$folderChi2/Output_chi2_strandBias.txt", $k_file);
|
|
488 }
|
|
489
|
|
490 # Create one workbook for each sample
|
|
491 if($oneReportPerSample == 1)
|
|
492 {
|
|
493 $wb = Spreadsheet::WriteExcel->new("$folderMutAnalysis/Report_Mutation_Spectra-$k_file.xls");
|
|
494
|
|
495 ############## Set the variables for font formats in the Excel report
|
|
496 Format_A10($wb, \$format_A10); # Text center in Arial 10
|
|
497 Format_A10BoldLeft($wb, \$format_A10Boldleft); # Text on the left in Arial 10 bold
|
|
498 Format_TextSection($wb, \$formatT_left, \$formatT_right, \$formatT_bottomRight, \$formatT_bottomLeft, \$formatT_bottom, \$formatT_bottomHeader, \$formatT_bottomRightHeader, \$formatT_bottomHeader2, \$formatT_rightHeader);
|
|
499 Format_GraphTitle($wb, \$formatT_graphTitle);
|
|
500 Format_Table($wb, \$table_topleft, \$table_topRight, \$table_bottomleft, \$table_bottomRight, \$table_top, \$table_right, \$table_bottom, \$table_bottomItalicRed, \$table_left, \$table_bottomrightHeader, \$table_left2, \$table_middleHeader, \$table_middleHeader2);
|
|
501 Format_A10ItalicRed($wb, \$format_A10ItalicRed);
|
|
502
|
|
503 ############### Worksheet with a summary of the samples
|
|
504 $ws_sum = $wb->add_worksheet("Sample_List");
|
|
505 $ws_sum->write(0, 0, "Samples", $format_A10); $ws_sum->write(0, 1, "Total number SBS", $format_A10); $ws_sum->write(0, 2, "Total number of Indel", $format_A10); $ws_sum->write(0, 3, "Total number of mutations", $format_A10);
|
|
506 $ws_sum->set_column(0,0, 50); $ws_sum->set_column(1,1, 20); $ws_sum->set_column(2,2, 20); $ws_sum->set_column(3,3, 22);
|
|
507 # Write in the Samples sheet the name and the total number of SBS
|
|
508 $ws_sum->write(1, 0, "$k_file", $format_A10);
|
|
509 $ws_sum->write(1, 1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $format_A10); $ws_sum->write(1, 2, $refH_file->{$k_file}{'TotalIndelGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 3, $refH_file->{$k_file}{'TotalMutGenomic'}, $format_A10);
|
|
510 }
|
|
511 # One workbook with all the samples
|
|
512 else
|
|
513 {
|
|
514 # Write in the Samples sheet the name and the total number of SBS
|
|
515 $ws_sum->write($row_SumSheet, 0, $k_file, $format_A10);
|
|
516 $ws_sum->write($row_SumSheet, 1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 2, $refH_file->{$k_file}{'TotalIndelGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 3, $refH_file->{$k_file}{'TotalMutGenomic'}, $format_A10);
|
|
517
|
|
518 # For NMF don't consider the pool of the samples
|
|
519 if($k_file ne "Pool_Data")
|
|
520 {
|
|
521 # Write in the input NMF sheet the name of the samples
|
|
522 $ws_inputNMF_count->write(0, $col_inputNMF, $k_file);
|
|
523 $ws_inputNMF_percent->write(0, $col_inputNMF, $k_file);
|
|
524 }
|
|
525 }
|
|
526
|
|
527 # Calculate the correlation between the number of SBS and the size of the chromosome
|
|
528 PearsonCoefficient($refH_file, $k_file);
|
|
529
|
|
530 # Add a worksheet to the workbook
|
|
531 my $ws = $wb->add_worksheet($k_file);
|
|
532
|
|
533 # Write the titles of the different sections of the report
|
|
534 WriteBorderSection($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext);
|
|
535
|
|
536 # Write the mutation types (6 types)
|
|
537 WriteHeaderSection($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext);
|
|
538
|
|
539
|
|
540 # Save the figures of each samples in a different folder
|
|
541 if(!-e "$folderFigure/Overall_mutation_distribution/$k_file") { mkdir("$folderFigure/Overall_mutation_distribution/$k_file") or die "Can't create the directory $folderFigure/Overall_mutation_distribution/$k_file\n"; }
|
|
542 if(!-e "$folderFigure/Impact_protein_sequence/$k_file") { mkdir("$folderFigure/Impact_protein_sequence/$k_file") or die "Can't create the directory $folderFigure/Impact_protein_sequence/$k_file\n"; }
|
|
543 if(!-e "$folderFigure/SBS_distribution/$k_file") { mkdir("$folderFigure/SBS_distribution/$k_file") or die "Can't create the directory $folderFigure/SBS_distribution\n"; }
|
|
544 if(!-e "$folderFigure/Stranded_Analysis/$k_file") { mkdir("$folderFigure/Stranded_Analysis/$k_file") or die "Can't create the directory $folderFigure/Stranded_Analysis/$k_file\n"; }
|
|
545 if(!-e "$folderFigure/Trinucleotide_Sequence_Context/$k_file") { mkdir("$folderFigure/Trinucleotide_Sequence_Context/$k_file") or die "Can't create the directory $folderFigure/Trinucleotide_Sequence_Context/$k_file\n"; }
|
|
546
|
|
547
|
|
548
|
|
549 ##################################################################################################################################################
|
|
550 ################################################################# Write the statistics ##########################################################
|
|
551 ##################################################################################################################################################
|
|
552 my $row_SBSDistrBySegAndFunc_CG = $rowStart_SBSdistrBySeg+($nb_func*2)+16;
|
|
553
|
|
554
|
|
555 ######## Count of SBS by functional impact on the protein (Table 2) + Create the input for ggplot2 (pie chart with functional impact) + Create the input for ggplot2 (pie chart of SBS vs. Indels)
|
|
556 writeDistrFuncImpact($ws, $refH_file, $k_file, "$folderFigure/Impact_protein_sequence/$k_file/$k_file-DistributionExoFunc.txt", "$folderFigure/Overall_mutation_distribution/$k_file/$k_file-OverallMutationDistribution.txt");
|
|
557
|
|
558
|
|
559 ######## Result of the chi2 for the strand bias (Table 3) + Create the input for ggplot2 (Strand bias bar graph)
|
|
560 writeChi2result($wb, $ws, "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandBias.txt", $refH_file, $k_file);
|
|
561
|
|
562
|
|
563 ######## SBS distribution by functional region (Table 4) + Strand bias by functional region (Table 5) + Create the input for ggplot2 (SBS distribution) + Overall count and percent of SBS (Table 1)
|
|
564 writeStatbyFuncRegion($refH_file, $k_file, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, \$row_SBSDistrBySegAndFunc_CG, "$folderFigure/SBS_distribution/$k_file/$k_file-SBS_distribution.txt");
|
|
565
|
|
566
|
|
567 ######## Distribution of SBS per chromosomes and the result of Pearson test (Table 6)
|
|
568 writeDistrByChr($ws, $refH_file, $k_file, $row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg, "$folderFigure/Distribution_SBS_Per_Chromosomes/$k_file-DistributionSNVS_per_chromosome.txt");
|
|
569
|
|
570
|
|
571 ######## Trinucleotide sequence context on genomic strand (Panel 1)
|
|
572 # Represent the trinucleotide with a heatmap with count of SBS
|
|
573 my $heatmapCountggplot2 = "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapCount-Genomic.txt";
|
|
574 my $heatmapPercentggplot2 = "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapPercent-Genomic.txt";
|
|
575 my $triNtBarChartggplot2 = "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-MutationSpectraPercent-Genomic.txt";
|
|
576
|
|
577 writeTriNtGenomic($ws, $refH_file, $k_file, $colStart_matrixSeqContext, $heatmapCountggplot2, $heatmapPercentggplot2, $triNtBarChartggplot2, \$c_ca6_g, \$c_cg6_g, \$c_ct6_g, \$c_ta6_g, \$c_tc6_g, \$c_tg6_g);
|
|
578
|
|
579 # For the input matrix for NMF
|
|
580 if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Sample'}}, $k_file); }
|
|
581
|
|
582
|
|
583 ######## Trinucleotide sequence context on genomic strand (Panel 2)
|
|
584 my $triNtBarChartCodingCountggplot2 = "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignatureCount.txt";
|
|
585 my $triNtBarChartCodingPercentggplot2 = "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignaturePercent.txt";
|
|
586
|
|
587 writeTriNtCoding($ws, $rowStart_SBSdistrBySeg, $colStart_matrixSeqContext, $refH_file, $k_file, $triNtBarChartCodingCountggplot2, $triNtBarChartCodingPercentggplot2);
|
|
588
|
|
589
|
|
590 ######## Generate the figures and include them in the report
|
|
591 createWriteFigs($ws, $rowStart_SBSdistrBySeg, $colStart_matrixSeqContext, $folderFigure, $k_file, $c_ca6_g, $c_cg6_g, $c_ct6_g, $c_ta6_g, $c_tc6_g, $c_tg6_g);
|
|
592
|
|
593
|
|
594 # Next sample
|
|
595 $row_SumSheet++;
|
|
596 } # End $k_file
|
|
597
|
|
598 ######## Write the input matrix for NMF
|
|
599 # One workbook with all the samples
|
|
600 writeInputNMF($ws_inputNMF_count, $ws_inputNMF_percent, "$folderNMF/Input_NMF_Count.txt", "$folderNMF/Input_NMF_Frequency.txt");
|
|
601
|
|
602
|
|
603 # Close the workbook
|
|
604 $wb->close();
|
|
605 }
|
|
606
|
|
607 # Create logo for representing the sequence context with n bases
|
|
608 sub CreateLogo
|
|
609 {
|
|
610 my ($refH_file, $folderWebLogo) = @_;
|
|
611
|
|
612 my $folderSample = "";
|
|
613
|
|
614 foreach my $k_file (sort keys %{$refH_file})
|
|
615 {
|
|
616 $folderSample = "$folderWebLogo/$k_file";
|
|
617 if(!-e $folderSample) { mkdir($folderSample) or die "Can't create the directory $folderSample\n"; }
|
|
618
|
|
619 my $test_lengthSeqContext = 0;
|
|
620
|
|
621
|
|
622 foreach my $k_mutation (sort keys %{$refH_file->{$k_file}{'WebLogo3'}})
|
|
623 {
|
|
624 $k_mutation =~ /(\w)>(\w)/;
|
|
625 my ($ref, $alt) = ($1, $2);
|
|
626
|
|
627 open(WEBLOGO, ">", "$folderSample/$k_file-$ref$alt.fa") or die "$!: $folderSample/$k_file-$ref$alt.fa\n";
|
|
628 foreach (@{$refH_file->{$k_file}{'WebLogo3'}{$k_mutation}})
|
|
629 {
|
|
630 print WEBLOGO ">$k_file\n$_\n";
|
|
631
|
|
632 if(length($_) < 10) { $test_lengthSeqContext = 0; }
|
|
633 else { $test_lengthSeqContext = 1; }
|
|
634 }
|
|
635 close WEBLOGO;
|
|
636 }
|
|
637
|
|
638 ## Generate the logo
|
|
639 foreach my $fastaFile (`ls $folderSample/*.fa`)
|
|
640 {
|
|
641 chomp($fastaFile);
|
|
642 my ($filename, $directories, $suffix) = fileparse("$folderSample/$fastaFile", qr/\.[^.]*/);
|
|
643
|
|
644 $filename =~ /(.+)\-/;
|
|
645 my $title = $1;
|
|
646
|
|
647 ## Test if there is fasta sequences for the mutation type
|
|
648 my $nbLigne_temp = `wc -l $fastaFile`;
|
|
649 my @nbLigne = split(" ", $nbLigne_temp);
|
|
650
|
|
651
|
|
652 if($nbLigne[0] == 0) { print "WARNING: No sequence for $filename\n"; next; }
|
|
653
|
|
654 # When length sequence context is lower than 10 the image is to small for adding a title
|
|
655 if($test_lengthSeqContext == 1) { system("weblogo -c classic -F png_print -U probability --title $title < $fastaFile > $folderSample/$filename-Probability.png"); }
|
|
656 else { system("weblogo -c classic -F png_print -U probability < $fastaFile > $folderSample/$filename-Probability.png"); }
|
|
657 }
|
|
658 }
|
|
659 }
|
|
660
|
|
661
|
|
662
|
|
663 ### Save the count of SBS for each file into a hash table
|
|
664 sub File2Hash
|
|
665 {
|
|
666 my ($inputFile, $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func) = @_;
|
|
667
|
|
668 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);
|
|
669
|
|
670 # Initialisation of the hash
|
|
671 my @tab_mutation = qw(C:G>A:T C:G>G:C C:G>T:A T:A>A:T T:A>C:G T:A>G:C);
|
|
672 my @tab_aaChange = ("NonTr", "Tr", "TotalMutG");
|
|
673 my @tabExoFunc = ("frameshift insertion", "frameshift deletion", "frameshift block substitution", "frameshift substitution", "stopgain", "stoploss", "nonframeshift insertion", "nonframeshift deletion", "nonframeshift substitution", "nonframeshift block substitution", "nonsynonymous SNV", "synonymous SNV", "unknown", "NA");
|
|
674
|
|
675 # Total number of SBS on the genomic strand
|
|
676 $refH_file->{$filename}{'TotalSBSGenomic'} = 0;
|
|
677 # Total number of Indel on the genomic strand
|
|
678 $refH_file->{$filename}{'TotalIndelGenomic'} = 0;
|
|
679 # Total number of SBS on the coding strand
|
|
680 $refH_file->{$filename}{'TotalSBSCoding'} = 0;
|
|
681 # Total number of SBS and Indel on the genomic strand
|
|
682 $refH_file->{$filename}{'TotalMutGenomic'} = 0;
|
|
683
|
|
684
|
|
685 #####################################################
|
|
686 # Initialisation of the tables and hash tables #
|
|
687 #####################################################
|
|
688
|
|
689 ## SBS by segment (6 mutation types)
|
|
690 foreach my $elt_tabFunc (@$refT_func)
|
|
691 {
|
|
692 foreach my $elt_tabMutation (@tab_mutation)
|
|
693 {
|
|
694 foreach my $elt_aaChange (@tab_aaChange)
|
|
695 {
|
|
696 $refH_file->{$filename}{'6mutType'}{$elt_tabFunc}{$elt_tabMutation}{$elt_aaChange} = 0;
|
|
697 }
|
|
698 }
|
|
699 }
|
|
700
|
|
701 ## Pearson correlation
|
|
702 $refH_file->{$filename}{'SBSPerChr'}{'AllMutType'} = 0;
|
|
703 # Count of SBS per chromosome foreach mutation types
|
|
704 foreach my $elt_tabMutation (@tab_mutation)
|
|
705 {
|
|
706 foreach my $chromosome (sort keys %chromosomes){ $refH_file->{$filename}{'SBSPerChr'}{$elt_tabMutation}{'CHR'}{$chromosome}{'chr'} = 0;}
|
|
707 $refH_file->{$filename}{'SBSPerChr'}{$elt_tabMutation}{'Pearson'} = 0;
|
|
708 }
|
|
709 foreach my $chromosome (sort keys %chromosomes)
|
|
710 {
|
|
711 $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'}=0;
|
|
712 }
|
|
713
|
|
714 ## Impact of SBS on protein
|
|
715 foreach my $elt_exoFunc (@tabExoFunc)
|
|
716 {
|
|
717 $refH_file->{$filename}{'ImpactSBS'}{$elt_exoFunc} = 0;
|
|
718 }
|
|
719
|
|
720 ## Sequence context (genomic strand)
|
|
721 my @tab_mutation2 = qw(C>A C>G C>T T>A T>C T>G);
|
|
722 my @tab_context = qw(A_A A_C A_G A_T C_A C_C C_G C_T G_A G_C G_G G_T T_A T_C T_G T_T);
|
|
723 foreach my $elt_context (@tab_context)
|
|
724 {
|
|
725 foreach my $elt_mutation3 (@tab_mutation2)
|
|
726 {
|
|
727 $refH_file->{$filename}{'SeqContextG'}{$elt_context}{$elt_mutation3} = 0;
|
|
728 }
|
|
729 }
|
|
730
|
|
731 ## Sequence context (coding strand)
|
|
732 my @tab_TrNonTr = qw(NonTr Tr);
|
|
733 foreach my $elt_context (@tab_context)
|
|
734 {
|
|
735 foreach my $elt_mutation2 (@tab_mutation2)
|
|
736 {
|
|
737 foreach my $trNonTr (@tab_TrNonTr)
|
|
738 {
|
|
739 $refH_file->{$filename}{'SeqContextC'}{$elt_context}{$elt_mutation2}{$trNonTr} = 0;
|
|
740 }
|
|
741 }
|
|
742 }
|
|
743
|
|
744
|
|
745 #####################################################
|
|
746 # Parse the intput file #
|
|
747 #####################################################
|
|
748
|
|
749 open(F1,$inputFile) or die "$!: $inputFile\n";
|
|
750 my $header = <F1>;
|
|
751 while(<F1>)
|
|
752 {
|
|
753 $_ =~ s/[\r\n]+$//;
|
|
754 my @tab = split("\t", $_);
|
|
755
|
|
756 ### Don't consider random chromosomes and chromosome M
|
|
757 if( ($tab[$chr_value] =~ /random/i) || ($tab[$chr_value] =~ /M/i) ) { next; }
|
|
758
|
|
759
|
|
760 ### Recover the trinucleotide sequence context: Extract the base just before and after the mutation
|
|
761 my $context = "";
|
|
762 my $contextSequence = $tab[$contextSeq_value]; $contextSequence =~ tr/a-z/A-Z/;
|
|
763 my @tempContextSequence = split("", $contextSequence);
|
|
764 my $total_nbBaseContext = $#tempContextSequence;
|
|
765 my $midlle_totalNbBaseContext = $total_nbBaseContext/2; # For having the middle of the sequence
|
|
766 my $before = $midlle_totalNbBaseContext - 1; my $after = $midlle_totalNbBaseContext + 1;
|
|
767 $context = $tempContextSequence[$before]."_".$tempContextSequence[$after];
|
|
768
|
|
769
|
|
770
|
|
771 ### Recover the annotations on the impact on the protein for creating the pie chart
|
|
772 my $exoFunc = "";
|
|
773 # Sometimes the annotation is repeated frameshift deletion;frameshift deletion
|
|
774 if($tab[$exonicFunc_value] =~ /\;/)
|
|
775 {
|
|
776 my @temp = split(";", $tab[$exonicFunc_value]);
|
|
777 if($temp[0] eq $temp[1]) { $exoFunc = $temp[0]; }
|
|
778 }
|
|
779 # The annotations have changed after MAJ Annovar 2014Jul22 (stopgain SNV => stopgain)
|
|
780 elsif($tab[$exonicFunc_value] eq "stopgain SNV") { $exoFunc = "stopgain"; }
|
|
781 elsif($tab[$exonicFunc_value] eq "stoploss SNV") { $exoFunc = "stoploss"; }
|
|
782 elsif($tab[$exonicFunc_value] eq "nonsynonymous_SNV") { $exoFunc = "nonsynonymous SNV"; }
|
|
783 elsif($tab[$exonicFunc_value] eq "stopgain_SNV") { $exoFunc = "stopgain SNV"; }
|
|
784 elsif($tab[$exonicFunc_value] eq "synonymous_SNV") { $exoFunc = "synonymous SNV"; }
|
|
785 else { $exoFunc = $tab[$exonicFunc_value]; }
|
|
786
|
|
787 if(exists $refH_file->{$filename}{'ImpactSBS'}{$exoFunc})
|
|
788 {
|
|
789 # If the sequence context if not recovered correctly don't considered the variants
|
|
790 if( ($context =~ /N/) || (length($context) != 3) ) { next; }
|
|
791
|
|
792 $refH_file->{$filename}{'ImpactSBS'}{$exoFunc}++;
|
|
793 $refH_file->{$filename}{'TotalMutGenomic'}++;
|
|
794 }
|
|
795 else { print "WARNING: Exonic function not considered: $exoFunc\n"; }
|
|
796
|
|
797 #### Only SBS are considered for the statistics
|
|
798 if( ($tab[$ref_value] =~ /^[ACGT]$/i) && ($tab[$alt_value] =~ /^[ACGT]$/i) )
|
|
799 {
|
|
800 # If the sequence context if not recovered correctly don't considered the variants
|
|
801 if( ($context =~ /N/) || (length($context) != 3) ) { next; }
|
|
802
|
|
803 # Total number of SBS on the genomic strand
|
|
804 $refH_file->{$filename}{'TotalSBSGenomic'}++;
|
|
805
|
|
806 # Total number of SBS on the coding strand with a sequence context
|
|
807 if( ($tab[$strand_value] eq "+") || ($tab[$strand_value] eq "-") )
|
|
808 {
|
|
809 if( ($context ne "NA") && (($context =~ /N/) || (length($context) != 3)) ) { next; }
|
|
810 $refH_file->{$filename}{'TotalSBSCoding'}++;
|
|
811 }
|
|
812 }
|
|
813 else { $refH_file->{$filename}{'TotalIndelGenomic'}++; }
|
|
814
|
|
815 ### Number of SBS per chromosome: remove the "chr"
|
|
816 my $chrNameForH=$tab[$chr_value];
|
|
817 if(exists $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chrNameForH}{'chr'})
|
|
818 {
|
|
819 $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chrNameForH}{'chr'}++;
|
|
820 }
|
|
821
|
|
822
|
|
823 #### Some func value are repeated and separated by ";"
|
|
824 my $funcSegment = "";
|
|
825 if($tab[$func_value] =~ /;/) { my @temp = split(";", $tab[$func_value]); $funcSegment = $temp[0]; }
|
|
826 else { $funcSegment = $tab[$func_value]; }
|
|
827
|
|
828
|
|
829
|
|
830 #####################################################
|
|
831 # Calculate the statistics for each mutation type #
|
|
832 #####################################################
|
|
833 if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "A")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "T") ) )
|
|
834 {
|
|
835 my $mutation = "C:G>A:T";
|
|
836
|
|
837 statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext);
|
|
838 }
|
|
839 if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "G")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "C") ) )
|
|
840 {
|
|
841 my $mutation = "C:G>G:C";
|
|
842
|
|
843 statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext);
|
|
844 }
|
|
845 if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "T")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "A") ) )
|
|
846 {
|
|
847 my $mutation = "C:G>T:A";
|
|
848 statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext);
|
|
849 }
|
|
850 if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "A")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") ) )
|
|
851 {
|
|
852 my $mutation = "T:A>A:T";
|
|
853 statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext);
|
|
854 }
|
|
855 if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "C")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "G")) )
|
|
856 {
|
|
857 my $mutation = "T:A>C:G";
|
|
858 statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext);
|
|
859 }
|
|
860 if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "G")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "C")) )
|
|
861 {
|
|
862 my $mutation = "T:A>G:C";
|
|
863 statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext);
|
|
864 }
|
|
865 }
|
|
866 close F1;
|
|
867 }
|
|
868
|
|
869 ### Count the number of SBS for 12 and 6 categories
|
|
870 sub statPerMutType
|
|
871 {
|
|
872 my ($filename, $refTab, $ref_value, $alt_value, $refTab_tempSeqContext, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext) = @_;
|
|
873
|
|
874 my @tab = @$refTab;
|
|
875 my @tempContextSequence = @$refTab_tempSeqContext;
|
|
876
|
|
877
|
|
878 # Split the mutations
|
|
879 $mutation =~ /(\w)\:(\w)\>(\w)\:(\w)/;
|
|
880 my ($ref1, $ref2, $alt1, $alt2) = ($1, $2, $3, $4);
|
|
881
|
|
882 # Count the total number of mutations
|
|
883 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++;
|
|
884
|
|
885 # Pearson correlation
|
|
886 if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'})
|
|
887 {
|
|
888 $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++;
|
|
889 }
|
|
890
|
|
891 #### Sequence context - 6 mutation types - genomic strand
|
|
892 my $mutationSeqContext6mutType = "$ref1>$alt1";
|
|
893 # We want to express the mutation in C> or T>
|
|
894 if( ($tab[$ref_value] eq $ref2) && ($tab[$alt_value] eq $alt2) )
|
|
895 {
|
|
896 my $base3 = complement($tempContextSequence[$before]);
|
|
897 my $base5 = complement($tempContextSequence[$after]);
|
|
898 my $context_reverse = $base5."_".$base3;
|
|
899 if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType})
|
|
900 {
|
|
901 $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++;
|
|
902 }
|
|
903 }
|
|
904 elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType})
|
|
905 {
|
|
906 $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++;
|
|
907 }
|
|
908
|
|
909
|
|
910 #### Strand analysis C>N and T>N on NonTr strand
|
|
911 if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq $ref1)&&($tab[$alt_value] eq $alt1))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq $ref2)&&($tab[$alt_value] eq $alt2))) )
|
|
912 {
|
|
913 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'})
|
|
914 {
|
|
915 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++;
|
|
916 }
|
|
917
|
|
918 # C>A With the sequence context (C>N and T>N on strand +)
|
|
919 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq $ref1)&&($tab[$alt_value] eq $alt1)) )
|
|
920 {
|
|
921 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{$mutationSeqContext6mutType}{'NonTr'})
|
|
922 {
|
|
923 $refH_file->{$filename}{'SeqContextC'}{$context}{$mutationSeqContext6mutType}{'NonTr'}++;
|
|
924 }
|
|
925 }
|
|
926 # C>A With the sequence context (G>N and A>N on strand -)
|
|
927 else
|
|
928 {
|
|
929 my $base3 = complement($tempContextSequence[$before]);
|
|
930 my $base5 = complement($tempContextSequence[$after]);
|
|
931 my $context_reverse = $base5."_".$base3;
|
|
932
|
|
933 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{$mutationSeqContext6mutType}{'NonTr'})
|
|
934 {
|
|
935 $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{$mutationSeqContext6mutType}{'NonTr'}++;
|
|
936 }
|
|
937 }
|
|
938 }
|
|
939
|
|
940 #### Strand analysis C>N and T>N on Tr strand
|
|
941 if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq $ref1)&&($tab[$alt_value] eq $alt1))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq $ref2)&&($tab[$alt_value] eq $alt2))) )
|
|
942 {
|
|
943 if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'})
|
|
944 {
|
|
945 $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++;
|
|
946 }
|
|
947
|
|
948 # C>N and T>N With the sequence context (strand -)
|
|
949 if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq $ref1)&&($tab[$alt_value] eq $alt1)) )
|
|
950 {
|
|
951 if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{$mutationSeqContext6mutType}{'Tr'})
|
|
952 {
|
|
953 $refH_file->{$filename}{'SeqContextC'}{$context}{$mutationSeqContext6mutType}{'Tr'}++;
|
|
954 }
|
|
955 }
|
|
956 # C>N and T>N with the sequence context (strand +)
|
|
957 if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq $ref2)&&($tab[$alt_value] eq $alt2)) )
|
|
958 {
|
|
959 my $base3 = complement($tempContextSequence[$before]);
|
|
960 my $base5 = complement($tempContextSequence[$after]);
|
|
961 my $context_reverse = $base5."_".$base3;
|
|
962 if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{$mutationSeqContext6mutType}{'Tr'})
|
|
963 {
|
|
964 $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{$mutationSeqContext6mutType}{'Tr'}++;
|
|
965 }
|
|
966 }
|
|
967 }
|
|
968
|
|
969 #### WebLogo-3
|
|
970 if(($tab[$ref_value] eq $ref1) && ($tab[$alt_value] eq $alt1))
|
|
971 {
|
|
972 # For the logo all the sequences must have the same length
|
|
973 if(scalar(@tempContextSequence) == 2) { next; }
|
|
974 my ($contextTemp1, $contextTemp2) = ("", "");
|
|
975 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; }
|
|
976 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; }
|
|
977 my $context = $contextTemp1.$ref1.$contextTemp2;
|
|
978 push(@{$refH_file->{$filename}{'WebLogo3'}{$mutationSeqContext6mutType}}, $context);
|
|
979 }
|
|
980 else
|
|
981 {
|
|
982
|
|
983 if(scalar(@tempContextSequence) == 2) { next; }
|
|
984 my ($contextTemp1, $contextTemp2) = ("", "");
|
|
985 for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); }
|
|
986 for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); }
|
|
987 my $context = $contextTemp1.$ref1.$contextTemp2; $context = reverse $context;
|
|
988 push(@{$refH_file->{$filename}{'WebLogo3'}{$mutationSeqContext6mutType}}, $context);
|
|
989 }
|
|
990 }
|
|
991
|
|
992 # Calculate the correlation between the number of SBS and the size of the chromosome
|
|
993 sub PearsonCoefficient
|
|
994 {
|
|
995 our ($refH_file, $filename) = @_;
|
|
996
|
|
997 #### Calculate the Pearson coefficient
|
|
998 my @total_SBS = (); # Pearson for all mutation types
|
|
999
|
|
1000 # Create a 2D array
|
|
1001 foreach my $k_mutation (sort keys %{$refH_file->{$filename}{'SBSPerChr'}})
|
|
1002 {
|
|
1003 my $x = [];
|
|
1004 my $correlation = 0;
|
|
1005
|
|
1006 if($k_mutation eq "AllMutType") { next; }
|
|
1007 elsif($k_mutation eq "TotalPerChr") { next; }
|
|
1008 elsif($k_mutation eq "ChrSize") { next; }
|
|
1009 else
|
|
1010 {
|
|
1011 my $testZero = 0; # The correlation function doesn't works if all the variables are equal to zero
|
|
1012 # generate an anonymous 2D array where $x->[1] is the row
|
|
1013 # $x->[1][1] is the value in row 1 column 1 and $x->[1][2] is the value of row 1 column 2
|
|
1014 # once you build the entire array, pass it to the correlation subroutine
|
|
1015 my $i=1;
|
|
1016 while ( my ($chromosome, $lenght) = each (%chromosomes))
|
|
1017 {
|
|
1018 $x->[$i][1] = $lenght; # First column contains the chromosome size
|
|
1019 $x->[$i][2] = $refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}{$chromosome}{'chr'}; # Second column contains the count of SBS
|
|
1020 if($refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}{$chromosome}{'chr'}==0) { $testZero++; }
|
|
1021 $i++;
|
|
1022 }
|
|
1023 if( $testZero == keys %{$refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}} )
|
|
1024 {
|
|
1025 $correlation = 0;
|
|
1026 }
|
|
1027 # Pass the 2D array to the correlation subroutine
|
|
1028 else
|
|
1029 {
|
|
1030 $correlation = correlation($x);
|
|
1031 }
|
|
1032
|
|
1033 $refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'Pearson'} = $correlation; # Pearson per mutation type
|
|
1034 }
|
|
1035 }
|
|
1036
|
|
1037 #generate an anonymous 2D array for all mutation type
|
|
1038 my $testZero = 0;
|
|
1039 my $x = [];
|
|
1040 my $correlation = 0;
|
|
1041 my $i=1;
|
|
1042 while ( my ($chromosome, $lenght) = each (%chromosomes))
|
|
1043 {
|
|
1044 $x->[$i][1] = $lenght;
|
|
1045 $x->[$i][2] = $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'};
|
|
1046 $i++;
|
|
1047 }
|
|
1048 if($testZero == keys %{$refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}} ) { $correlation = 0; }
|
|
1049 else { $correlation = correlation($x); }
|
|
1050 # Pass the 2D array to the correlation subroutine
|
|
1051 $refH_file->{$filename}{'SBSPerChr'}{'AllMutType'} = $correlation;
|
|
1052
|
|
1053 sub correlation
|
|
1054 {
|
|
1055 my ($x) = @_;
|
|
1056 my ($mean_x,$mean_y) = mean($x);
|
|
1057 my $ssxx=ss($x,$mean_x,$mean_y,1,1);
|
|
1058 my $ssyy=ss($x,$mean_x,$mean_y,2,2);
|
|
1059 my $ssxy=ss($x,$mean_x,$mean_y,1,2);
|
|
1060 my $correl=correl($ssxx,$ssyy,$ssxy);;
|
|
1061 my $xcorrel=sprintf("%.2f",$correl);
|
|
1062 return($xcorrel);
|
|
1063
|
|
1064 sub mean
|
|
1065 {
|
|
1066 my ($x)=@_;
|
|
1067 my $num = scalar(@{$x}) - 2;
|
|
1068 my $sum_x = '0';
|
|
1069 my $sum_y = '0';
|
|
1070 for (my $i = 2; $i < scalar(@{$x}); ++$i)
|
|
1071 {
|
|
1072 $sum_x += $x->[$i][1];
|
|
1073 $sum_y += $x->[$i][2];
|
|
1074 }
|
|
1075 my $mu_x = $sum_x / $num;
|
|
1076 my $mu_y = $sum_y / $num;
|
|
1077 return($mu_x,$mu_y);
|
|
1078 }
|
|
1079
|
|
1080 ### ss = sum of squared (deviations to the mean)
|
|
1081 sub ss
|
|
1082 {
|
|
1083 my ($x,$mean_x,$mean_y,$one,$two)=@_;
|
|
1084 my $sum = '0';
|
|
1085 for (my $i=2;$i<scalar(@{$x});++$i)
|
|
1086 {
|
|
1087 $sum += ($x->[$i][$one]-$mean_x)*($x->[$i][$two]-$mean_y);
|
|
1088 }
|
|
1089 return $sum;
|
|
1090 }
|
|
1091
|
|
1092 sub correl
|
|
1093 {
|
|
1094 my($ssxx,$ssyy,$ssxy)=@_;
|
|
1095
|
|
1096 my ($sign, $correl) = (0,0);
|
|
1097 if(abs($ssxy) == 0) { $sign = 0 }
|
|
1098 else { $sign=$ssxy/abs($ssxy); }
|
|
1099
|
|
1100 if( ($ssxx==0) || ($ssyy==0) ) { $correl = 0 }
|
|
1101 else { $correl=$sign*sqrt($ssxy*$ssxy/($ssxx*$ssyy)); }
|
|
1102
|
|
1103 return $correl;
|
|
1104 }
|
|
1105 }
|
|
1106 }
|
|
1107
|
|
1108 # Save the output of the chi2 into a hash table for writing the results in the Excel file
|
|
1109 sub chi2hash
|
|
1110 {
|
|
1111 my ($outputChi2, $k_file) = @_;
|
|
1112
|
|
1113 open(F1, $outputChi2) or die "$!: $outputChi2\n";
|
|
1114 my $header = <F1>;
|
|
1115 # Strand_Bias($tab[0]) NonTr-Tr($tab[1]) Proportion($tab[2]) P-val-Chi2($tab[3]) FDR($tab[4]) Confidence Interval($tab[5]) Mutation_Type($tab[6]) SampleName($tab[7])
|
|
1116 while(<F1>)
|
|
1117 {
|
|
1118 $_ =~ s/[\r\n]+$//;
|
|
1119 my @tab = split("\t", $_);
|
|
1120
|
|
1121 if($tab[7] eq $k_file)
|
|
1122 {
|
|
1123 if($tab[1] eq "NA")
|
|
1124 {
|
|
1125 $h_chi2{$tab[7]}{$tab[6]}{'NonTr'} = "NA";
|
|
1126 $h_chi2{$tab[7]}{$tab[6]}{'Tr'} = "NA";
|
|
1127 }
|
|
1128 else
|
|
1129 {
|
|
1130 my ($nonTr, $tr) = split("-", $tab[1]);
|
|
1131
|
|
1132 $h_chi2{$tab[7]}{$tab[6]}{'NonTr'} = $nonTr;
|
|
1133 $h_chi2{$tab[7]}{$tab[6]}{'Tr'} = $tr;
|
|
1134 }
|
|
1135
|
|
1136
|
|
1137 $h_chi2{$tab[7]}{$tab[6]}{'p-value'} = $tab[3];
|
|
1138 $h_chi2{$tab[7]}{$tab[6]}{'ConfInt'} = $tab[5];
|
|
1139
|
|
1140 # For the pool data the FDR isn't calculated so replace the NA (=Missing values) by "-"
|
|
1141 if($tab[7] eq "Pool_Data")
|
|
1142 {
|
|
1143 $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = "-";
|
|
1144 }
|
|
1145 else
|
|
1146 {
|
|
1147 $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = $tab[4];
|
|
1148 }
|
|
1149 }
|
|
1150 }
|
|
1151 close F1;
|
|
1152 }
|
|
1153
|
|
1154 ### Complement bases (for the sequence context)
|
|
1155 sub complement
|
|
1156 {
|
|
1157 if($_[0] eq "A") { return "T"; }
|
|
1158 if($_[0] eq "C") { return "G"; }
|
|
1159 if($_[0] eq "G") { return "C"; }
|
|
1160 if($_[0] eq "T") { return "A"; }
|
|
1161 }
|
|
1162
|
|
1163 ### Recover the functional region for all the samples. Allows to thave the same annotations for the pie chart "Impact on protein sequence"
|
|
1164 sub recoverAnnovarAnnotation
|
|
1165 {
|
|
1166 my ($func_name) = @_;
|
|
1167
|
|
1168 my %hash = ();
|
|
1169
|
|
1170 # The input is a folder
|
|
1171 foreach my $file (`ls $folderCheckedForStat/*`)
|
|
1172 {
|
|
1173 chomp($file);
|
|
1174 my $AV_annotation_value = recoverNumCol($file, $func_name);
|
|
1175
|
|
1176 open(F1, $file) or die "$!: $file\n";
|
|
1177 my $header = <F1>;
|
|
1178 while(<F1>)
|
|
1179 {
|
|
1180 $_ =~ s/[\r\n]+$//;
|
|
1181 my @tab = split("\t", $_);
|
|
1182
|
|
1183 # Some files can have an empty line at the end and WE DON'T WANT to consider it
|
|
1184 if(! defined $tab[0]) { next; }
|
|
1185 # Some func value are repeated and separated by ";"
|
|
1186 my $funcSegment = "";
|
|
1187 if($tab[$AV_annotation_value] =~ /;/)
|
|
1188 {
|
|
1189 my @temp = split(";", $tab[$AV_annotation_value]);
|
|
1190 $funcSegment = $temp[0];
|
|
1191 }
|
|
1192 else { $funcSegment = $tab[$AV_annotation_value]; }
|
|
1193
|
|
1194 $hash{$funcSegment} = "";
|
|
1195 }
|
|
1196 close F1;
|
|
1197 }
|
|
1198 my @tab_AVAnnotation = ();
|
|
1199 foreach my $k (sort keys %hash) { push(@tab_AVAnnotation, $k); }
|
|
1200 return @tab_AVAnnotation;
|
|
1201 }
|
|
1202
|
|
1203 sub recoverNumCol
|
|
1204 {
|
|
1205 my ($input, $name_of_column) = @_;
|
|
1206
|
|
1207 open(F1,$input) or die "recoverNumCol: $!: $input\n";
|
|
1208 # For having the name of the columns
|
|
1209 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
|
|
1210 close F1;
|
|
1211 # The number of the column
|
|
1212 my $name_of_column_NB = "toto";
|
|
1213 for(my $i=0; $i<=$#tab_search_header; $i++)
|
|
1214 {
|
|
1215 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; }
|
|
1216 }
|
|
1217 if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit 3; }
|
|
1218 else { return $name_of_column_NB; }
|
|
1219 }
|
|
1220
|
|
1221
|
|
1222
|
|
1223 ######################################################################################################################################################
|
|
1224 # Functions for writing in the Excel report #
|
|
1225 ######################################################################################################################################################
|
|
1226 # Write the header for the six mutation types
|
|
1227 sub WriteHeaderSection
|
|
1228 {
|
|
1229 our ($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext) = @_;
|
|
1230
|
|
1231 our ($format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG, $format_TG2, $format_LeftHeader, $format_RightHeader, $format_LeftHeader2);
|
|
1232 Format_Header($wb, \$format_CA, \$format_CG, \$format_CT, \$format_TA, \$format_TC, \$format_TG, \$format_TG2, \$format_LeftHeader, \$format_RightHeader, \$format_LeftHeader2);
|
|
1233
|
|
1234 our ($format_LeftCA, $format_LeftCG, $format_LeftCT, $format_LeftTA, $format_LeftTC, $format_LeftTG, $format_RightCA, $format_RightCG, $format_RightCT, $format_RightTA, $format_RightTC, $format_RightTG);
|
|
1235 Format_HeaderSBSDistrBySegAndFunc($wb, \$format_LeftCA, \$format_LeftCG, \$format_LeftCT, \$format_LeftTA, \$format_LeftTC, \$format_LeftTG, \$format_RightCA, \$format_RightCG, \$format_RightCT, \$format_RightTA, \$format_RightTC, \$format_RightTG);
|
|
1236
|
|
1237 our $format_A11Bold = ""; Format_A11Bold($wb, \$format_A11Bold); # Arial 11 bold and center
|
|
1238 our $format_A11BoldLeft = ""; Format_A11BoldLeft($wb, \$format_A11BoldLeft); # Arial 11 bold and left
|
|
1239
|
|
1240 our ($format_header12CA, $format_header12CG, $format_header12CT, $format_header12TA, $format_header12TC, $format_header12TG);
|
|
1241 Format_Header12MutType($wb, \$format_header12CA, \$format_header12CG, \$format_header12CT, \$format_header12TA, \$format_header12TC, \$format_header12TG);
|
|
1242
|
|
1243 ## Header for SBS distribution by segment
|
|
1244 HeaderMutTypeSBSDistrBySeg();
|
|
1245
|
|
1246 ## Header for strand bias by function
|
|
1247 $ws->set_column($colStart_SBSdistrBySeg+5, $colStart_SBSdistrBySeg+5, 11);
|
|
1248
|
|
1249 my $row = $rowStart_SBSdistrBySeg+$nb_func+10; my $col = $colStart_SBSdistrBySeg;
|
|
1250 $ws->write($row, $col+1, ' ', $format_CA); $ws->write($row, $col+2, "C>A", $format_CA); $ws->write($row, $col+3, ' ', $format_CA);
|
|
1251 $ws->write($row, $col+5, ' ', $format_CG); $ws->write($row, $col+6, "C>G", $format_CG); $ws->write($row, $col+7, ' ', $format_CG);
|
|
1252 $ws->write($row, $col+9, ' ', $format_CT); $ws->write($row, $col+10, "C>T", $format_CT); $ws->write($row, $col+11, ' ', $format_RightCT);
|
|
1253
|
|
1254 $row = $rowStart_SBSdistrBySeg+($nb_func*2)+14;
|
|
1255 $ws->write($row, $col+1, ' ', $format_TA); $ws->write($row, $col+2, "T>A", $format_TA); $ws->write($row, $col+3, ' ', $format_TA);
|
|
1256 $ws->write($row, $col+5, ' ', $format_TC); $ws->write($row, $col+6, "T>C", $format_TC); $ws->write($row, $col+7, ' ', $format_TC);
|
|
1257 $ws->write($row, $col+9, ' ', $format_TG2); $ws->write($row, $col+10, "T>G", $format_TG2); $ws->write($row, $col+11, ' ', $format_RightTG);
|
|
1258
|
|
1259 $ws->set_row($rowStart_SBSdistrBySeg+$nb_func+11, 18); $ws->set_row($rowStart_SBSdistrBySeg+($nb_func*2)+15, 18);
|
|
1260 $ws->set_column($colStart_SBSdistrBySeg+5, $colStart_SBSdistrBySeg+5, 13); $ws->set_column($colStart_SBSdistrBySeg+9, $colStart_SBSdistrBySeg+9, 13);
|
|
1261
|
|
1262 for(my $i=$rowStart_SBSdistrBySeg+$nb_func+10; $i<=$rowStart_SBSdistrBySeg+($nb_func*2)+14; $i+=$nb_func+4)
|
|
1263 {
|
|
1264 $ws->write($i+1, $colStart_SBSdistrBySeg, 'Segment', $format_LeftHeader); $ws -> write($i+1, $colStart_SBSdistrBySeg+1, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+2, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+3, 'Tr', $format_A11Bold);
|
|
1265 $ws -> write($i+1, $colStart_SBSdistrBySeg+5, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+6, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+7, 'Tr', $format_A11Bold);
|
|
1266 $ws -> write($i+1, $colStart_SBSdistrBySeg+9, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+10, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+11, 'Tr', $format_RightHeader);
|
|
1267 }
|
|
1268
|
|
1269
|
|
1270 ## Header for Counts of SBS per chromosome and mutation type
|
|
1271 HeaderCountSBSPerChr();
|
|
1272
|
|
1273 ## Header for the short sequence context
|
|
1274 HeaderShortTriNtContext();
|
|
1275
|
|
1276 ## Header for the 12 mutation types with the sequence context (coding strand)
|
|
1277 HeaderLongTriNtContext();
|
|
1278
|
|
1279 sub HeaderMutTypeSBSDistrBySeg
|
|
1280 {
|
|
1281 $ws->set_row($rowStart_SBSdistrBySeg+2, 18);
|
|
1282 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+2, "C:G>A:T", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+3, $format_CA);
|
|
1283 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+4, "C:G>G:C", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+5, $format_CG);
|
|
1284 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+6, "C:G>T:A", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+7, $format_CT);
|
|
1285 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+8, "T:A>A:T", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+9, $format_TA);
|
|
1286 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+10, "T:A>C:G", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+11, $format_TC);
|
|
1287 $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+12, "T:A>G:C", $format_TG); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+13, $format_TG);
|
|
1288
|
|
1289 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg, "Segment", $format_LeftHeader); $ws->set_column($colStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, 13); $ws->set_row($rowStart_SBSdistrBySeg+3, 18);
|
|
1290 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+1, "Total SBS", $format_A11Bold); $ws->set_column($colStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg+1, 11);
|
|
1291 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+2, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+3, "#", $format_A11Bold);
|
|
1292 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+4, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+5, "#", $format_A11Bold);
|
|
1293 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+6, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+7, "#", $format_A11Bold);
|
|
1294 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+8, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+9, "#", $format_A11Bold);
|
|
1295 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+10, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+11, "#", $format_A11Bold);
|
|
1296 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+12, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, 13, "#", $format_RightHeader);
|
|
1297 }
|
|
1298
|
|
1299 sub HeaderCountSBSPerChr
|
|
1300 {
|
|
1301 $ws->set_column(3,3, 10); $ws->set_column(4,4, 10);
|
|
1302 $ws->set_row($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, 18);
|
|
1303 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+7, $colStart_SBSdistrBySeg+1, "Pearson", $format_A11Bold);
|
|
1304 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg, "Chr", $format_LeftHeader);
|
|
1305 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+1, "Size", $format_A11Bold);
|
|
1306 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+2, "All SBS", $format_A11Bold);
|
|
1307
|
|
1308 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+3, "C:G>A:T", $format_CA);
|
|
1309 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+4, "C:G>G:C", $format_CG);
|
|
1310 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+5, "C:G>T:A", $format_CT);
|
|
1311 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+6, "T:A>A:T", $format_TA);
|
|
1312 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+7, "T:A>C:G", $format_TC);
|
|
1313 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+8, "T:A>G:C", $format_TG);
|
|
1314 }
|
|
1315
|
|
1316 sub HeaderShortTriNtContext
|
|
1317 {
|
|
1318 ### GENOMIC STRAND
|
|
1319 $ws->write(2, $colStart_matrixSeqContext, 'Count matrix', $format_LeftHeader2);
|
|
1320 $ws->write(3, $colStart_matrixSeqContext+4, 'C>A', $format_CA); $ws->write(3, $colStart_matrixSeqContext+5, 'C>G', $format_CG); $ws->write(3, $colStart_matrixSeqContext+6, 'C>T', $format_CT); $ws->write(3, $colStart_matrixSeqContext+7, 'T>A', $format_TA); $ws->write(3, $colStart_matrixSeqContext+8, 'T>C', $format_TC); $ws->write(3, $colStart_matrixSeqContext+9, 'T>G', $format_TG2);
|
|
1321
|
|
1322 $ws->write(2, $colStart_matrixSeqContext+11, 'Frequency matrix', $format_A11BoldLeft);
|
|
1323 $ws->write(3, $colStart_matrixSeqContext+14, 'C>A', $format_CA); $ws->write(3, $colStart_matrixSeqContext+15, 'C>G', $format_CG); $ws->write(3, $colStart_matrixSeqContext+16, 'C>T', $format_CT); $ws->write(3, $colStart_matrixSeqContext+17, 'T>A', $format_TA); $ws->write(3, $colStart_matrixSeqContext+18, 'T>C', $format_TC); $ws->write(3, $colStart_matrixSeqContext+19, 'T>G', $format_TG2);
|
|
1324
|
|
1325 ### sequence context with a bar graph
|
|
1326 $ws->write(25, $colStart_matrixSeqContext+10, "Mutation spectra frequency", $format_A11Bold);
|
|
1327 }
|
|
1328
|
|
1329 sub HeaderLongTriNtContext
|
|
1330 {
|
|
1331 $ws->set_row($rowStart_SBSdistrBySeg+3, 15); $ws->set_row($rowStart_SBSdistrBySeg+4, 15); $ws->set_row($rowStart_SBSdistrBySeg+5, 15);
|
|
1332 $ws->write($rowStart_SBSdistrBySeg+3, $colStart_matrixSeqContext, "Count matrix", $format_LeftHeader2);
|
|
1333 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+1, "C>A", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+2, $format_CA); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+1, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+2, "Tr", $format_A11Bold);
|
|
1334 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+3, "C>G", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+4, $format_CG); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+3, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+4, "Tr", $format_A11Bold);
|
|
1335 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+5, "C>T", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+6, $format_CT); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+5, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+6, "Tr", $format_A11Bold);
|
|
1336 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+7, "T>A", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+8, $format_TA); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+7, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+8, "Tr", $format_A11Bold);
|
|
1337 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+9, "T>C", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+10, $format_TC); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+9, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+10, "Tr", $format_A11Bold);
|
|
1338 $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+11, "T>G", $format_TG2); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+12, $format_TG2); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+11, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+12, "Tr", $format_A11Bold);
|
|
1339
|
|
1340
|
|
1341 $ws->set_row($rowStart_SBSdistrBySeg+24, 15); $ws->set_row($rowStart_SBSdistrBySeg+25, 15); $ws->set_row($rowStart_SBSdistrBySeg+26, 15);
|
|
1342 $ws->write($rowStart_SBSdistrBySeg+24, $colStart_matrixSeqContext, "Frequency matrix", $format_LeftHeader2);
|
|
1343 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+1, "C>A", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+2, $format_CA); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+1, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+2, "Tr", $format_A11Bold);
|
|
1344 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+3, "C>G", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+4, $format_CG); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+3, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+4, "Tr", $format_A11Bold);
|
|
1345 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+5, "C>T", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+6, $format_CT); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+5, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+6, "Tr", $format_A11Bold);
|
|
1346 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+7, "T>A", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+8, $format_TA); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+7, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+8, "Tr", $format_A11Bold);
|
|
1347 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+9, "T>C", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+10, $format_TC); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+9, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+10, "Tr", $format_A11Bold);
|
|
1348 $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+11, "T>G", $format_TG2); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+12, $format_TG2); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+11, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+12, "Tr", $format_A11Bold);
|
|
1349 }
|
|
1350 }
|
|
1351 # Write the titles of the different sections of the report
|
|
1352 sub WriteBorderSection
|
|
1353 {
|
|
1354 our ($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext) = @_;
|
|
1355
|
|
1356 our ($format_topLeft, $format_topRight, $format_bottomLeft, $format_bottomRight, $format_top, $format_right, $format_bottom, $format_left);
|
|
1357 Format_section($wb, \$format_topLeft, \$format_topRight, \$format_bottomLeft, \$format_bottomRight, \$format_top, \$format_right, \$format_bottom, \$format_left);
|
|
1358
|
|
1359 TableSBSDistrBySeg();
|
|
1360 TableStrandBiasBySegment();
|
|
1361 CountSBSPerChr();
|
|
1362 ShortTriNtContext(); # 6 mut type
|
|
1363 LongTriNtContext(); # 12 mut type
|
|
1364
|
|
1365 sub TableSBSDistrBySeg
|
|
1366 {
|
|
1367 # Top-Left
|
|
1368 $ws->write($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, "Table 4. SBS distribution by functional region", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg, 18); # Set the height of the row to 0.25"
|
|
1369 # Top
|
|
1370 for(my $i=1; $i<=13; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+$i, $format_top); }
|
|
1371 # Top-Right
|
|
1372 $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $format_topRight);
|
|
1373 # Right
|
|
1374 $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg+13, $format_right);
|
|
1375 # Bottom-left
|
|
1376 $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+5, $colStart_SBSdistrBySeg, $format_bottomLeft);
|
|
1377 # Left
|
|
1378 $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg, $format_left);
|
|
1379 }
|
|
1380
|
|
1381 sub TableStrandBiasBySegment
|
|
1382 {
|
|
1383 # Top-Left
|
|
1384 $ws->write($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg, "Table 5. Strand bias by functional region", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg+$nb_func+8, 18); # Set the height of the row to 0.25"
|
|
1385 # Top
|
|
1386 for(my $i=1; $i<=10; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg+$i, $format_top); }
|
|
1387 # Top-Right
|
|
1388 $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg+11, $format_topRight);
|
|
1389 # Right
|
|
1390 $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+9, $colStart_SBSdistrBySeg+11, $format_right); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+13, $colStart_SBSdistrBySeg+11, $format_right);
|
|
1391 # Left
|
|
1392 $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+9, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+10, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+13, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+14, $colStart_SBSdistrBySeg, $format_left);
|
|
1393 # Bottom
|
|
1394 $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*3)+16, $colStart_SBSdistrBySeg+4, $format_bottom); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*3)+16, $colStart_SBSdistrBySeg+8, $format_bottom);
|
|
1395 }
|
|
1396
|
|
1397 sub CountSBSPerChr
|
|
1398 {
|
|
1399 #### Top-Left
|
|
1400 $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg, "Table 6. SBS distribution per chromosome", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, 18); # Set the height of the row to 0.25"
|
|
1401 #### Top
|
|
1402 for(my $i=1; $i<8; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg+$i, $format_top); }
|
|
1403 #### Top-Right
|
|
1404 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg+8, $format_topRight);
|
|
1405 #### Right
|
|
1406 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+5, $colStart_SBSdistrBySeg+8, $format_right); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+6, $colStart_SBSdistrBySeg+8, $format_right);
|
|
1407
|
|
1408 #### Bottom-Right
|
|
1409 # Human genome = 24 chromosomes
|
|
1410 if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+8, $format_bottomRight); }
|
|
1411 # Mouse genome = 21 chromosomes
|
|
1412 if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+8, $format_bottomRight); }
|
|
1413 # Rat genome = 22 chromosomes
|
|
1414 if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+8, $format_bottomRight); }
|
|
1415
|
|
1416 #### Bottom
|
|
1417 if($refGenome =~ /hg/)
|
|
1418 {
|
|
1419 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+1, $format_bottom);
|
|
1420 for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+$i, $format_bottom); }
|
|
1421 }
|
|
1422 if($refGenome =~ /mm/)
|
|
1423 {
|
|
1424 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+1, $format_bottom);
|
|
1425 for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+$i, $format_bottom); }
|
|
1426 }
|
|
1427 if($refGenome =~ /rn/)
|
|
1428 {
|
|
1429 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+1, $format_bottom);
|
|
1430 for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+$i, $format_bottom); }
|
|
1431 }
|
|
1432
|
|
1433 #### Left
|
|
1434 $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+5, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+6, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+7, $colStart_SBSdistrBySeg, $format_left);
|
|
1435
|
|
1436 #### Bottom-left
|
|
1437 if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg, $format_bottomLeft); }
|
|
1438 if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg, $format_bottomLeft); }
|
|
1439 if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg, $format_bottomLeft); }
|
|
1440 }
|
|
1441
|
|
1442 sub ShortTriNtContext
|
|
1443 {
|
|
1444 my $format_headerSection = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
|
|
1445 $format_headerSection->set_left(2); $format_headerSection->set_left_color('blue');
|
|
1446
|
|
1447 # Top-left
|
|
1448 $ws->write(0, $colStart_matrixSeqContext, "Panel 1. Trinucleotide sequence context of SBS on the genomic sequence", $format_topLeft);
|
|
1449 # Top
|
|
1450 for(my $i=1; $i<=19; $i++) { $ws->write_blank(0, $colStart_matrixSeqContext+$i, $format_top); }
|
|
1451 # Top-right
|
|
1452 $ws->write_blank(0, $colStart_matrixSeqContext+20, $format_topRight);
|
|
1453 # Right
|
|
1454 for(my $i=1; $i<=37; $i++) { $ws->write_blank($i, $colStart_matrixSeqContext+20, $format_right); }
|
|
1455 # Bottom-right
|
|
1456 $ws->write_blank(37, $colStart_matrixSeqContext+20, $format_bottomRight);
|
|
1457 # Bottom
|
|
1458 for(my $i=1; $i<=19; $i++) { $ws->write_blank(38, $colStart_matrixSeqContext+$i, $format_top); }
|
|
1459 # Bottom-left
|
|
1460 $ws->write_blank(37, $colStart_matrixSeqContext, $format_bottomLeft);
|
|
1461 # Left
|
|
1462 $ws->write(1, $colStart_matrixSeqContext, "", $format_left);
|
|
1463 for(my $i=3; $i<=36; $i++) { $ws->write_blank($i, $colStart_matrixSeqContext, $format_left); }
|
|
1464 }
|
|
1465
|
|
1466 sub LongTriNtContext
|
|
1467 {
|
|
1468 # Top-left
|
|
1469 $ws->write($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext, "Panel 2. Stranded analysis of trinucleotide sequence context of SBS", $format_topLeft);
|
|
1470 # Top
|
|
1471 for(my $i=1; $i<=28; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext+$i, $format_top); }
|
|
1472 # Top-right
|
|
1473 $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext+29, $format_topRight);
|
|
1474 # Right
|
|
1475 for(my $i=1; $i<=42; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+$i, $colStart_matrixSeqContext+29, $format_right); }
|
|
1476 # Bottom-right
|
|
1477 $ws->write_blank(91, $colStart_matrixSeqContext+29, $format_bottomRight);
|
|
1478 # Bottom
|
|
1479 for(my $i=13; $i<=28; $i++) { $ws->write_blank(92, $colStart_matrixSeqContext+$i, $format_top); }
|
|
1480 # Bottom-left
|
|
1481 $ws->write_blank(91, $colStart_matrixSeqContext, $format_bottomLeft);
|
|
1482 # Left
|
|
1483 $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext, $format_left);
|
|
1484 $ws->write_blank($rowStart_SBSdistrBySeg+22, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+23, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext, $format_left);
|
|
1485 }
|
|
1486 }
|
|
1487
|
|
1488 ############################################################################################################
|
|
1489 # Write count of SBS by functional impact on the protein (Table 2) + Create the input for ggplot2 (pie chart with functional impact) + Create the input for ggplot2 (pie chart of SBS vs. Indels)
|
|
1490 sub writeDistrFuncImpact
|
|
1491 {
|
|
1492 my ($ws, $refH_file, $sample, $funcImpactggplot2, $overallDistrggplot2) = @_;
|
|
1493
|
|
1494 # Set the row for the table 2
|
|
1495 my $lImpactSBS = 31;
|
|
1496 my ($deletion, $insertion) = (0, 0);
|
|
1497
|
|
1498
|
|
1499 $ws->write(29, 6, "Table 2. Frequency and counts of functional impact", $format_A10Boldleft);
|
|
1500 $ws->set_column(6, 6, 13); $ws->set_column(10, 10, 15);
|
|
1501 $ws->write(30, 6, "RefSeq gene", $table_topleft);
|
|
1502 $ws->write(30, 7, "", $table_top);
|
|
1503 $ws->write(30, 8, "Percent", $table_top);
|
|
1504 $ws->write(30, 9, "Count", $table_topRight);
|
|
1505
|
|
1506
|
|
1507
|
|
1508 open(IMPACTSBS, ">", $funcImpactggplot2) or die "$!: $funcImpactggplot2\n";
|
|
1509 print IMPACTSBS "AA_Change\tCount\tPercent\n";
|
|
1510
|
|
1511 # Pie chart with the distribution of SBS vs Indel
|
|
1512 open(SBSINDEL, ">", $overallDistrggplot2) or die "$!: $overallDistrggplot2\n";
|
|
1513 print SBSINDEL "Variant_type\tCount\tPercent\n";
|
|
1514
|
|
1515
|
|
1516 foreach my $k_exoFunc(sort keys %{$refH_file->{$sample}{'ImpactSBS'}})
|
|
1517 {
|
|
1518 my $percent = ($refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc} / $refH_file->{$sample}{'TotalMutGenomic'})*100;
|
|
1519 $percent = sprintf("%.2f", $percent);
|
|
1520
|
|
1521 if($k_exoFunc eq "NA")
|
|
1522 {
|
|
1523 print IMPACTSBS "Not_Applicable\t$percent\t$refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}\n";
|
|
1524 }
|
|
1525 else
|
|
1526 {
|
|
1527 my $temp = $k_exoFunc;
|
|
1528 $temp =~ s/ /_/g;
|
|
1529 print IMPACTSBS "$temp\t$percent\t$refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}\n";
|
|
1530 }
|
|
1531
|
|
1532 $ws->write($lImpactSBS, 6, $k_exoFunc, $table_left2);
|
|
1533 $ws->write($lImpactSBS, 8, $percent, $format_A10);
|
|
1534 $ws->write($lImpactSBS, 9, $refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}, $table_right);
|
|
1535
|
|
1536 $lImpactSBS++;
|
|
1537
|
|
1538 # Pie chart with the distribution of SBS vs Indel
|
|
1539 if($k_exoFunc =~ /deletion/i) { $deletion += $refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}; }
|
|
1540 elsif($k_exoFunc =~ /insertion/i) { $insertion += $refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}; }
|
|
1541 }
|
|
1542 close IMPACTSBS;
|
|
1543
|
|
1544 $ws->write($lImpactSBS, 9, $refH_file->{$sample}{'TotalMutGenomic'}, $table_bottomrightHeader);
|
|
1545 $ws->write($lImpactSBS, 6, "", $table_bottomleft); $ws->write($lImpactSBS, 7, "", $table_bottom); $ws->write($lImpactSBS, 8, "", $table_bottom);
|
|
1546
|
|
1547 # Pie chart with the distribution of SBS vs Indel
|
|
1548 my $percentSBSIndel = ($deletion/$refH_file->{$sample}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel);
|
|
1549 print SBSINDEL "Deletion\t$deletion\t$percentSBSIndel\n";
|
|
1550 $percentSBSIndel = ($insertion/$refH_file->{$sample}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel);
|
|
1551 print SBSINDEL "Insertion\t$insertion\t$percentSBSIndel\n";
|
|
1552 $percentSBSIndel = ($refH_file->{$sample}{TotalSBSGenomic}/$refH_file->{$sample}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel);
|
|
1553 print SBSINDEL "SBS\t$refH_file->{$sample}{TotalSBSGenomic}\t$percentSBSIndel\n";
|
|
1554 close SBSINDEL;
|
|
1555 }
|
|
1556
|
|
1557
|
|
1558 ############################################################################################################
|
|
1559 # Write the result of the chi2 for the strand bias (Table 3)
|
|
1560 sub writeChi2result
|
|
1561 {
|
|
1562 my ($wb, $ws, $strandBiasggplot2, $refH_file, $k_file) = @_;
|
|
1563
|
|
1564 # Define the header of the table 3
|
|
1565 $ws->write(28, 11, "Table 3. Significance of the strand biases", $format_A10Boldleft);
|
|
1566 $ws->set_column(11, 11, 13); $ws->set_column(16, 16, 15); $ws->set_column(17, 17, 10);
|
|
1567 $ws->write(29, 11, "Mutation Type", $table_topleft); $ws->write(29, 12, "Non-Tr/Tr", $table_top); $ws->write(29, 13, "Non-Tr", $table_top); $ws->write(29, 14, "Tr", $table_top); $ws->write(29, 15, "P-value", $table_top); $ws->write(29, 16, "FDR q value", $table_top); $ws->write(29, 17, "95% CI", $table_topRight);
|
|
1568 $ws->write(39, 11, "Table 3. Significance of the strand biases", $format_A10Boldleft);
|
|
1569 $ws->write(40, 11, "Mutation Type", $table_topleft); $ws->write(40, 12, "Non-Tr/Tr", $table_top); $ws->write(40, 13, "Non-Tr", $table_top); $ws->write(40, 14, "Tr", $table_top); $ws->write(40, 15, "P-value", $table_top); $ws->write(40, 16, "FDR q value", $table_top); $ws->write(40, 17, "95% CI", $table_topRight);
|
|
1570
|
|
1571
|
|
1572 # Define the count on non-transcribed and transcribed strand
|
|
1573 # C>A
|
|
1574 my ($ca_NonTr, $ca_Tr) = (0,0);
|
|
1575 if( ($h_chi2{$k_file}{"C>A"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"C>A"}{'Tr'} eq "NA") )
|
|
1576 {
|
|
1577 $ca_NonTr = 0;
|
|
1578 $ca_Tr = 0;
|
|
1579 }
|
|
1580 else
|
|
1581 {
|
|
1582 $ca_NonTr = $h_chi2{$k_file}{"C>A"}{'NonTr'};
|
|
1583 $ca_Tr = $h_chi2{$k_file}{"C>A"}{'Tr'};
|
|
1584 }
|
|
1585 # C>G
|
|
1586 my ($cg_NonTr, $cg_Tr) = (0,0);
|
|
1587 if( ($h_chi2{$k_file}{"C>G"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"C>G"}{'Tr'} eq "NA") )
|
|
1588 {
|
|
1589 $cg_NonTr = 0;
|
|
1590 $cg_Tr = 0;
|
|
1591 }
|
|
1592 else
|
|
1593 {
|
|
1594 $cg_NonTr = $h_chi2{$k_file}{"C>G"}{'NonTr'};
|
|
1595 $cg_Tr = $h_chi2{$k_file}{"C>G"}{'Tr'};
|
|
1596 }
|
|
1597 # C>T
|
|
1598 my ($ct_NonTr, $ct_Tr) = (0,0);
|
|
1599 if( ($h_chi2{$k_file}{"C>T"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"C>T"}{'Tr'} eq "NA") )
|
|
1600 {
|
|
1601 $ct_NonTr = 0;
|
|
1602 $ct_Tr = 0;
|
|
1603 }
|
|
1604 else
|
|
1605 {
|
|
1606 $ct_NonTr = $h_chi2{$k_file}{"C>T"}{'NonTr'};
|
|
1607 $ct_Tr = $h_chi2{$k_file}{"C>T"}{'Tr'};
|
|
1608 }
|
|
1609 # T>A
|
|
1610 my ($ta_NonTr, $ta_Tr) = (0,0);
|
|
1611 if( ($h_chi2{$k_file}{"T>A"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"T>A"}{'Tr'} eq "NA") )
|
|
1612 {
|
|
1613 $ta_NonTr = 0;
|
|
1614 $ta_Tr = 0;
|
|
1615 }
|
|
1616 else
|
|
1617 {
|
|
1618 $ta_NonTr = $h_chi2{$k_file}{"T>A"}{'NonTr'};
|
|
1619 $ta_Tr = $h_chi2{$k_file}{"T>A"}{'Tr'};
|
|
1620 }
|
|
1621 # T>C
|
|
1622 my ($tc_NonTr, $tc_Tr) = (0,0);
|
|
1623 if( ($h_chi2{$k_file}{"T>C"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"T>C"}{'Tr'} eq "NA") )
|
|
1624 {
|
|
1625 $tc_NonTr = 0;
|
|
1626 $tc_Tr = 0;
|
|
1627 }
|
|
1628 else
|
|
1629 {
|
|
1630 $tc_NonTr = $h_chi2{$k_file}{"T>C"}{'NonTr'};
|
|
1631 $tc_Tr = $h_chi2{$k_file}{"T>C"}{'Tr'};
|
|
1632 }
|
|
1633 # T>G
|
|
1634 my ($tg_NonTr, $tg_Tr) = (0,0);
|
|
1635 if( ($h_chi2{$k_file}{"T>G"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"T>G"}{'Tr'} eq "NA") )
|
|
1636 {
|
|
1637 $tg_NonTr = 0;
|
|
1638 $tg_Tr = 0;
|
|
1639 }
|
|
1640 else
|
|
1641 {
|
|
1642 $tg_NonTr = $h_chi2{$k_file}{"T>G"}{'NonTr'};
|
|
1643 $tg_Tr = $h_chi2{$k_file}{"T>G"}{'Tr'};
|
|
1644 }
|
|
1645
|
|
1646
|
|
1647
|
|
1648 # Create an input for representing the strand bias with ggplot2
|
|
1649 open(SB, ">", $strandBiasggplot2) or die "$!: $strandBiasggplot2\n";
|
|
1650 print SB "Alteration\tStrand\tCount\n";
|
|
1651
|
|
1652 print SB "C>A\tNonTranscribed\t".$ca_NonTr."\n"."C>A\tTranscribed\t".$ca_Tr."\n";
|
|
1653 print SB "C>G\tNonTranscribed\t".$cg_NonTr."\n"."C>G\tTranscribed\t".$cg_Tr."\n";
|
|
1654 print SB "C>T\tNonTranscribed\t".$ct_NonTr."\n"."C>T\tTranscribed\t".$ct_Tr."\n";
|
|
1655 print SB "T>A\tNonTranscribed\t".$ta_NonTr."\n"."T>A\tTranscribed\t".$ta_Tr."\n";
|
|
1656 print SB "T>C\tNonTranscribed\t".$tc_NonTr."\n"."T>C\tTranscribed\t".$tc_Tr."\n";
|
|
1657 print SB "T>G\tNonTranscribed\t".$tg_NonTr."\n"."T>G\tTranscribed\t".$tg_Tr."\n";
|
|
1658 close SB;
|
|
1659
|
|
1660
|
|
1661 ### Calcul the ratio for the strand bias and write it on the Excel file (Table 3)
|
|
1662 writeRatioSB($ca_NonTr, $ca_Tr, $ws, 30, $refH_file, $k_file, "C>A", "G>T", $format_A10, $table_left, $table_middleHeader, $table_right);
|
|
1663 writeRatioSB($cg_NonTr, $cg_Tr, $ws, 31, $refH_file, $k_file, "C>G", "G>C", $format_A10, $table_left, $table_middleHeader, $table_right);
|
|
1664 writeRatioSB($ct_NonTr, $ct_Tr, $ws, 32, $refH_file, $k_file, "C>T", "G>A", $format_A10, $table_left, $table_middleHeader, $table_right);
|
|
1665 writeRatioSB($ta_NonTr, $ta_Tr, $ws, 33, $refH_file, $k_file, "T>A", "A>T", $format_A10, $table_left, $table_middleHeader, $table_right);
|
|
1666 writeRatioSB($tc_NonTr, $tc_Tr, $ws, 34, $refH_file, $k_file, "T>C", "A>G", $format_A10, $table_left, $table_middleHeader, $table_right);
|
|
1667 writeRatioSB($tg_NonTr, $tg_Tr, $ws, 35, $refH_file, $k_file, "T>G", "A>C", $table_bottom, $table_bottomleft, $table_middleHeader2, $table_bottomRight);
|
|
1668
|
|
1669
|
|
1670 ### Write a warning message when NonTr+Tr < 10
|
|
1671 my $format_italic_red = $wb->add_format(font=>'Arial', size=>10, italic=>1, color => 'red');
|
|
1672
|
|
1673 if( (($ca_NonTr+$ca_Tr)< 10) || (($cg_NonTr+$cg_Tr)< 10) || (($ct_NonTr+$ct_Tr)< 10) || (($ta_NonTr+$ta_Tr)< 10) || (($tc_NonTr+$tc_Tr)< 10) || (($tg_NonTr+$tg_Tr)< 10) )
|
|
1674 {
|
|
1675 $ws->write(36, 11, "Warning message: chi-squared approximation may be incorrect because the number of SBS", $format_italic_red);
|
|
1676 $ws->write(37, 11, "on Non-transcribed and transcribed strand is lower than 10", $format_italic_red);
|
|
1677 }
|
|
1678 }
|
|
1679 # Write values in Table 3 (Sub function of writeChi2result)
|
|
1680 sub writeRatioSB
|
|
1681 {
|
|
1682 my ($count_NonTr, $count_Tr, $ws, $row, $refH_file, $k_file, $mut1, $mut2, $formatText, $formatTextMut1, $formatTextRatio, $formatTable) = @_;
|
|
1683
|
|
1684 my ($ratio_mut1, $ratio_mut2, $percent_NonTr, $percent_Tr) = (0, 0, 0, 0, 0);
|
|
1685 if( ($count_NonTr==0) || ($count_Tr==0) ) { $ratio_mut1 = 0; $ratio_mut2 = 0; $percent_NonTr = 0; $percent_Tr = 0; }
|
|
1686 else
|
|
1687 {
|
|
1688 $ratio_mut1 = $count_NonTr/$count_Tr; $ratio_mut1 = sprintf("%.2f", $ratio_mut1);
|
|
1689 $ratio_mut2 = $count_Tr/$count_NonTr; $ratio_mut2 = sprintf("%.2f", $ratio_mut2);
|
|
1690 $percent_NonTr = ($count_NonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
|
|
1691 $percent_Tr = ($count_Tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
|
|
1692 }
|
|
1693
|
|
1694 # C>N and T>N
|
|
1695 $ws->write($row, 11, $mut1, $formatTextMut1);
|
|
1696 $ws->write($row, 12, $ratio_mut1, $formatTextRatio);
|
|
1697 $ws->write($row, 13, $count_NonTr, $formatText);
|
|
1698 $ws->write($row, 14, $count_Tr, $formatText);
|
|
1699 # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10
|
|
1700 if( ($count_NonTr+$count_Tr) < 10 )
|
|
1701 {
|
|
1702 if($mut1 eq "T>G")
|
|
1703 {
|
|
1704 if(! exists $h_chi2{$k_file}{$mut1}{'p-value'}) { $ws->write_string($row, 15, ""); }
|
|
1705 elsif($h_chi2{$k_file}{$mut1}{'p-value'} eq "NA") { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $table_bottom); }
|
|
1706 else { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $table_bottomItalicRed); }
|
|
1707 }
|
|
1708 else
|
|
1709 {
|
|
1710 if(! exists $h_chi2{$k_file}{$mut1}{'p-value'}) { $ws->write_string($row, 15, ""); }
|
|
1711 elsif($h_chi2{$k_file}{$mut1}{'p-value'} eq "NA") { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $formatText); }
|
|
1712 else { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $format_A10ItalicRed); }
|
|
1713 }
|
|
1714 }
|
|
1715 else
|
|
1716 {
|
|
1717 $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $formatText);
|
|
1718 }
|
|
1719
|
|
1720 $ws->write($row, 16, $h_chi2{$k_file}{$mut1}{'FDR'}, $formatText);
|
|
1721 $ws->write($row, 17, $h_chi2{$k_file}{$mut1}{'ConfInt'}, $formatTable);
|
|
1722
|
|
1723
|
|
1724 # G>N and A>N
|
|
1725 $ws->write($row+11, 11, $mut2, $formatTextMut1);
|
|
1726 $ws->write($row+11, 12, $ratio_mut2, $formatTextRatio);
|
|
1727 $ws->write($row+11, 13, $count_Tr, $formatText);
|
|
1728 $ws->write($row+11, 14, $count_NonTr, $formatText);
|
|
1729 if( ($count_NonTr+$count_Tr) < 10 )
|
|
1730 {
|
|
1731 if($mut1 eq "T>G")
|
|
1732 {
|
|
1733 if(! exists $h_chi2{$k_file}{$mut1}{'p-value'}) { $ws->write_string($row+11, 15, ""); }
|
|
1734 elsif($h_chi2{$k_file}{$mut1}{'p-value'} eq "NA") { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $table_bottom); }
|
|
1735 else { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $table_bottomItalicRed); }
|
|
1736 }
|
|
1737 else
|
|
1738 {
|
|
1739 if(! exists $h_chi2{$k_file}{$mut1}{'p-value'}) { $ws->write_string($row+11, 15, ""); }
|
|
1740 elsif($h_chi2{$k_file}{$mut1}{'p-value'} eq "NA") { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $formatText); }
|
|
1741 else { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $format_A10ItalicRed); }
|
|
1742 }
|
|
1743 }
|
|
1744 else
|
|
1745 {
|
|
1746 $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $formatText);
|
|
1747 }
|
|
1748
|
|
1749 $ws->write($row+11, 16, $h_chi2{$k_file}{$mut1}{'FDR'}, $formatText);
|
|
1750 $ws->write($row+11, 17, $h_chi2{$k_file}{$mut1}{'ConfInt'}, $formatTable);
|
|
1751 }
|
|
1752
|
|
1753
|
|
1754 ############################################################################################################
|
|
1755 # SBS distribution by functional region (Table 4) & Strand bias by functional region (Table 5) & Overall count and percent of SBS (Table 1)
|
|
1756 sub writeStatbyFuncRegion
|
|
1757 {
|
|
1758 my ($refH_file, $sample, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $ref_RowSBSDistrBySegAndFuncCG, $mutDistrggplot2) = @_;
|
|
1759
|
|
1760 my $row_SBSdistrBySeg = $rowStart_SBSdistrBySeg+4;
|
|
1761 my $row_SBSDistrBySegAndFunc_CA = $rowStart_SBSdistrBySeg + $nb_func + 12;
|
|
1762 my $rowEndCG_SBSDistrBySegAndFunc_CG = $$ref_RowSBSDistrBySegAndFuncCG + $nb_func;
|
|
1763 my $row_SBSDistrBySegAndFunc_CT = $rowStart_SBSdistrBySeg + ($nb_func*3) + 20;
|
|
1764 my $colTable4 = 0;
|
|
1765
|
|
1766 my ($count_ca, $count_cg, $count_ct, $count_ta, $count_tc, $count_tg) = (0,0,0,0,0,0);
|
|
1767
|
|
1768
|
|
1769 ## 6 mutation types by segment
|
|
1770 foreach my $k_func (sort keys %{$refH_file->{$sample}{'6mutType'}})
|
|
1771 {
|
|
1772 my $totalSBS_bySegment = 0;
|
|
1773
|
|
1774 # Write the functional region for the section SBS distribution by segment (Table 4)
|
|
1775 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg, $k_func, $formatT_left);
|
|
1776 # Write the exonic func for the section strand bias by segment (Table 5)
|
|
1777 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg, $k_func, $formatT_left);
|
|
1778 # Write the last functional element in the table
|
|
1779 if($$ref_RowSBSDistrBySegAndFuncCG == $rowEndCG_SBSDistrBySegAndFunc_CG)
|
|
1780 {
|
|
1781 $ws->write($$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg, $k_func, $formatT_bottomLeft);
|
|
1782 }
|
|
1783 else
|
|
1784 {
|
|
1785 $ws->write($$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg, $k_func, $formatT_left);
|
|
1786 }
|
|
1787
|
|
1788 foreach my $k_mutation (sort keys %{$refH_file->{$sample}{'6mutType'}{$k_func}})
|
|
1789 {
|
|
1790 ### Write the count of SBS per mutation on genomic (Table 4) and coding strand (Table 5)
|
|
1791 if($k_mutation eq "C:G>A:T")
|
|
1792 {
|
|
1793 writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+3, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_ca);
|
|
1794 }
|
|
1795 if($k_mutation eq "C:G>G:C")
|
|
1796 {
|
|
1797 writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+4, $colStart_SBSdistrBySeg+5, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_cg);
|
|
1798 }
|
|
1799 if($k_mutation eq "C:G>T:A")
|
|
1800 {
|
|
1801 writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+8, $colStart_SBSdistrBySeg+7, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_ct);
|
|
1802 }
|
|
1803 if($k_mutation eq "T:A>A:T")
|
|
1804 {
|
|
1805 writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+9, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_ta);
|
|
1806 }
|
|
1807 if($k_mutation eq "T:A>C:G")
|
|
1808 {
|
|
1809 writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg+4, $colStart_SBSdistrBySeg+11, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_tc);
|
|
1810 }
|
|
1811 if($k_mutation eq "T:A>G:C")
|
|
1812 {
|
|
1813 writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg+8, $colStart_SBSdistrBySeg+13, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_tg);
|
|
1814 }
|
|
1815
|
|
1816 # Calculate the total number of SBS on the genomic strand for each mutation types by exonic region
|
|
1817 $totalSBS_bySegment += $refH_file->{$sample}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
|
|
1818 } # End $k_mutation
|
|
1819
|
|
1820 $row_SBSDistrBySegAndFunc_CA++; $$ref_RowSBSDistrBySegAndFuncCG++; #$row_SBSDistrBySegAndFunc_CT++;
|
|
1821
|
|
1822 # Write the percent by exonic region
|
|
1823 writePercentSBS($refH_file, $sample, $k_func, "C:G>A:T", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+2, $ws, $totalSBS_bySegment);
|
|
1824 writePercentSBS($refH_file, $sample, $k_func, "C:G>G:C", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+4, $ws, $totalSBS_bySegment);
|
|
1825 writePercentSBS($refH_file, $sample, $k_func, "C:G>T:A", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+6, $ws, $totalSBS_bySegment);
|
|
1826 writePercentSBS($refH_file, $sample, $k_func, "T:A>A:T", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+8, $ws, $totalSBS_bySegment);
|
|
1827 writePercentSBS($refH_file, $sample, $k_func, "T:A>C:G", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+10, $ws, $totalSBS_bySegment);
|
|
1828 writePercentSBS($refH_file, $sample, $k_func, "T:A>G:C", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+12, $ws, $totalSBS_bySegment);
|
|
1829
|
|
1830 # Write the count of SBS by segment
|
|
1831 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+1, $totalSBS_bySegment, $format_A10);
|
|
1832
|
|
1833 $row_SBSdistrBySeg++;
|
|
1834 } # End $k_func
|
|
1835
|
|
1836
|
|
1837 # Write the total number of SBS on the genomic strand (Table 4)
|
|
1838 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+1, $refH_file->{$sample}{'TotalSBSGenomic'}, $formatT_bottomHeader);
|
|
1839
|
|
1840
|
|
1841 ##### Calculate the total percentages by mutation type
|
|
1842 my $percent_ca = ($count_ca / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_ca = sprintf("%.2f", $percent_ca);
|
|
1843 my $percent_cg = ($count_cg / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_cg = sprintf("%.2f", $percent_cg);
|
|
1844 my $percent_ct = ($count_ct / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_ct = sprintf("%.2f", $percent_ct);
|
|
1845 my $percent_ta = ($count_ta / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_ta = sprintf("%.2f", $percent_ta);
|
|
1846 my $percent_tc = ($count_tc / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_tc = sprintf("%.2f", $percent_tc);
|
|
1847 my $percent_tg = ($count_tg / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_tg = sprintf("%.2f", $percent_tg);
|
|
1848
|
|
1849
|
|
1850 # Write the total percentage (Table 4)
|
|
1851 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+2, $percent_ca, $formatT_bottom);
|
|
1852 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+3, $count_ca, $formatT_bottomHeader);
|
|
1853 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+4, $percent_cg, $formatT_bottom);
|
|
1854 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+5, $count_cg, $formatT_bottomHeader);
|
|
1855 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+6, $percent_ct, $formatT_bottom);
|
|
1856 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+7, $count_ct, $formatT_bottomHeader);
|
|
1857 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+8, $percent_ta, $formatT_bottom);
|
|
1858 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+9, $count_ta, $formatT_bottomHeader);
|
|
1859 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+10, $percent_tc, $formatT_bottom);
|
|
1860 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+11, $count_tc, $formatT_bottomHeader);
|
|
1861 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+12, $percent_tg, $formatT_bottom);
|
|
1862 $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $count_tg, $formatT_bottomRightHeader);
|
|
1863
|
|
1864
|
|
1865
|
|
1866 # Overall distribution of SBS (Table 1)
|
|
1867 $ws->write(0, 0, "Graph 1. SBS distribution", $formatT_graphTitle); $ws->set_row(0, 18);
|
|
1868 $ws->write(29, 0, "Table 1. Frequency and counts of all SBS", $format_A10Boldleft);
|
|
1869 $ws->write(30, 0, "Mutation type", $table_topleft);
|
|
1870 $ws->write(30, 1, "Percentage", $table_top);
|
|
1871 $ws->write(30, 2, "Count", $table_topRight);
|
|
1872 $ws->write(31, 0, "C:G>A:T", $table_left); $ws->write(31, 1, $percent_ca, $format_A10); $ws->write(31, 2, $count_ca, $table_right);
|
|
1873 $ws->write(32, 0, "C:G>G:C", $table_left); $ws->write(32, 1, $percent_cg, $format_A10); $ws->write(32, 2, $count_cg, $table_right);
|
|
1874 $ws->write(33, 0, "C:G>T:A", $table_left); $ws->write(33, 1, $percent_ct, $format_A10); $ws->write(33, 2, $count_ct, $table_right);
|
|
1875 $ws->write(34, 0, "T:A>A:T", $table_left); $ws->write(34, 1, $percent_ta, $format_A10); $ws->write(34, 2, $count_ta, $table_right);
|
|
1876 $ws->write(35, 0, "T:A>C:G", $table_left); $ws->write(35, 1, $percent_tc, $format_A10); $ws->write(35, 2, $count_tc, $table_right);
|
|
1877 $ws->write(36, 0, "T:A>G:C", $table_left); $ws->write(36, 1, $percent_tg, $format_A10); $ws->write(36, 2, $count_tg, $table_right);
|
|
1878 $ws->write(37, 0, "", $table_bottomleft); $ws->write(37, 1, "", $table_bottom); $ws->write(37, 2, $refH_file->{$sample}{'TotalSBSGenomic'}, $table_bottomrightHeader);
|
|
1879
|
|
1880 # Create an input for ggplot2 for representing the distribution of SBS for each mutation types (Figure 1)
|
|
1881 open(DISTRSBS, ">", $mutDistrggplot2) or die "$!: $mutDistrggplot2\n";
|
|
1882 print DISTRSBS "Mutation_Type\tCount\tPercentage\tSample\n";
|
|
1883 print DISTRSBS "C:G>A:T\t$count_ca\t$percent_ca\t$sample\n";
|
|
1884 print DISTRSBS "C:G>G:C\t$count_cg\t$percent_cg\t$sample\n";
|
|
1885 print DISTRSBS "C:G>T:A\t$count_ct\t$percent_ct\t$sample\n";
|
|
1886 print DISTRSBS "T:A>A:T\t$count_ta\t$percent_ta\t$sample\n";
|
|
1887 print DISTRSBS "T:A>C:G\t$count_tc\t$percent_tc\t$sample\n";
|
|
1888 print DISTRSBS "T:A>G:C\t$count_tg\t$percent_tg\t$sample\n";
|
|
1889 close DISTRSBS;
|
|
1890 }
|
|
1891 # Write the percentage in table 4 of the Excel report (Sub function of writeStatbyFuncRegion)
|
|
1892 sub writePercentSBS
|
|
1893 {
|
|
1894 my ($refH_file, $k_file, $k_func, $mutation, $row, $col, $ws, $totalSBS) = @_;
|
|
1895
|
|
1896 my $percent = 0;
|
|
1897 if($refH_file->{$k_file}{'6mutType'}{$k_func}{$mutation}{'TotalMutG'} == 0)
|
|
1898 {
|
|
1899 $percent = 0;
|
|
1900 }
|
|
1901 else
|
|
1902 {
|
|
1903 $percent = ($refH_file->{$k_file}{'6mutType'}{$k_func}{$mutation}{'TotalMutG'} / $totalSBS ) * 100;
|
|
1904 $percent = sprintf("%.2f", $percent);
|
|
1905 }
|
|
1906 $ws->write($row, $col, $percent, $format_A10);
|
|
1907 }
|
|
1908 # Write the count in table 4 and table 5 of the Excel report (Sub function of writeStatbyFuncRegion)
|
|
1909 sub writeCountSBS
|
|
1910 {
|
|
1911 my ($refH_file, $k_file, $k_func, $k_mutation, $ws, $row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg, $colTable4, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, $refCount) = @_;
|
|
1912
|
|
1913 my $ratioSB = 0;
|
|
1914 if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) )
|
|
1915 {
|
|
1916 $ratioSB = 0;
|
|
1917 }
|
|
1918 else
|
|
1919 {
|
|
1920 $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
|
|
1921 }
|
|
1922 $ratioSB = sprintf("%.2f", $ratioSB);
|
|
1923
|
|
1924
|
|
1925 if($row_SBSDistrBySegAndFunc_CA == $rowEndCG_SBSDistrBySegAndFunc_CG)
|
|
1926 {
|
|
1927 # Write the ratio of NonTr / Tr (Table 5)
|
|
1928 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+1, $ratioSB, $formatT_bottom);
|
|
1929 # Write the count of SBS in the NonTr and Tr strand
|
|
1930 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $formatT_bottom);
|
|
1931 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottom);
|
|
1932
|
|
1933 if( ($k_mutation eq "C:G>T:A") || ($k_mutation eq "T:A>G:C") )
|
|
1934 {
|
|
1935 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottomRight);
|
|
1936 }
|
|
1937 }
|
|
1938 else
|
|
1939 {
|
|
1940 # Write the ratio of NonTr / Tr (Table 5)
|
|
1941 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+1, $ratioSB, $format_A10);
|
|
1942 # Write the count of SBS in the NonTr and Tr strand
|
|
1943 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10);
|
|
1944 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10);
|
|
1945
|
|
1946 if( ($k_mutation eq "C:G>T:A") || ($k_mutation eq "T:A>G:C") )
|
|
1947 {
|
|
1948 $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_right);
|
|
1949 }
|
|
1950 }
|
|
1951
|
|
1952 if($k_mutation eq "C:G>A:T")
|
|
1953 {
|
|
1954 # Calculate the total number of SBS per mut type (genomic strand)
|
|
1955 $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
|
|
1956 }
|
|
1957 elsif($k_mutation eq "C:G>G:C")
|
|
1958 {
|
|
1959 $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
|
|
1960 }
|
|
1961 elsif($k_mutation eq "C:G>T:A")
|
|
1962 {
|
|
1963 $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
|
|
1964 }
|
|
1965 elsif($k_mutation eq "T:A>A:T")
|
|
1966 {
|
|
1967 $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
|
|
1968 }
|
|
1969 elsif($k_mutation eq "T:A>C:G")
|
|
1970 {
|
|
1971 $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
|
|
1972 }
|
|
1973 elsif($k_mutation eq "T:A>G:C")
|
|
1974 {
|
|
1975 $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'};
|
|
1976 }
|
|
1977
|
|
1978
|
|
1979 # Write the count by exonic region (Table 4)
|
|
1980 $ws->write($row_SBSdistrBySeg, $colTable4, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10);
|
|
1981
|
|
1982 if($k_mutation eq "T:A>G:C")
|
|
1983 {
|
|
1984 $ws->write($row_SBSdistrBySeg, $colTable4, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $formatT_right);
|
|
1985 }
|
|
1986 }
|
|
1987
|
|
1988
|
|
1989 ############################################################################################################
|
|
1990 # SBS distribution by chromosomes (Table 6)
|
|
1991 sub writeDistrByChr
|
|
1992 {
|
|
1993 my ($ws, $refH_file, $sample, $row, $col, $distrByChrggplot2) = @_;
|
|
1994
|
|
1995
|
|
1996 # For the HTML report
|
|
1997 open(SBSPerChr, ">", $distrByChrggplot2) or die "$!: $distrByChrggplot2\n";
|
|
1998 print SBSPerChr "\tPearson\t$refH_file->{$sample}{'SBSPerChr'}{'AllMutType'}\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>A:T"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>G:C"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>T:A"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>A:T"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>C:G"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>G:C"}{'Pearson'},"\n";
|
|
1999 print SBSPerChr "Chr\tSize\tAll SBS\tC:G>A:T\tC:G>G:C\tC:G>T:A\tT:A>A:T\tT:A>C:G\tT:A>G:C\n";
|
|
2000
|
|
2001
|
|
2002 my $row_SBSPerChr = $row + 8;
|
|
2003
|
|
2004
|
|
2005 # Write the Pearson coefficient
|
|
2006 $ws->write($row+6, $col+3, $refH_file->{$sample}{'SBSPerChr'}{"C:G>A:T"}{'Pearson'}, $format_A10);
|
|
2007 $ws->write($row+6, $col+4, $refH_file->{$sample}{'SBSPerChr'}{"C:G>G:C"}{'Pearson'}, $format_A10);
|
|
2008 $ws->write($row+6, $col+5, $refH_file->{$sample}{'SBSPerChr'}{"C:G>T:A"}{'Pearson'}, $format_A10);
|
|
2009 $ws->write($row+6, $col+6, $refH_file->{$sample}{'SBSPerChr'}{"T:A>A:T"}{'Pearson'}, $format_A10);
|
|
2010 $ws->write($row+6, $col+7, $refH_file->{$sample}{'SBSPerChr'}{"T:A>C:G"}{'Pearson'}, $format_A10);
|
|
2011 $ws->write($row+6, $col+8, $refH_file->{$sample}{'SBSPerChr'}{"T:A>G:C"}{'Pearson'}, $formatT_right);
|
|
2012
|
|
2013 # Write the chromosome number and their sizes / Write count SBS per chromosomes
|
|
2014 my $line=0;
|
|
2015
|
|
2016 foreach my $chromosome (sort keys %chromosomes)
|
|
2017 {
|
|
2018 $ws->write($row_SBSPerChr+($line), $col, $chromosome, $formatT_left);
|
|
2019 $ws->write($row_SBSPerChr+($line), $col+1, $chromosomes{$chromosome}, $format_A10);
|
|
2020 $ws->write($row_SBSPerChr+($line), $col+2, $refH_file->{$sample}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'}, $format_A10);
|
|
2021
|
|
2022 # Write the count per mutation
|
|
2023 $ws->write($row_SBSPerChr+($line), $col+3, $refH_file->{$sample}{'SBSPerChr'}{"C:G>A:T"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
|
|
2024 $ws->write($row_SBSPerChr+($line), $col+4, $refH_file->{$sample}{'SBSPerChr'}{"C:G>G:C"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
|
|
2025 $ws->write($row_SBSPerChr+($line), $col+5, $refH_file->{$sample}{'SBSPerChr'}{"C:G>T:A"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
|
|
2026 $ws->write($row_SBSPerChr+($line), $col+6, $refH_file->{$sample}{'SBSPerChr'}{"T:A>A:T"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
|
|
2027 $ws->write($row_SBSPerChr+($line), $col+7, $refH_file->{$sample}{'SBSPerChr'}{"T:A>C:G"}{'CHR'}{$chromosome}{'chr'}, $format_A10);
|
|
2028 $ws->write($row_SBSPerChr+($line), $col+8, $refH_file->{$sample}{'SBSPerChr'}{"T:A>G:C"}{'CHR'}{$chromosome}{'chr'}, $formatT_right);
|
|
2029
|
|
2030
|
|
2031 # For the HTML report
|
|
2032 print SBSPerChr "$chromosome\t", $chromosomes{$chromosome},"\t", $refH_file->{$sample}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>A:T"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>G:C"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>T:A"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>A:T"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>C:G"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>G:C"}{'CHR'}{$chromosome}{'chr'},"\n";
|
|
2033
|
|
2034 $line++;
|
|
2035 }
|
|
2036
|
|
2037 # Write the Pearson coefficient for the total number of SBS
|
|
2038 $ws->write($row+6, $col+2, $refH_file->{$sample}{'SBSPerChr'}{'AllMutType'}, $format_A10);
|
|
2039 $ws->write($row_SBSPerChr+(keys %chromosomes), $col+2, $refH_file->{$sample}{'TotalSBSGenomic'}, $formatT_bottomHeader);
|
|
2040
|
|
2041 print SBSPerChr "\t\t$refH_file->{$sample}{'TotalSBSGenomic'}\n";
|
|
2042 close SBSPerChr;
|
|
2043 }
|
|
2044
|
|
2045
|
|
2046 ############################################################################################################
|
|
2047 # Trinucleotide sequence context on genomic strand (Panel 1)
|
|
2048 sub writeTriNtGenomic
|
|
2049 {
|
|
2050 my ($ws, $refH_file, $sample, $col, $heatmapCountggplot2, $heatmapPercentggplot2, $triNtBarChartggplot2, $ref_c_ca6_g, $ref_c_cg6_g, $ref_c_ct6_g, $ref_c_ta6_g, $ref_c_tc6_g, $ref_c_tg6_g) = @_;
|
|
2051
|
|
2052 # Initialise the row of the panel 1
|
|
2053 my $row_SeqContext6 = 4;
|
|
2054 # Percent total of mutations for 6 mutation types on genomic strand
|
|
2055 my ($p_ca6_g, $p_cg6_g, $p_ct6_g, $p_ta6_g, $p_tc6_g, $p_tg6_g) = (0,0,0, 0,0,0);
|
|
2056 my $maxValue = 0; # For the heatmap
|
|
2057
|
|
2058 # For checking if the total number of SBS is correct
|
|
2059 my $total_SBS_genomic = 0;
|
|
2060
|
|
2061
|
|
2062 open(HEATMAPCGENOMIC, ">", $heatmapCountggplot2) or die "$!: $heatmapCountggplot2\n";
|
|
2063 print HEATMAPCGENOMIC "\tC>A\tC>G\tC>T\tT>A\tT>C\tT>G\n";
|
|
2064 open(HEATMAPPGENOMIC, ">", $heatmapPercentggplot2) or die "$!: $heatmapPercentggplot2\n";
|
|
2065 print HEATMAPPGENOMIC "\tC>A\tC>G\tC>T\tT>A\tT>C\tT>G\n";
|
|
2066
|
|
2067 ## Bar plot NMF like
|
|
2068 open(BARPLOTNMFLIKE, ">", $triNtBarChartggplot2) or die "$!: $triNtBarChartggplot2\n";
|
|
2069 print BARPLOTNMFLIKE "alteration\tcontext\tvalue\n";
|
|
2070
|
|
2071
|
|
2072 foreach my $k_context (sort keys %{$refH_file->{$sample}{'SeqContextG'}})
|
|
2073 {
|
|
2074 if( ($k_context =~ /N/) || (length($k_context) != 3) ) { next; }
|
|
2075
|
|
2076 # Write the context: 6 mut type on genomic strand
|
|
2077 $ws->write($row_SeqContext6 , $col+3, $k_context, $format_A10);
|
|
2078 $ws->write($row_SeqContext6 , $col+13, $k_context, $format_A10);
|
|
2079
|
|
2080 # Count for the heatmap
|
|
2081 print HEATMAPCGENOMIC $k_context."\t";
|
|
2082 print HEATMAPPGENOMIC $k_context."\t";
|
|
2083
|
|
2084 foreach my $k_mutation (sort keys %{$refH_file->{$sample}{'SeqContextG'}{$k_context}})
|
|
2085 {
|
|
2086 # For checking the total number of SBS
|
|
2087 $total_SBS_genomic += $refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation};
|
|
2088
|
|
2089 # Calculate the percentages
|
|
2090 my $percent = 0;
|
|
2091 if($refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation} == 0) { $percent = 0; }
|
|
2092 else
|
|
2093 {
|
|
2094 $percent = ($refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation} / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100;
|
|
2095 $percent = sprintf("%.2f", $percent);
|
|
2096 }
|
|
2097
|
|
2098 # For representing the sequence context with a bar plot (NMF like style)
|
|
2099 print BARPLOTNMFLIKE $k_mutation,"\t", $k_context,"\t", $percent,"\n";
|
|
2100
|
|
2101 # Write the count for the heatmap
|
|
2102 print HEATMAPCGENOMIC $refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation}."\t";
|
|
2103 print HEATMAPPGENOMIC "$percent\t";
|
|
2104
|
|
2105
|
|
2106 # For NMF input
|
|
2107 my $count = $refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation};
|
|
2108 if($sample ne "Pool_Data") { push(@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}}, $count); }
|
|
2109 if($sample ne "Pool_Data") { push(@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}, $percent); }
|
|
2110
|
|
2111
|
|
2112 if($k_mutation eq "C>A")
|
|
2113 {
|
|
2114 triNtByMut($ws, $row_SeqContext6, $col+4, $col+14, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_ca6_g, \$p_ca6_g);
|
|
2115 }
|
|
2116 elsif($k_mutation eq "C>G")
|
|
2117 {
|
|
2118 triNtByMut($ws, $row_SeqContext6, $col+5, $col+15, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_cg6_g, \$p_cg6_g);
|
|
2119 }
|
|
2120 elsif($k_mutation eq "C>T")
|
|
2121 {
|
|
2122 triNtByMut($ws, $row_SeqContext6, $col+6, $col+16, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_ct6_g, \$p_ct6_g);
|
|
2123 }
|
|
2124 elsif($k_mutation eq "T>A")
|
|
2125 {
|
|
2126 triNtByMut($ws, $row_SeqContext6, $col+7, $col+17, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_ta6_g, \$p_ta6_g);
|
|
2127 }
|
|
2128 elsif($k_mutation eq "T>C")
|
|
2129 {
|
|
2130 triNtByMut($ws, $row_SeqContext6, $col+8, $col+18, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_tc6_g, \$p_tc6_g);
|
|
2131 }
|
|
2132 elsif($k_mutation eq "T>G")
|
|
2133 {
|
|
2134 triNtByMut($ws, $row_SeqContext6, $col+9, $col+19, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_tg6_g, \$p_tg6_g);
|
|
2135 }
|
|
2136 else
|
|
2137 {
|
|
2138 print STDERR "Error: Mutation type not considered for: $k_mutation\n";
|
|
2139 exit;
|
|
2140 }
|
|
2141 }
|
|
2142 $row_SeqContext6++;
|
|
2143
|
|
2144 print HEATMAPCGENOMIC "\n";
|
|
2145 print HEATMAPPGENOMIC "\n";
|
|
2146 }
|
|
2147 close HEATMAPCGENOMIC; close HEATMAPPGENOMIC;
|
|
2148 close BARPLOTNMFLIKE;
|
|
2149
|
|
2150
|
|
2151 # Write the total number of SBS per mutation type: COUNT
|
|
2152 $ws->write($row_SeqContext6, $col+4, $$ref_c_ca6_g, $formatT_bottomHeader2);
|
|
2153 $ws->write($row_SeqContext6, $col+5, $$ref_c_cg6_g, $formatT_bottomHeader2);
|
|
2154 $ws->write($row_SeqContext6, $col+6, $$ref_c_ct6_g, $formatT_bottomHeader2);
|
|
2155 $ws->write($row_SeqContext6, $col+7, $$ref_c_ta6_g, $formatT_bottomHeader2);
|
|
2156 $ws->write($row_SeqContext6, $col+8, $$ref_c_tc6_g, $formatT_bottomHeader2);
|
|
2157 $ws->write($row_SeqContext6, $col+9, $$ref_c_tg6_g, $formatT_bottomHeader2);
|
|
2158 if($total_SBS_genomic != $refH_file->{$sample}{'TotalSBSGenomic'})
|
|
2159 {
|
|
2160 print STDERR "Error in the calculation of the total number of SBS on the genomic strand!!!!\n";
|
|
2161 print STDERR "From hash table $refH_file->{$sample}{'TotalSBSGenomic'}\tVS\t$total_SBS_genomic\n";
|
|
2162 exit;
|
|
2163 }
|
|
2164
|
|
2165 # Write the total number of SBS per mutation type: PERCENT
|
|
2166 $ws->write($row_SeqContext6, $col+14, $p_ca6_g, $formatT_bottomHeader2);
|
|
2167 $ws->write($row_SeqContext6, $col+15, $p_cg6_g, $formatT_bottomHeader2);
|
|
2168 $ws->write($row_SeqContext6, $col+16, $p_ct6_g, $formatT_bottomHeader2);
|
|
2169 $ws->write($row_SeqContext6, $col+17, $p_ta6_g, $formatT_bottomHeader2);
|
|
2170 $ws->write($row_SeqContext6, $col+18, $p_tc6_g, $formatT_bottomHeader2);
|
|
2171 $ws->write($row_SeqContext6, $col+19, $p_tg6_g, $formatT_bottomHeader2);
|
|
2172
|
|
2173 my $totalPercent_genomic = $p_ca6_g + $p_cg6_g + $p_ct6_g + $p_ta6_g + $p_tc6_g + $p_tg6_g;
|
|
2174 $totalPercent_genomic = sprintf("%.0f", $totalPercent_genomic);
|
|
2175
|
|
2176 if($totalPercent_genomic != 100)
|
|
2177 {
|
|
2178 print STDERR "Error in the calculation of the total percentages on the genomic strand!!!\n";
|
|
2179 print STDERR "The total is equal to=\t$totalPercent_genomic\n";
|
|
2180 exit;
|
|
2181 }
|
|
2182 }
|
|
2183 # Trinucleotide count and percentage by mutation type (Sub function of writeTriNtGenomic)
|
|
2184 sub triNtByMut
|
|
2185 {
|
|
2186 my ($ws, $row, $colC, $colP, $refH_file, $sample, $context, $mutation, $percent, $maxValue, $refCountG, $refPercentG) = @_;
|
|
2187
|
|
2188 ### COUNT
|
|
2189 $ws->write($row, $colC, $refH_file->{$sample}{'SeqContextG'}{$context}{$mutation}, $format_A10);
|
|
2190
|
|
2191 ### PERCENTAGE
|
|
2192 $ws->write($row, $colP, $percent, $format_A10);
|
|
2193
|
|
2194 # For the heatmap
|
|
2195 if($percent >= $maxValue) { $maxValue = $percent; }
|
|
2196
|
|
2197 # For the total amount per mutation types
|
|
2198 $$refCountG += $refH_file->{$sample}{'SeqContextG'}{$context}{$mutation};
|
|
2199 $$refPercentG += $percent;
|
|
2200 }
|
|
2201
|
|
2202
|
|
2203 ############################################################################################################
|
|
2204 # Trinucleotide sequence context on coding strand (Panel 2)
|
|
2205 sub writeTriNtCoding
|
|
2206 {
|
|
2207 my ($ws, $row, $col, $refH_file, $sample, $triNtBarChartCodingCountggplot2, $triNtBarChartCodingPercentggplot2) = @_;
|
|
2208
|
|
2209 # Initialise the row
|
|
2210 my $row_SeqContext12 = $row+6;
|
|
2211 my $row_SeqContext12Percent = $row+27;
|
|
2212
|
|
2213 # Total count and percent calculated for the strand bias
|
|
2214 my ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0, 0,0,0, 0,0,0, 0,0,0);
|
|
2215 my ($percent_ca_NonTr, $percent_ca_Tr, $percent_cg_NonTr, $percent_cg_Tr, $percent_ct_NonTr, $percent_ct_Tr, $percent_ta_NonTr, $percent_ta_Tr, $percent_tc_NonTr, $percent_tc_Tr, $percent_tg_NonTr, $percent_tg_Tr) = (0,0,0, 0,0,0, 0,0,0, 0,0,0);
|
|
2216
|
|
2217 # For checking if the total number of SBS is correct
|
|
2218 my $total_SBS_coding = 0;
|
|
2219
|
|
2220
|
|
2221 open(COUNT, ">", $triNtBarChartCodingCountggplot2) or die "$!: $triNtBarChartCodingCountggplot2\n";
|
|
2222 print COUNT "MutationTypeContext\tStrand\tValue\tSample\n";
|
|
2223 open(PERCENT, ">", $triNtBarChartCodingPercentggplot2) or die "$!: $triNtBarChartCodingPercentggplot2\n";
|
|
2224 print PERCENT "MutationTypeContext\tStrand\tValue\tSample\n";
|
|
2225
|
|
2226 foreach my $k_context (sort keys %{$refH_file->{$sample}{'SeqContextC'}})
|
|
2227 {
|
|
2228 if( ($k_context =~ /N/) || (length($k_context) != 3) ) { next; }
|
|
2229
|
|
2230 # Write the context: 12 mut type on coding strand
|
|
2231 $ws->write($row_SeqContext12 , $col, $k_context, $formatT_left);
|
|
2232 $ws->write($row_SeqContext12Percent , $col, $k_context, $formatT_left);
|
|
2233
|
|
2234 foreach my $k_mutation (sort keys %{$refH_file->{$sample}{'SeqContextC'}{$k_context}})
|
|
2235 {
|
|
2236 # Percent: 12 mut type on coding strand
|
|
2237 my ($percent_NonTr, $percent_Tr) = (0, 0);
|
|
2238
|
|
2239 if($refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} != 0)
|
|
2240 {
|
|
2241 $percent_NonTr = ( $refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} / $refH_file->{$sample}{'TotalSBSCoding'} ) * 100;
|
|
2242 }
|
|
2243
|
|
2244 if($refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'} != 0)
|
|
2245 {
|
|
2246 $percent_Tr = ( $refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'} / $refH_file->{$sample}{'TotalSBSCoding'} ) * 100;
|
|
2247 }
|
|
2248
|
|
2249
|
|
2250 # Counts
|
|
2251 print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$sample\n";
|
|
2252 print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$sample\n";
|
|
2253
|
|
2254 # Percentages
|
|
2255 $percent_NonTr = sprintf("%.2f", $percent_NonTr);
|
|
2256 $percent_Tr = sprintf("%.2f", $percent_Tr);
|
|
2257 print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$sample\n";
|
|
2258 print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$sample\n";
|
|
2259
|
|
2260 # Calculate the total number for each mutation types
|
|
2261 if($k_mutation eq "C>A")
|
|
2262 {
|
|
2263 triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+1, $row_SeqContext12Percent, \$ca_NonTr, \$ca_Tr, $percent_NonTr, $percent_Tr);
|
|
2264
|
|
2265 $percent_ca_NonTr += $percent_NonTr;
|
|
2266 $percent_ca_Tr += $percent_Tr;
|
|
2267 }
|
|
2268 if($k_mutation eq "C>G")
|
|
2269 {
|
|
2270 triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+3, $row_SeqContext12Percent, \$cg_NonTr, \$cg_Tr, $percent_NonTr, $percent_Tr);
|
|
2271
|
|
2272 $percent_cg_NonTr += $percent_NonTr;
|
|
2273 $percent_cg_Tr += $percent_Tr;
|
|
2274 }
|
|
2275 if($k_mutation eq "C>T")
|
|
2276 {
|
|
2277 triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+5, $row_SeqContext12Percent, \$ct_NonTr, \$ct_Tr, $percent_NonTr, $percent_Tr);
|
|
2278
|
|
2279 $percent_ct_NonTr += $percent_NonTr;
|
|
2280 $percent_ct_Tr += $percent_Tr;
|
|
2281 }
|
|
2282 if($k_mutation eq "T>A")
|
|
2283 {
|
|
2284 triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+7, $row_SeqContext12Percent, \$ta_NonTr, \$ta_Tr, $percent_NonTr, $percent_Tr);
|
|
2285
|
|
2286 $percent_ta_NonTr += $percent_NonTr;
|
|
2287 $percent_ta_Tr += $percent_Tr;
|
|
2288 }
|
|
2289 if($k_mutation eq "T>C")
|
|
2290 {
|
|
2291 triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+9, $row_SeqContext12Percent, \$tc_NonTr, \$tc_Tr, $percent_NonTr, $percent_Tr);
|
|
2292
|
|
2293 $percent_tc_NonTr += $percent_NonTr;
|
|
2294 $percent_tc_Tr += $percent_Tr;
|
|
2295 }
|
|
2296 if($k_mutation eq "T>G")
|
|
2297 {
|
|
2298 triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+11, $row_SeqContext12Percent, \$tg_NonTr, \$tg_Tr, $percent_NonTr, $percent_Tr);
|
|
2299
|
|
2300 $percent_tg_NonTr += $percent_NonTr;
|
|
2301 $percent_tg_Tr += $percent_Tr;
|
|
2302 }
|
|
2303
|
|
2304 # For checking if the total number of SBS is correct
|
|
2305 $total_SBS_coding += $refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} + $refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'};
|
|
2306 }
|
|
2307 $row_SeqContext12++; $row_SeqContext12Percent++;
|
|
2308 }
|
|
2309 close COUNT; close PERCENT;
|
|
2310
|
|
2311
|
|
2312 ## Write the total of each mutation types: 12 mut type on coding strand
|
|
2313 $ws->write($row_SeqContext12, $col+1, $ca_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+2, $ca_Tr, $formatT_bottomHeader2);
|
|
2314 $ws->write($row_SeqContext12, $col+3, $cg_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+4, $cg_Tr, $formatT_bottomHeader2);
|
|
2315 $ws->write($row_SeqContext12, $col+5, $ct_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+6, $ct_Tr, $formatT_bottomHeader2);
|
|
2316 $ws->write($row_SeqContext12, $col+7, $ta_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+8, $ta_Tr, $formatT_bottomHeader2);
|
|
2317 $ws->write($row_SeqContext12, $col+9, $tc_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+10, $tc_Tr, $formatT_bottomHeader2);
|
|
2318 $ws->write($row_SeqContext12, $col+11, $tg_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+12, $tg_Tr, $formatT_bottomHeader2);
|
|
2319 # Write the total percentages of each mutation types: 12 mut type on coding strand
|
|
2320 $ws->write($row_SeqContext12Percent, $col+1, $percent_ca_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+2, $percent_ca_Tr, $formatT_bottomHeader);
|
|
2321 $ws->write($row_SeqContext12Percent, $col+3, $percent_cg_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+4, $percent_cg_Tr, $formatT_bottomHeader);
|
|
2322 $ws->write($row_SeqContext12Percent, $col+5, $percent_ct_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+6, $percent_ct_Tr, $formatT_bottomHeader);
|
|
2323 $ws->write($row_SeqContext12Percent, $col+7, $percent_ta_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+8, $percent_ta_Tr, $formatT_bottomHeader);
|
|
2324 $ws->write($row_SeqContext12Percent, $col+9, $percent_tc_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+10, $percent_tc_Tr, $formatT_bottomHeader);
|
|
2325 $ws->write($row_SeqContext12Percent, $col+11, $percent_tg_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+12, $percent_tg_Tr, $formatT_bottomHeader);
|
|
2326
|
|
2327 if($total_SBS_coding == $refH_file->{$sample}{'TotalSBSCoding'})
|
|
2328 {
|
|
2329 $ws->write($row_SeqContext12, $col+13, $refH_file->{$sample}{'TotalSBSCoding'}, $formatT_bottomHeader2)
|
|
2330 }
|
|
2331 else
|
|
2332 {
|
|
2333 print STDERR "Error: in the calculation of the total number of SBS on the coding strand!!!!\n";
|
|
2334 print STDERR "From hash table $refH_file->{$sample}{'TotalSBSCoding'}\tVS\t$total_SBS_coding\n";
|
|
2335 exit;
|
|
2336 }
|
|
2337
|
|
2338 my $totalP_SBS_coding = $percent_ca_NonTr + $percent_ca_Tr + $percent_cg_NonTr + $percent_cg_Tr + $percent_ct_NonTr + $percent_ct_Tr + $percent_ta_NonTr + $percent_ta_Tr + $percent_tc_NonTr + $percent_tc_Tr + $percent_tg_NonTr + $percent_tg_Tr;
|
|
2339 $totalP_SBS_coding = sprintf("%.0f", $totalP_SBS_coding);
|
|
2340
|
|
2341 if($totalP_SBS_coding != 100)
|
|
2342 {
|
|
2343 print STDERR "Error: The percentages for the trinucleotide sequence context on the coding strand for 12 mutation types is not equal to 100!!!\n$totalP_SBS_coding\n";
|
|
2344 exit;
|
|
2345 }
|
|
2346 }
|
|
2347 # Trinucleotide count and percentage by mutation type on Coding strand (Sub function of writeTriNtCoding)
|
|
2348 sub triNtByMutCoding
|
|
2349 {
|
|
2350 my ($refH_file, $sample, $context, $mutation, $ws, $row, $col, $rowP, $refNonTr, $refTr, $percent_NonTr, $percent_Tr) = @_;
|
|
2351
|
|
2352 $$refNonTr += $refH_file->{$sample}{'SeqContextC'}{$context}{$mutation}{'NonTr'};
|
|
2353 $$refTr += $refH_file->{$sample}{'SeqContextC'}{$context}{$mutation}{'Tr'};
|
|
2354
|
|
2355 # COUNT : 12 mutation type (stranded bar graph)
|
|
2356 $ws->write($row, $col, $refH_file->{$sample}{'SeqContextC'}{$context}{$mutation}{'NonTr'}, $format_A10);
|
|
2357 $ws->write($row, $col+1, $refH_file->{$sample}{'SeqContextC'}{$context}{$mutation}{'Tr'}, $format_A10);
|
|
2358
|
|
2359
|
|
2360 ## PERCENT : 12 mutation type (stranded bar graph)
|
|
2361 $ws->write($rowP, $col, $percent_NonTr, $format_A10);
|
|
2362 $ws->write($rowP, $col+1, $percent_Tr, $format_A10);
|
|
2363 }
|
|
2364
|
|
2365
|
|
2366 ############################################################################################################
|
|
2367 # Create and write the figures on the Excel report
|
|
2368 sub createWriteFigs
|
|
2369 {
|
|
2370 my ($ws, $row, $col, $folderFigure, $sample, $c_ca6_g, $c_cg6_g, $c_ct6_g, $c_ta6_g, $c_tc6_g, $c_tg6_g) = @_;
|
|
2371
|
|
2372 ######## Create figures
|
|
2373 # Bar char for SBS distribution (Figure 1)
|
|
2374 # Pie char for Impact on protein sequence (Figure 2)
|
|
2375 # Stranded distribution of SBS (Figure 3)
|
|
2376 # Heatmaps for trinucleotide context
|
|
2377 `Rscript $pathRScriptFigs --folderFigure $folderFigure --folderTemp $folder_temp --filename $sample`;
|
|
2378
|
|
2379 # Bar chart for trinucleotide context on coding strand
|
|
2380 `Rscript $pathRScriptTxnSB $folderFigure/Stranded_Analysis/$sample/$sample-StrandedSignatureCount.txt $folderFigure/Stranded_Analysis/$sample/$sample-StrandedSignatureCount $folder_temp/$sample-StrandedSignatureCount Count`;
|
|
2381
|
|
2382 `Rscript $pathRScriptTxnSB $folderFigure/Stranded_Analysis/$sample/$sample-StrandedSignaturePercent.txt $folderFigure/Stranded_Analysis/$sample/$sample-StrandedSignaturePercent $folder_temp/$sample-StrandedSignaturePercent Percent`;
|
|
2383
|
|
2384 # Bar plot for representing the sequence context (NMF like style)
|
|
2385 `Rscript $pathRScriptMutSpectrum $folderFigure/Trinucleotide_Sequence_Context/$sample/$sample-MutationSpectraPercent-Genomic.txt $sample $folderFigure/Trinucleotide_Sequence_Context/$sample $folder_temp $c_ca6_g $c_cg6_g $c_ct6_g $c_ta6_g $c_tc6_g $c_tg6_g`;
|
|
2386
|
|
2387
|
|
2388 ######## Write the figures in the Excel report
|
|
2389 # Bar char for SBS distribution (Figure 1)
|
|
2390 $ws->insert_image(1, 0, "$folder_temp/$sample-SBS_distribution-Report.png", 0, 0, .2, .2);
|
|
2391
|
|
2392 # Impact of the SBS on the protein (Figure 2)
|
|
2393 $ws->write(0, 6, "Graph 2. Impact on protein sequence", $formatT_graphTitle);
|
|
2394 $ws->insert_image(1, 6, "$folder_temp/$sample-DistributionExoFunc-Report.png", 0, 0, .2, .2);
|
|
2395
|
|
2396 # Stranded distribution of SBS (Figure 3)
|
|
2397 $ws->write(0, 11, "Graph 3. Stranded distribution of SBS", $formatT_graphTitle);
|
|
2398 $ws->insert_image(1, 11, "$folder_temp/$sample-StrandBias-Report.png", 0, 0, .2, .2);
|
|
2399
|
|
2400 ## Trinucleotide context on coding strand (Scale the inserted image: width x 0.7, height x 0.8)
|
|
2401 $ws->insert_image($row+3, $col+15, "$folder_temp/$sample-StrandedSignatureCount-Report.png", 0, 0, .16, .16);
|
|
2402 $ws->insert_image($row+24, $col+15, "$folder_temp/$sample-StrandedSignaturePercent-Report.png", 0, 0, .16, .16);
|
|
2403
|
|
2404 # Heatamp for the sequence context on the genomic strand (6 mutation types)
|
|
2405 $ws->insert_image(4, $col, "$folder_temp/$sample-HeatmapCount-Genomic-Report.png");
|
|
2406 $ws->insert_image(4, $col+10, "$folder_temp/$sample-HeatmapPercent-Genomic-Report.png");
|
|
2407
|
|
2408 # Bar plot for the sequence context on the genomic strand (6 mutation types)
|
|
2409 $ws->insert_image(27, $col+3, "$folder_temp/$sample-MutationSpectraPercent-Genomic-Report.png");
|
|
2410 }
|
|
2411
|
|
2412
|
|
2413 ############################################################################################################
|
|
2414 # Write NMF input for count and percentages in the Excel report
|
|
2415 sub writeInputNMF
|
|
2416 {
|
|
2417 my ($ws_inputNMF_count, $ws_inputNMF_percent, $outCount, $outPercent) = @_;
|
|
2418
|
|
2419
|
|
2420 open(OUTINPUTNMFC, ">", $outCount) or die "$!: $outCount\n"; # with the count
|
|
2421 open(OUTINPUTNMFP, ">", $outPercent) or die "$!: $outPercent\n"; # With the frequency un-normalized
|
|
2422
|
|
2423 foreach my $k_sample (@{$h_inputNMF{'Sample'}})
|
|
2424 {
|
|
2425 print OUTINPUTNMFC "\t$k_sample";
|
|
2426 print OUTINPUTNMFP "\t$k_sample";
|
|
2427 }
|
|
2428 print OUTINPUTNMFC "\n"; print OUTINPUTNMFP "\n";
|
|
2429
|
|
2430 my $row_inputNMF = 1;
|
|
2431 foreach my $k_context (sort keys %{$h_inputNMF{'Count'}})
|
|
2432 {
|
|
2433 $k_context =~ /(\w)_(\w)/; my ($base5, $base3) = ($1, $2);
|
|
2434 foreach my $k_mutation (sort keys %{$h_inputNMF{'Count'}{$k_context}})
|
|
2435 {
|
|
2436 my ($col_inputNMF_Count, $col_inputNMF_Percent) = (1, 1);
|
|
2437 my $contextNMF = $base5."[$k_mutation]".$base3;
|
|
2438
|
|
2439 # Write the input in the Excel report, only when all the samples are in the same workbook
|
|
2440 if($oneReportPerSample == 2)
|
|
2441 {
|
|
2442 $ws_inputNMF_count->write($row_inputNMF, 0, $contextNMF);
|
|
2443 $ws_inputNMF_percent->write($row_inputNMF, 0, $contextNMF);
|
|
2444 }
|
|
2445
|
|
2446 print OUTINPUTNMFC $contextNMF,"\t"; print OUTINPUTNMFP $contextNMF,"\t";
|
|
2447
|
|
2448 foreach (@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}}) { print OUTINPUTNMFC "$_\t"; } print OUTINPUTNMFC "\n";
|
|
2449 foreach (@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}) { print OUTINPUTNMFP "$_\t"; } print OUTINPUTNMFP "\n";
|
|
2450
|
|
2451 foreach (@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}})
|
|
2452 {
|
|
2453 if($oneReportPerSample == 2)
|
|
2454 {
|
|
2455 $ws_inputNMF_count->write($row_inputNMF, $col_inputNMF_Count, $_);
|
|
2456 }
|
|
2457 $col_inputNMF_Count++;
|
|
2458 }
|
|
2459 foreach (@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}})
|
|
2460 {
|
|
2461 if($oneReportPerSample == 2)
|
|
2462 {
|
|
2463 $ws_inputNMF_percent->write($row_inputNMF, $col_inputNMF_Percent, $_);
|
|
2464 }
|
|
2465 $col_inputNMF_Percent++;
|
|
2466 }
|
|
2467 $row_inputNMF++;
|
|
2468 }
|
|
2469 }
|
|
2470 close OUTINPUTNMFP; close OUTINPUTNMFC;
|
|
2471 }
|
|
2472
|
|
2473
|
|
2474 ######################################################################################################################################################
|
|
2475 # Define format and background colors for the Excel report #
|
|
2476 ######################################################################################################################################################
|
|
2477 # Font: Arial size 10
|
|
2478 sub Format_A10
|
|
2479 {
|
|
2480 my ($wb, $format) = @_;
|
|
2481 $$format = $wb->add_format(font=>'Arial', size=>10); $$format->set_align('center');
|
|
2482 }
|
|
2483 # Font: Arial size 11 bold and center
|
|
2484 sub Format_A11Bold
|
|
2485 {
|
|
2486 my ($wb, $format) = @_;
|
|
2487 $$format = $wb->add_format(font=>'Arial', size=>11, bold=>1); $$format->set_align('center');
|
|
2488 }
|
|
2489 # Font: Arial size 10 italic red and center
|
|
2490 sub Format_A10ItalicRed
|
|
2491 {
|
|
2492 my ($wb, $format) = @_;
|
|
2493 $$format = $wb->add_format(font=>'Arial', size=>10, italic=>1, color => 'red'); $$format->set_align('center');
|
|
2494 }
|
|
2495 # Format: Arialt size 11 bold and left
|
|
2496 sub Format_A11BoldLeft
|
|
2497 {
|
|
2498 my ($wb, $format) = @_;
|
|
2499 $$format = $wb->add_format(valign =>'left', font=>'Arial', size=>11, bold=>1);
|
|
2500 }
|
|
2501 # Font: Arialt size 10 bold and left
|
|
2502 sub Format_A10BoldLeft
|
|
2503 {
|
|
2504 my ($wb, $format) = @_;
|
|
2505 $$format = $wb->add_format(valign =>'left', font=>'Arial', size=>10, bold=>1);
|
|
2506 }
|
|
2507 # Define the format of the border of the section (for delimiting the different section of the report)
|
|
2508 sub Format_section
|
|
2509 {
|
|
2510 my ($wb, $format_topLeft, $format_topRight, $format_bottomLeft, $format_bottomRight, $format_top, $format_right, $format_bottom, $format_left) = @_;
|
|
2511
|
|
2512 $$format_topLeft = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
|
|
2513 $$format_topLeft->set_top(2); $$format_topLeft->set_top_color('blue');
|
|
2514 $$format_topLeft->set_left(2); $$format_topLeft->set_left_color('blue');
|
|
2515
|
|
2516 $$format_topRight = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
|
|
2517 $$format_topRight->set_top(2); $$format_topRight->set_top_color('blue');
|
|
2518 $$format_topRight->set_right(2); $$format_topRight->set_right_color('blue');
|
|
2519
|
|
2520 $$format_bottomLeft = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
|
|
2521 $$format_bottomLeft->set_bottom(2); $$format_bottomLeft->set_bottom_color('blue');
|
|
2522 $$format_bottomLeft->set_left(2); $$format_bottomLeft->set_left_color('blue');
|
|
2523
|
|
2524 $$format_bottomRight = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12);
|
|
2525 $$format_bottomRight->set_bottom(2); $$format_bottomRight->set_bottom_color('blue');
|
|
2526 $$format_bottomRight->set_right(2); $$format_bottomRight->set_right_color('blue');
|
|
2527
|
|
2528 $$format_top = $wb->add_format(); $$format_top->set_top(2); $$format_top->set_top_color('blue');
|
|
2529 $$format_right = $wb->add_format(); $$format_right->set_right(2); $$format_right->set_right_color('blue');
|
|
2530 $$format_bottom = $wb->add_format(); $$format_bottom->set_bottom(2); $$format_bottom->set_bottom_color('blue');
|
|
2531 $$format_left = $wb->add_format(); $$format_left->set_left(2); $$format_left->set_left_color('blue');
|
|
2532 }
|
|
2533 # Define the header
|
|
2534 sub Format_Header
|
|
2535 {
|
|
2536 my ($wb, $format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG, $format_TG2, $format_LeftHeader, $format_RightHeader, $format_LeftHeader2) = @_;
|
|
2537
|
|
2538 my ($blue, $black, $red, $gray, $green, $pink);
|
|
2539 Color($wb, \$blue, \$black, \$red, \$gray, \$green, \$pink);
|
|
2540
|
|
2541 my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink);
|
|
2542 BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink);
|
|
2543
|
|
2544
|
|
2545 $$format_CA = $wb->add_format(bg_color => $blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CA->set_align('center'); $$format_CA->set_center_across();
|
|
2546 $$format_CG = $wb->add_format(bg_color => $black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CG->set_align('center'); $$format_CG->set_center_across();
|
|
2547 $$format_CT = $wb->add_format(bg_color => $red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CT->set_align('center'); $$format_CT->set_center_across();
|
|
2548 $$format_TA = $wb->add_format(bg_color => $gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TA->set_align('center'); $$format_TA->set_center_across();
|
|
2549 $$format_TC = $wb->add_format(bg_color => $green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TC->set_align('center'); $$format_TC->set_center_across();
|
|
2550 $$format_TG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG->set_align('center'); $$format_TG->set_center_across();
|
|
2551 $$format_TG->set_right(2); $$format_TG->set_right_color('blue');
|
|
2552
|
|
2553 $$format_TG2 = $wb->add_format(bg_color => $pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG2->set_align('center'); $$format_TG2->set_center_across();
|
|
2554
|
|
2555 $$format_LeftHeader = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_LeftHeader->set_align('center'); $$format_LeftHeader->set_left(2); $$format_LeftHeader->set_left_color('blue');
|
|
2556 $$format_LeftHeader2 = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_LeftHeader2->set_left(2); $$format_LeftHeader2->set_left_color('blue');
|
|
2557 $$format_RightHeader = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_RightHeader->set_align('center'); $$format_RightHeader->set_right(2); $$format_RightHeader->set_right_color('blue');
|
|
2558 }
|
|
2559 # Define the header for the part "Strand bias by segment"
|
|
2560 sub Format_HeaderSBSDistrBySegAndFunc
|
|
2561 {
|
|
2562 my ($wb, $format_LeftCA, $format_LeftCG, $format_LeftCT, $format_LeftTA, $format_LeftTC, $format_LeftTG, $format_RightCA, $format_RightCG, $format_RightCT, $format_RightTA, $format_RightTC, $format_RightTG) = @_;
|
|
2563
|
|
2564 my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink);
|
|
2565 BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink);
|
|
2566
|
|
2567 $$format_LeftCA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCA->set_align('center'); $$format_LeftCA->set_left(2); $$format_LeftCA->set_left_color('blue');
|
|
2568 $$format_LeftCG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCG->set_align('center'); $$format_LeftCG->set_left(2); $$format_LeftCG->set_left_color('blue');
|
|
2569 $$format_LeftCT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCT->set_align('center'); $$format_LeftCT->set_left(2); $$format_LeftCT->set_left_color('blue');
|
|
2570 $$format_LeftTA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTA->set_align('center'); $$format_LeftTA->set_left(2); $$format_LeftTA->set_left_color('blue');
|
|
2571 $$format_LeftTC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTC->set_align('center'); $$format_LeftTC->set_left(2); $$format_LeftTC->set_left_color('blue');
|
|
2572 $$format_LeftTG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTG->set_align('center'); $$format_LeftTG->set_left(2); $$format_LeftTG->set_left_color('blue');
|
|
2573
|
|
2574
|
|
2575 $$format_RightCA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCA->set_align('center'); $$format_RightCA->set_right(2); $$format_RightCA->set_right_color('blue');
|
|
2576 $$format_RightCG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCG->set_align('center'); $$format_RightCG->set_right(2); $$format_RightCG->set_right_color('blue');
|
|
2577 $$format_RightCT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCT->set_align('center'); $$format_RightCT->set_right(2); $$format_RightCT->set_right_color('blue');
|
|
2578 $$format_RightTA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTA->set_align('center'); $$format_RightTA->set_right(2); $$format_RightTA->set_right_color('blue');
|
|
2579 $$format_RightTC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTC->set_align('center'); $$format_RightTC->set_right(2); $$format_RightTC->set_right_color('blue');
|
|
2580 $$format_RightTG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTG->set_align('center'); $$format_RightTG->set_right(2); $$format_RightTG->set_right_color('blue');
|
|
2581 }
|
|
2582 # Define the header for the part "Trinucleotide sequence context on the coding strand"
|
|
2583 sub Format_Header12MutType
|
|
2584 {
|
|
2585 my ($wb, $format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG) = @_;
|
|
2586
|
|
2587 my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink);
|
|
2588 BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink);
|
|
2589
|
|
2590 $$format_CA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CA->set_align('center');
|
|
2591 $$format_CG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CG->set_align('center');
|
|
2592 $$format_CT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CT->set_align('center');
|
|
2593 $$format_TA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TA->set_align('center');
|
|
2594 $$format_TC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TC->set_align('center');
|
|
2595 $$format_TG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG->set_align('center');
|
|
2596 }
|
|
2597 # Define the format for the text that needs a section border
|
|
2598 sub Format_TextSection
|
|
2599 {
|
|
2600 my ($wb, $formatT_left, $formatT_right, $formatT_bottomRight, $formatT_bottomLeft, $formatT_bottom, $formatT_bottomHeader, $formatT_bottomRightHeader, $formatT_bottomHeader2, $formatT_rightHeader) = @_;
|
|
2601
|
|
2602 $$formatT_left = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
|
|
2603 $$formatT_left->set_left(2); $$formatT_left->set_left_color('blue');
|
|
2604
|
|
2605 $$formatT_right = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
|
|
2606 $$formatT_right->set_right(2); $$formatT_right->set_right_color('blue');
|
|
2607
|
|
2608 $$formatT_bottomRight = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
|
|
2609 $$formatT_bottomRight->set_bottom(2); $$formatT_bottomRight->set_bottom_color('blue');
|
|
2610 $$formatT_bottomRight->set_right(2); $$formatT_bottomRight->set_right_color('blue');
|
|
2611
|
|
2612 $$formatT_bottomLeft = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
|
|
2613 $$formatT_bottomLeft->set_bottom(2); $$formatT_bottomLeft->set_bottom_color('blue');
|
|
2614 $$formatT_bottomLeft->set_left(2); $$formatT_bottomLeft->set_left_color('blue');
|
|
2615
|
|
2616 $$formatT_bottom = $wb->add_format(valign=>'center', font=>'Arial', size=>10);
|
|
2617 $$formatT_bottom->set_bottom(2); $$formatT_bottom->set_bottom_color('blue');
|
|
2618
|
|
2619 my $bgColor_totallighGray = $wb->set_custom_color(54, 230, 230, 230);
|
|
2620 $$formatT_bottomHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomHeader->set_align('center');
|
|
2621 $$formatT_bottomHeader->set_bottom(2); $$formatT_bottomHeader->set_bottom_color('blue');
|
|
2622
|
|
2623 $$formatT_bottomRightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomRightHeader->set_align('center');
|
|
2624 $$formatT_bottomRightHeader->set_bottom(2); $$formatT_bottomRightHeader->set_bottom_color('blue');
|
|
2625 $$formatT_bottomRightHeader->set_right(2); $$formatT_bottomRightHeader->set_right_color('blue');
|
|
2626
|
|
2627 $$formatT_bottomHeader2 = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomHeader2->set_align('center');
|
|
2628
|
|
2629 $$formatT_rightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_rightHeader->set_align('center');
|
|
2630 $$formatT_rightHeader->set_right(2); $$formatT_rightHeader->set_right_color('blue');
|
|
2631 }
|
|
2632 # Define the format for the graphs titles
|
|
2633 sub Format_GraphTitle
|
|
2634 {
|
|
2635 my ($wb, $formatT_graphTitle) = @_;
|
|
2636
|
|
2637 $$formatT_graphTitle = $wb->add_format(font=>'Arial', size=>12, bold=>1);
|
|
2638 }
|
|
2639 # Define the format of the border of the tables
|
|
2640 sub Format_Table
|
|
2641 {
|
|
2642 my ($wb, $table_topleft, $table_topRight, $table_bottomleft, $table_bottomRight, $table_top, $table_right, $table_bottom, $table_bottomItalicRed, $table_left, $table_bottomrightHeader, $table_left2, $table_middleHeader, $table_middleHeader2) = @_;
|
|
2643
|
|
2644 $$table_topleft = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_topleft->set_top(1); $$table_topleft->set_left(1);
|
|
2645 $$table_topRight = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_topRight->set_top(1); $$table_topRight->set_right(1);
|
|
2646 $$table_bottomleft = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_bottomleft->set_bottom(1); $$table_bottomleft->set_left(1);
|
|
2647 $$table_bottomRight = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_bottomRight->set_bottom(1); $$table_bottomRight->set_right(1);
|
|
2648
|
|
2649 $$table_top = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_top->set_top(1);
|
|
2650 $$table_right = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_right->set_right(1);
|
|
2651 $$table_bottom = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_bottom->set_bottom(1);
|
|
2652 $$table_bottomItalicRed = $wb->add_format(valign=>'center', font=>'Arial', size=>10, italic=>1, color => 'red'); $$table_bottomItalicRed->set_bottom(1);
|
|
2653 $$table_left = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_left->set_left(1);
|
|
2654
|
|
2655 my $bgColor_totallighGray = $wb->set_custom_color(54, 230, 230, 230);
|
|
2656 $$table_bottomrightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_bottomrightHeader->set_bottom(1); $$table_bottomrightHeader->set_right(1);
|
|
2657
|
|
2658 $$table_left2 = $wb->add_format(valign=>'left', font=>'Arial', size=>10); $$table_left2->set_left(1);
|
|
2659
|
|
2660 $$table_middleHeader = $wb->add_format(valign=>'center', bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10);
|
|
2661 $$table_middleHeader2 = $wb->add_format(valign=>'center', bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_middleHeader2->set_bottom(1);
|
|
2662 }
|
|
2663 # Define the color
|
|
2664 sub Color
|
|
2665 {
|
|
2666 my ($wb, $blue, $black, $red, $gray, $green, $pink) = @_;
|
|
2667
|
|
2668 $$blue = $wb->set_custom_color(40, 0, 0, 204);# C:G>A:T in blue
|
|
2669 $$black = $wb->set_custom_color(41, 0, 0, 0);# C:G>G:C in black
|
|
2670 $$red = $wb->set_custom_color(42, 255, 0, 0);# C:G>T:A in red
|
|
2671 $$gray = $wb->set_custom_color(43, 205, 205, 205); # T:A>A:T in light gray
|
|
2672 $$green = $wb->set_custom_color(44, 0, 204, 51);# T:A>C:G in green
|
|
2673 $$pink = $wb->set_custom_color(45, 255, 192, 203);# T:A>G:C in pink
|
|
2674 }
|
|
2675 sub BackgroundColor
|
|
2676 {
|
|
2677 my ($wb, $bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink) = @_;
|
|
2678
|
|
2679 $$bgColor_blue = $wb->set_custom_color(48, 0, 0, 204);
|
|
2680 $$bgColor_black = $wb->set_custom_color(49, 0, 0, 0);
|
|
2681 $$bgColor_red = $wb->set_custom_color(50, 255, 0, 0);
|
|
2682 $$bgColor_gray = $wb->set_custom_color(51, 205, 205, 205);
|
|
2683 $$bgColor_green = $wb->set_custom_color(52, 0, 204, 51);
|
|
2684 $$bgColor_pink = $wb->set_custom_color(53, 255, 192, 203);
|
|
2685 }
|
|
2686
|
|
2687
|
|
2688
|
|
2689
|
|
2690 =head1 NAME
|
|
2691
|
|
2692 mutSpec-Stat
|
|
2693
|
|
2694 =head1 SYNOPSIS
|
|
2695
|
|
2696 mutSpecstat.pl [arguments] <query-file>
|
|
2697
|
|
2698 <query-file> a folder with one or multiple VCFs
|
|
2699
|
|
2700 Arguments:
|
|
2701 -h, --help print help message
|
|
2702 -m, --man print complete documentation
|
|
2703 -v, --verbose use verbose output
|
|
2704 --refGenome the reference genome to use (human, mouse or rat genomes)
|
|
2705 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file
|
|
2706 --temp <string> the path for saving the temporary files
|
|
2707 --pathSeqRefGenome the path to the fasta reference sequences
|
|
2708 --poolData generate the pool of all the samples (optional)
|
|
2709 --reportSample generate a report for each sample (optional)
|
|
2710
|
|
2711
|
|
2712 Function: automatically run a pipeline and calculate various statistics on mutations
|
|
2713
|
|
2714 Example: mutSpecstat.pl --refGenome hg19 --outfile output_directory --temp path_to_temporary_directory --pathRscript path_to_R_scripts --pathSeqRefGenome path_fasta_ref_seq --poolData --reportSample inputFolder
|
|
2715
|
|
2716 Version: 02-2017 (February 2016)
|
|
2717
|
|
2718
|
|
2719 =head1 OPTIONS
|
|
2720
|
|
2721 =over 8
|
|
2722
|
|
2723 =item B<--help>
|
|
2724
|
|
2725 print a brief usage message and detailed explanation of options.
|
|
2726
|
|
2727 =item B<--man>
|
|
2728
|
|
2729 print the complete manual of the program.
|
|
2730
|
|
2731 =item B<--verbose>
|
|
2732
|
|
2733 use verbose output.
|
|
2734
|
|
2735 =item B<--refGenome>
|
|
2736
|
|
2737 the reference genome to use, could be human, mouse or rat genomes.
|
|
2738
|
|
2739 =item B<--outfile>
|
|
2740
|
|
2741 the directory of output file names. If it is nor specify the same directory as the input file is used.
|
|
2742
|
|
2743 =item B<--temp>
|
|
2744
|
|
2745 the path for saving temporary files generated by the script.
|
|
2746 If any is specify a temporary folder is created in the same directory where the script is running.
|
|
2747 Deleted when the script is finish
|
|
2748
|
|
2749 =item B<--pathSeqRefGenome>
|
|
2750
|
|
2751 The path to the fasta reference sequences
|
|
2752
|
|
2753 =item B<--poolData only for the report>
|
|
2754
|
|
2755 calculate the statistics on the pool of all the data pass in input
|
|
2756
|
|
2757 =item B<--reportSample only for the report>
|
|
2758
|
|
2759 generate a report for each samples
|
|
2760
|
|
2761 =head1 DESCRIPTION
|
|
2762
|
|
2763 mutSpecstat is a perl script for calculated various statistics on mutations
|
|
2764 An Excel report containing the mutation type distribution per functional region, the strand bias and the sequence context on genomic and coding sequence is created.
|
|
2765 The different statistics are illustrated using ggplot2.
|
|
2766
|
|
2767 =cut
|